In [1]:
print("import libraries")
print("...")
#to disable notebook warnings
import warnings
warnings.filterwarnings('ignore')

# data processing
import pandas as pd

# linear algebra
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# data standardization
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# train test split
from sklearn.model_selection import train_test_split

# ML
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR

## pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## metrics
from sklearn.metrics import mean_absolute_error, r2_score

print("...")
print("Done")
print("Libraries imported")

import libraries
...
...
Done
Libraries imported


In [2]:
### Reading data

def read_data(path):
    data = pd.read_csv(path)
    return data

def drop_columns(data, columns):
    data.drop(columns=columns, axis=1, inplace=True)

train_data = read_data('./data/train.csv')
test_data = read_data('./data/test.csv')

### dropping id column 

drop_columns(train_data, 'id')
drop_columns(test_data, 'id')

train_data.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [3]:
### features-target

X = train_data.drop(columns='Price',axis=1)
y = train_data['Price']

X.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338


In [4]:
cat_var = [col for col in X.columns if X[col].dtype == 'O']
num_var = [col for col in X.columns if X[col].dtype != 'O']

num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler',StandardScaler())    
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('encoder',OneHotEncoder())
    ]
)

preprocessor = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, num_var),
        ("cat_pipeline", cat_pipeline, cat_var)
    ]
)

In [5]:
X.shape

(300000, 9)

In [6]:
X = preprocessor.fit_transform(X)
X

array([[ 0.53840844, -0.92146626,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.57619753,  1.29908617,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-1.19124004, -0.19902255,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.23026783, -0.88032993,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-1.53716974, -1.70190428,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-1.19124004,  0.07736253,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [7]:
## train test split

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.25, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(225000, 27) (75000, 27) (225000,) (75000,)


In [8]:
### Evaluation function to give all metrics after model training

def evaluate_model(true_val, pred_val):
    mae = mean_absolute_error(true_val,pred_val)
    mse = mean_squared_error(true_val,pred_val)
    rmse = np.sqrt(mean_squared_error(true_val,pred_val))
    r2_square = r2_score(true_val,pred_val)
    return mse, rmse, r2_square

In [9]:
models ={
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random forest regressor" : RandomForestRegressor(),
    "XDBoostregressor": XGBRegressor(),
    "CatBoostregressor": CatBoostRegressor(verbose=False),
    "Adaboostregressor": AdaBoostRegressor()
}

model_list=[]
rmse_list=[]
r2_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    #make predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Evaluate train and test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_val_mae, model_val_rmse, model_val_r2 = evaluate_model(y_val, y_val_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model performance of training set")
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    
    print("--------------------------")

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_val_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_val_mae))    
    print("- R2 Score: {:.4f}".format(model_val_r2))
    r2_list.append(model_val_r2)
    rmse_list.append(model_val_rmse)

    print('='*35)
    print('\n')

Linear Regression
Model performance of training set
- Root Mean Squared Error: 39.0514
- Mean Absolute Error: 1525.0107
- R2 Score: 0.0011
--------------------------
Model performance for Test set
- Root Mean Squared Error: 38.9178
- Mean Absolute Error: 1514.5947
- R2 Score: 0.0011


Lasso
Model performance of training set
- Root Mean Squared Error: 39.0722
- Mean Absolute Error: 1526.6396
- R2 Score: 0.0000
--------------------------
Model performance for Test set
- Root Mean Squared Error: 38.9407
- Mean Absolute Error: 1516.3780
- R2 Score: -0.0001


Ridge
Model performance of training set
- Root Mean Squared Error: 39.0513
- Mean Absolute Error: 1525.0025
- R2 Score: 0.0011
--------------------------
Model performance for Test set
- Root Mean Squared Error: 38.9181
- Mean Absolute Error: 1514.6186
- R2 Score: 0.0011


KNN
Model performance of training set
- Root Mean Squared Error: 34.9133
- Mean Absolute Error: 1218.9392
- R2 Score: 0.2016
--------------------------
Model perform

In [10]:
pd.DataFrame(list(zip(model_list,rmse_list)),columns=['Model_Name','RMSE']).sort_values(by=["RMSE"],ascending=True)

Unnamed: 0,Model_Name,RMSE
0,Linear Regression,38.917794
2,Ridge,38.918101
8,Adaboostregressor,38.922734
1,Lasso,38.940698
7,CatBoostregressor,39.009737
6,XDBoostregressor,39.086027
5,Random forest regressor,40.099795
3,KNN,42.67875
4,Decision Tree,55.832794


In [11]:
model = LinearRegression()
model = model.fit(X,y)
test_data_copy = preprocessor.fit_transform(test_data)
y_pred = model.predict(test_data_copy)

In [12]:
y_pred

array([81.921875, 82.625   , 81.71875 , ..., 83.09375 , 81.40625 ,
       81.90625 ])

In [None]:
param_grid = [{'n_estimators':[3,10,30,40,50], 'max_features':[2,4,6]},
             {'bootstrap':[False],'n_estimators':[3,10,30,40],'max_features':[2,3,4,6]}]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg,param_grid,cv=5,
                          scoring='neg_mean_squared_error')

grid_search.fit(X,y)

In [None]:
grid_search.best_estimator_

In [None]:
results = grid_search.cv_results_

for mean_score, params in zip(results['mean_test_score'],results['params']):
    print(np.sqrt(-mean_score),params)

In [None]:
model = grid_search.best_estimator_

In [None]:
y_pred = model.predict(test_copy)

In [None]:
submission = read_data('data/sample_submission.csv')

In [None]:
submission['Price'] = np.round(y_pred,3)
submission.head()

In [None]:
submission.to_csv('output.csv',index=False)

In [None]:
models ={
        "Linear Regression": LinearRegression(),
        "KNN": KNeighborsRegressor(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random forest regressor" : RandomForestRegressor(),
        "XGBoostregressor": XGBRegressor(),
        "CatBoostregressor": CatBoostRegressor(verbose=False),
        "Adaboostregressor": AdaBoostRegressor()
        }

params={
    "Decision Tree": {
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'splitter':['best','random'],
        # 'max_features':['sqrt','log2'],
    },
    "Random forest regressor":{
        # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'max_features':['sqrt','log2',None]
        'n_estimators': [8,16,32,64,128,256]
    },
    "Gradient Boosting":{
        # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate':[.1,.01,.05,.001],
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        # 'criterion':['squared_error', 'friedman_mse'],
        # 'max_features':['auto','sqrt','log2'],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Linear Regression":{},
    "KNN":{
        'n_neighbors':[5,7,9,11]
    },
    "XGBoostregressor":{
        'learning_rate':[.1,.01,.05,.001],
        'n_estimators': [8,16,32,64,128,256]
    },
    "CatBoostregressor":{
        'depth': [6,8,10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "Adaboostregressor":{
        'learning_rate':[.1,.01,0.5,.001],
        # 'loss':['linear','square','exponential'],
        'n_estimators': [8,16,32,64,128,256]
    }
}

def evaluate_models(X_train,y_train,X_test,y_test,models,param):
    report = {}
    for i in range(len(list(models))):
        model = list(models.values())[i]
        para=param[list(models.keys())[i]]
        
        gs = GridSearchCV(model,para,cv=3)
        gs.fit(X_train,y_train)
        model.set_params(**gs.best_params_)
        model.fit(X_train,y_train)
        #make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_val)
        train_rmse = np.sqrt(mean_squared_error(y_train,y_train_pred))
        test_model_score = np.sqrt(mean_squared_error(y_val,y_test_pred))
        report[list(models.keys())[i]] = test_model_score
    return report

model_report:dict=evaluate_models(X_train=X_train,y_train=y_train,
                                  X_test=X_val,y_test=y_val,
                                  models=models,param=params)

## To get best model score
best_model_score = max(sorted(model_report.values()))

## to get best model name
best_model_name = list(model_report.keys())[
list(model_report.values()).index(best_model_score)
]

best_model = models[best_model_name]
best_model