# Backpack Prediction Challenge - Linear & XGB Regression using Pipeline

In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
import xgboost as xgb

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import root_mean_squared_error, mean_squared_error


In [2]:
df = pd.read_csv(r"G:\Data Science Eduminds\Projects\Backpack Prediction Challenge\train.csv")
df.head(10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,5,Nike,Canvas,Medium,10.0,No,Yes,,Black,7.241812,20.01553
6,6,Nike,,Large,3.0,No,No,Backpack,Green,6.828123,84.805
7,7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
8,8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
9,9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741


In [3]:
df.shape

(300000, 11)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [5]:
df.isnull().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
# df.dropna(inplace=True)
df.shape

(300000, 11)

In [8]:
unique_val_per_feature = {}
for col in df.drop(columns=['id', 'Weight Capacity (kg)', 'Price'], axis=1).columns:
    unique_val_per_feature[col] = df[col].nunique()
    # print(f'Feature {col} has {df[col].nunique()} unique values')
unique_val_per_feature = pd.DataFrame(unique_val_per_feature.items(), columns=['Features', 'Unique Values'])
unique_val_per_feature

Unnamed: 0,Features,Unique Values
0,Brand,5
1,Material,4
2,Size,3
3,Compartments,10
4,Laptop Compartment,2
5,Waterproof,2
6,Style,3
7,Color,6


In [9]:
# Brand - OneHot Encoder
# Material - OneHot Encoder
# Size - Ordinal Encoder
# Laptop Compartment - OneHot Encoder
# Waterproof - OneHot Encoder
# Style - OneHot Encoder
# Color - OneHot Encoder
# So after applying encoading, a total will be 27 features considering dropfirst = True 

In [10]:
# We'll drop the id column as it is not required in features list.
X = df.drop(['id', 'Price'], axis=1)
y = df['Price']

# Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
class Feature_Selection(BaseEstimator, TransformerMixin):
    def __init__(self, model, top_features_threshold=0.01):
        self.model = model
        self.top_features_threshold = top_features_threshold

    def fit(self, X, y=None):
        self.model.fit(X,y)
        print(self.model.feature_importances_)
        self.indices = np.where(self.model.feature_importances_ > self.top_features_threshold)
        return self
    
    def transform(self, X):
        return np.squeeze(X[:, self.indices])

# Pipeline

In [13]:

num_features = ['Compartments', 'Weight Capacity (kg)']
cat_features = ['Brand', 'Material', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
cat_ord_features = ['Size']

num_transformer = Pipeline(steps=[
    # ('Imputer', SimpleImputer(strategy='median')),
    ('imputer', IterativeImputer()),
    ('scaler', MinMaxScaler()),
    # ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
    # ('Imputer', SimpleImputer(strategy='most_frequent', fill_value='None')),
    ('imputer', IterativeImputer()),
])

cat_ord_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    # ('Imputer', SimpleImputer(strategy='most_frequent', fill_value='None')),
    ('imputer', IterativeImputer()),
])

preprocessor = ColumnTransformer(transformers=[
    ('num_features', num_transformer, num_features),
    ('cat_features', cat_transformer, cat_features),
    ('cat_ord_features', cat_ord_transformer, cat_ord_features)
])

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # ('Feature Selector', Feature_Selection(model=RandomForestRegressor(n_estimators=100), top_features_threshold=0.01)),
    # ('Linear Regression', xgb.XGBRegressor(objective="reg:squarederror", n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)),
    ('Linear Regression', LinearRegression()),
])

In [14]:
# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

In [15]:
# pipe.named_steps['Feature Selector'].indices, pipe.named_steps['Feature Selector'].model.feature_importances_

In [16]:
# While predict, we should reverse the encoded target (decode the target):
y_pred = pipe.predict(X_test)
print('Root Mean Squared Error (Training Data):', root_mean_squared_error(y_test, y_pred))
# print('Root Mean Squared Error (Training Data):', np.sqrt(mean_squared_error(y_test, y_pred)))

Root Mean Squared Error (Training Data): 38.89739478819528
Root Mean Squared Error (Training Data): 38.89739478819528


In [None]:
Lasso()

In [17]:
pipe_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Linear Regression', LinearRegression())
])
results = abs(cross_val_score(pipe_lr, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1).mean())
print(f"{pipe_lr.steps[-1][0]} Mean RMSE: {results}")
y_pred_lr = pipe.predict(X_test)

Linear Regression Mean RMSE: 39.03060752373519


In [18]:
pipe_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('XGB Regressor', xgb.XGBRFRegressor())
])
results = abs(cross_val_score(pipe_xgb, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1).mean())
print(f"{pipe_xgb.steps[-1][0]} Mean RMSE: {results}")
y_pred_xgb = pipe.predict(X_test)

XGB Regressor Mean RMSE: 39.02895054462827


In [19]:
y_pred = (y_pred_lr + y_pred_xgb)/2
print('Root Mean Squared Error (Training Data):', root_mean_squared_error(y_test, y_pred))

Root Mean Squared Error (Training Data): 38.89739478819528


In [20]:
results = abs(cross_val_score(pipe, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1).mean())
print(f"{pipe.steps[-1][0]} Mean RMSE: {results}")

Linear Regression Mean RMSE: 39.03060752373519


In [21]:
# Just to check how our models performs on extra train data:

# Read extra train data:
df_train_extra = pd.read_csv(r"G:\Data Science Eduminds\Projects\Backpack Prediction Challenge\training_extra.csv")
X_extra = df_train_extra.drop(['id', 'Price'], axis=1)
y_extra = df_train_extra['Price']
X_train_extra, X_test_extra, y_train_extra, y_test_extra = train_test_split(X_extra, y_extra, test_size=0.33, random_state=42)

# Train LR model on extra train data:
pipe_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Linear Regression', LinearRegression())
])
results = abs(cross_val_score(pipe_lr, X_train_extra, y_train_extra, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1).mean())
print(f"{pipe_lr.steps[-1][0]} Mean RMSE (on Extra data): {results}")
pipe_lr.fit(X_train_extra, y_train_extra)
y_pred_lr = pipe_lr.predict(X_test_extra)

# Train XGB model on extra train data:
pipe_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('XGB Regressor', xgb.XGBRegressor())
])
results = abs(cross_val_score(pipe_xgb, X_train_extra, y_train_extra, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1).mean())
print(f"{pipe_xgb.steps[-1][0]} Mean RMSE (on Extra data): {results}")
pipe_xgb.fit(X_train_extra, y_train_extra)
y_pred_xgb = pipe_xgb.predict(X_test_extra)

# Combine predictions by both the models and take average:
y_pred_extra = (y_pred_lr + y_pred_xgb)/2
print('Root Mean Squared Error (Combined average on Extra Training Data):', root_mean_squared_error(y_test_extra, y_pred_extra))


Linear Regression Mean RMSE: 38.90627244747142
XGB Regressor Mean RMSE: 38.89770069447225


ValueError: Found input variables with inconsistent numbers of samples: [1219125, 30000]

In [25]:
# Combine the resultsby both the models and take average to check if we get reduced rmse:
y_pred_extra = (y_pred_lr + y_pred_xgb)/2
print('Root Mean Squared Error (Extra Training Data):', root_mean_squared_error(y_test_extra, y_pred_extra))

Root Mean Squared Error (Extra Training Data): 38.90241660340683


In [None]:
Lasso()

# Now, fit the model on whole extra train data:

In [22]:
df_train_extra = pd.read_csv(r"G:\Data Science Eduminds\Projects\Backpack Prediction Challenge\training_extra.csv")
X_train_extra = df_train_extra.drop(['id', 'Price'], axis=1)
y_train_extra = df_train_extra['Price']
pipe.fit(X_train_extra, y_train_extra)

In [23]:
df_test = pd.read_csv(r"G:\Data Science Eduminds\Projects\Backpack Prediction Challenge\test.csv")
X_test = df_test.drop('id', axis=1)
y_pred_test = pipe.predict(X_test)

submission = df_test[['id']].copy()
submission['Price'] = y_pred_test

submission.head(5)
# print('Root Mean Squared Error (Test Data):', root_mean_squared_error(y_test, y_pred***))

Unnamed: 0,id,Price
0,300000,81.674293
1,300001,82.205287
2,300002,80.881794
3,300003,81.212438
4,300004,79.219115


In [24]:
submission.shape

(200000, 2)