In [33]:
## Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)



import pandas as pd
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [21]:
# Import df train
train_data= pd.read_csv(r'C:\Users\timothe\Documents\Documents\Python\Udemy\Pipeline\train_v9rqX0R.csv')

print(train_data.shape)
train_data.head()

(8523, 12)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [22]:
## Feature engineering

 ## Function 1: fill in missing value of Outlet_Size with mode of Outlet_Type
def out_size( df, col_name1, col_name2):
    df[col_name1]= df.groupby([col_name2])[col_name1].apply(lambda x: x.fillna(x.mode()[0]))
    return df
    
## Function 2: fill in missing value of Item_Weight with mean of Item_Type 
def item_wgt( df, col_name1, col_name2):
    df[col_name1]= df.groupby([col_name2])[col_name1].apply(lambda x: x.fillna(x.median()))
    return df
    
## Fuction 3: Create a new column based of Item_Identifier with 3 different categories
def item_combo( df, col_name1, col_name2):
    df[col_name1]=df[col_name2].apply(lambda x: x[0:2])
    df[col_name1]= df[col_name1].map({'FD': 'Food','NC': 'Non-Consumable','DR':'Drinks'})
    return df
    
## Function 4: Creating the year to date based of Outlet_Establishment_Year
def out_year( df, col_name1, col_name2):
    df[col_name1]= 2021 - df[col_name2]
    return df
    
## Function 5: Replace by long description of Item_Fat_Content
def item_fat(df, col_name1):
    df[col_name1]= df[col_name1].replace({'LF':'Low Fat','reg':'Regular','low fat':'Low Fat'})
    return df
    


In [23]:
## Creating pipe function

df_processed= (train_data.
                  pipe(out_size, 'Outlet_Size','Outlet_Type').
                  pipe(item_wgt, 'Item_Weight','Item_Type').
                  pipe(item_combo, 'Item_Type_Combined','Item_Identifier').
                  pipe(out_year, 'Outlet_Year','Outlet_Establishment_Year').
                  pipe(item_fat, 'Item_Fat_Content')
                 )

In [24]:
## Seperate X and Y
X= train_data.drop(columns=['Item_Outlet_Sales'])

y= train_data['Item_Outlet_Sales']


### Splitting data 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [25]:
X.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined,Outlet_Year
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food,22
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,Drinks,12
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food,22
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,Food,23
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,Non-Consumable,34


In [26]:
y.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

In [27]:
### Pipeline:
# 1- Drop Item_Identifier, Outlet_Establishment_Year, Item_Type
# 2- scale Item_MRP
# 3- OneHotEncode specific features

preprocess= ColumnTransformer(remainder='passthrough',
                              transformers=[('drop_columns', 'drop', ['Item_Identifier','Outlet_Establishment_Year','Item_Type']),
                                            ('Item_MRP', StandardScaler(),['Item_MRP']),
                                            ('enc', OneHotEncoder(sparse=False, drop='first'),
                                             ['Item_Fat_Content','Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_Type_Combined'])
                                           ])

preprocess

ColumnTransformer(remainder='passthrough',
                  transformers=[('drop_columns', 'drop',
                                 ['Item_Identifier',
                                  'Outlet_Establishment_Year', 'Item_Type']),
                                ('Item_MRP', StandardScaler(), ['Item_MRP']),
                                ('enc',
                                 OneHotEncoder(drop='first', sparse=False),
                                 ['Item_Fat_Content', 'Outlet_Identifier',
                                  'Outlet_Size', 'Outlet_Location_Type',
                                  'Outlet_Type', 'Item_Type_Combined'])])

In [28]:
## Create pipeline
# Run the above preprocess
# Using model = random_forest

model_pipeline= Pipeline(steps=[('preprocess',preprocess),
                                ('random_forest', RandomForestRegressor(n_estimators=50, random_state=2, max_depth=10))
    ])


In [29]:
## Fit model 

model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_columns', 'drop',
                                                  ['Item_Identifier',
                                                   'Outlet_Establishment_Year',
                                                   'Item_Type']),
                                                 ('Item_MRP', StandardScaler(),
                                                  ['Item_MRP']),
                                                 ('enc',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['Item_Fat_Content',
                                                   'Outlet_Identifier',
                                                   'Outlet_Size',
                                                   

In [30]:
# Prediction with X_train

model_pipeline.predict(X_train)

array([3168.3132053 , 3048.48879243, 2624.79012092, ..., 4122.97181122,
       1612.5373987 , 1510.14029345])

In [31]:
# Prediction with X_test 
predict= model_pipeline.predict(X_test)
predict

array([1060.51551678,  861.45173126,  693.46299819, ..., 3616.13281484,
       1946.58727227, 2503.65520107])

In [34]:
### Evaluation

print(f'RMSE for test data is: {mean_squared_error(predict, y_test)**(0.5)}')

RMSE for test data is: 1072.9585498350966


In [35]:
### Cross validation with 5 splits

cvs= cross_val_score(model_pipeline, X_train, y_train, cv=5)

print (f'All cross validation:', cvs)
print (f'Mean of all scores:', cvs.mean())

All cross validation: [0.59436914 0.56175269 0.57807533 0.60123956 0.56070264]
Mean of all scores: 0.5792278727749787


In [43]:
## compare predict and actuals

pd.DataFrame({'original_test_set': y_test, 'predictions': predict})

Unnamed: 0,original_test_set,predictions
7503,1743.0644,1060.515517
2957,356.8688,861.451731
7031,377.5086,693.462998
1084,5778.4782,5163.456393
856,2356.9320,3062.171255
...,...,...
4753,3793.7284,3619.115854
4836,2410.8618,2025.463965
8064,5309.7550,3616.132815
4418,2530.7058,1946.587272


In [36]:
## Import testing data

test_data= pd.read_csv(r'C:\Users\timothe\Documents\Documents\Python\Udemy\Pipeline\test_AbJTz2l.csv')

In [37]:
test_data.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1


In [38]:
## Runing test data through the created pip function

df_processed_test= (test_data.
                  pipe(out_size, 'Outlet_Size','Outlet_Type').
                  pipe(item_wgt, 'Item_Weight','Item_Type').
                  pipe(item_combo, 'Item_Type_Combined','Item_Identifier').
                  pipe(out_year, 'Outlet_Year','Outlet_Establishment_Year').
                  pipe(item_fat, 'Item_Fat_Content')
                 )

In [39]:
print(test_data.shape)
test_data.head(2)

(5681, 13)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined,Outlet_Year
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food,22
1,FDW14,8.3,Regular,0.038428,Dairy,87.3198,OUT017,2007,Small,Tier 2,Supermarket Type1,Food,14


In [46]:
# predict target variables on the test data 

pred_Item_Outlet_Sales=model_pipeline.predict(test_data)
pred_Item_Outlet_Sales

array([1710.2228088 , 1355.90397758,  552.97819893, ..., 1914.45885124,
       3871.17207462, 1419.55357551])

In [48]:
## Converting the predictted outlet sales to dataframe

df_pred_Item_Outlet_Sales =pd.DataFrame(pred_Item_Outlet_Sales, columns=['Pred_Item_Outlet_Sales'])

df_pred_Item_Outlet_Sales.head()

Unnamed: 0,Pred_Item_Outlet_Sales
0,1710.222809
1,1355.903978
2,552.978199
3,2146.958593
4,6883.394582


In [49]:
## Checking shape 

print(df_pred_Item_Outlet_Sales.shape)
print(test_data.shape)

(5681, 1)
(5681, 13)


In [52]:
## Append the predicted outlet sales to the test_data

df_test= pd.merge(test_data,df_pred_Item_Outlet_Sales, how='left', on=test_data.index )

In [53]:
print(df_test.shape)
df_test.head()

(5681, 15)


Unnamed: 0,key_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined,Outlet_Year,Pred_Item_Outlet_Sales
0,0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food,22,1710.222809
1,1,FDW14,8.3,Regular,0.038428,Dairy,87.3198,OUT017,2007,Small,Tier 2,Supermarket Type1,Food,14,1355.903978
2,2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,Small,Tier 3,Grocery Store,Non-Consumable,23,552.978199
3,3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,Small,Tier 2,Supermarket Type1,Food,14,2146.958593
4,4,FDY38,12.8,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,Food,36,6883.394582


## Level 4 with Gridsearch

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [None]:
# Scale
# SMV model
steps = [('scaler', StandardScaler()), ('SVM', SVC())]

from sklearn.pipeline import Pipeline

# define the pipeline object.
pipeline = Pipeline(steps) 

In [None]:
## Split X and y
# Using stratify because of imbalance dataset

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=30, stratify=Y)

In [None]:
## Grid search parameter

parameteres = {'SVM__C':[0.001,0.1,10,100,10e5], 'SVM__gamma':[0.1,0.01]}

## Grid search
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)

In [None]:
## fit model 
grid.fit(X_train, y_train)

# Evaluation
print ("score = %3.2f" %(grid.score(X_test,y_test)))

# Best param
print (grid.best_params_)

In [None]:
## Using grid search adn SVC without pipeline

# Instantiacte and fit model
scale = StandardScaler().fit(X_train)

## Transform scale
X_train_scaled = scale.transform(X_train)

## Grid search param
parameteres = {'SVM__C':[0.001,0.1,10,100,10e5], 'SVM__gamma':[0.1,0.01]}

## Grid search
grid = GridSearchCV(SVC(), param_grid=parameteres, cv=5)

# Fit model
grid.fit(X_train_scaled, y_train)