In [1]:
# importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the shopping-revenue dataset directly from its hosted CSV file.
csv_url = 'https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/Shopping_Revenue.csv'
df = pd.read_csv(csv_url)

In [3]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,7/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753
1,1,2/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131
2,2,3/9/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379
3,3,2/2/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511
4,4,5/9/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(137, 43)

In [5]:
# Report, for every column, how many distinct values it holds
# (pandas does not count NaN as a value by default).
distinct_counts = df.nunique()
for column_name, n_distinct in distinct_counts.items():
    print(column_name, n_distinct)

Id 137
Open Date 134
City 34
City Group 2
Type 3
P1 8
P2 8
P3 8
P4 6
P5 7
P6 8
P7 6
P8 8
P9 4
P10 4
P11 8
P12 7
P13 5
P14 10
P15 8
P16 9
P17 9
P18 7
P19 9
P20 9
P21 8
P22 5
P23 9
P24 9
P25 8
P26 10
P27 9
P28 9
P29 7
P30 9
P31 10
P32 10
P33 6
P34 8
P35 8
P36 8
P37 8
revenue 137


In [6]:
# Remove, in place, the columns that this notebook does not use as model inputs.
unused_columns = ['Id', 'Open Date', 'City']
df.drop(columns=unused_columns, inplace=True)

In [7]:
# Preview the frame again to confirm Id, Open Date and City are gone.
df.head()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,Big Cities,IL,4,5.0,4.0,4.0,2,2.0,5.0,4,...,3.0,5,3,4,5,5,4,3,4,5653753
1,Big Cities,FC,4,5.0,4.0,4.0,1,2.0,5.0,5,...,3.0,0,0,0,0,0,0,0,0,6923131
2,Other,IL,2,4.0,2.0,5.0,2,3.0,5.0,5,...,3.0,0,0,0,0,0,0,0,0,2055379
3,Other,IL,6,4.5,6.0,6.0,4,4.0,10.0,8,...,7.5,25,12,10,6,18,12,12,6,2675511
4,Other,IL,3,4.0,3.0,4.0,2,2.0,5.0,5,...,3.0,5,1,3,2,3,4,3,3,4316715


In [8]:
# Distinct category labels present in the Type column.
df['Type'].unique()

array(['IL', 'FC', 'DT'], dtype=object)

In [9]:
# Replace the two categorical columns (City Group and Type) with integer codes.
# NOTE: this cell is not re-runnable -- on a second run the already-encoded
# integers are not keys of the mappings, so both columns would become NaN.
city_group_codes = {'Big Cities': 0, 'Other': 1}
type_codes = {'IL': 0, 'FC': 1, 'DT': 2}
df['City Group'] = df['City Group'].map(city_group_codes)
df['Type'] = df['Type'].map(type_codes)

In [10]:
# checking that City Group and Type now hold the integer codes
df.head()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,0,4,5.0,4.0,4.0,2,2.0,5.0,4,...,3.0,5,3,4,5,5,4,3,4,5653753
1,0,1,4,5.0,4.0,4.0,1,2.0,5.0,5,...,3.0,0,0,0,0,0,0,0,0,6923131
2,1,0,2,4.0,2.0,5.0,2,3.0,5.0,5,...,3.0,0,0,0,0,0,0,0,0,2055379
3,1,0,6,4.5,6.0,6.0,4,4.0,10.0,8,...,7.5,25,12,10,6,18,12,12,6,2675511
4,1,0,3,4.0,3.0,4.0,2,2.0,5.0,5,...,3.0,5,1,3,2,3,4,3,3,4316715


In [11]:
# datatype and non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 40 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City Group  137 non-null    int64  
 1   Type        137 non-null    int64  
 2   P1          137 non-null    int64  
 3   P2          137 non-null    float64
 4   P3          137 non-null    float64
 5   P4          137 non-null    float64
 6   P5          137 non-null    int64  
 7   P6          133 non-null    float64
 8   P7          133 non-null    float64
 9   P8          137 non-null    int64  
 10  P9          137 non-null    int64  
 11  P10         137 non-null    int64  
 12  P11         137 non-null    int64  
 13  P12         137 non-null    int64  
 14  P13         137 non-null    float64
 15  P14         137 non-null    int64  
 16  P15         137 non-null    int64  
 17  P16         137 non-null    int64  
 18  P17         137 non-null    int64  
 19  P18         137 non-null    i

In [12]:
# Number of missing values in each column.
df.isna().sum()

City Group    0
Type          0
P1            0
P2            0
P3            0
P4            0
P5            0
P6            4
P7            4
P8            0
P9            0
P10           0
P11           0
P12           0
P13           0
P14           0
P15           0
P16           0
P17           0
P18           0
P19           0
P20           0
P21           0
P22           0
P23           0
P24           0
P25           0
P26           0
P27           0
P28           0
P29           0
P30           0
P31           0
P32           0
P33           0
P34           0
P35           0
P36           0
P37           0
revenue       0
dtype: int64

In [13]:
# Only a few rows contain missing values, so those rows are removed in place
# instead of being imputed.
df.dropna(axis=0, how='any', inplace=True)

In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) of every column.
df.describe()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
count,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,...,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0,130.0
mean,0.415385,0.569231,3.946154,4.407692,4.311538,4.376923,1.992308,3.346154,5.369231,5.130769,...,3.1,2.8,1.923077,2.492308,1.169231,2.5,2.076923,2.169231,1.115385,4482749.0
std,0.494695,0.512457,2.801748,1.480866,1.03104,1.00784,1.223137,2.089885,2.286253,1.831628,...,1.671002,5.651919,3.395426,5.120632,1.725946,5.212165,3.494437,4.073241,1.789871,2634010.0
min,0.0,0.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1149870.0
25%,0.0,0.0,2.0,4.0,4.0,4.0,1.0,2.0,5.0,4.0,...,2.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2963832.0
50%,0.0,1.0,3.0,5.0,4.0,4.0,2.0,3.0,5.0,5.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3969426.0
75%,1.0,1.0,4.0,5.0,5.0,5.0,2.0,4.0,5.0,5.0,...,3.0,4.0,3.0,3.0,2.0,3.0,4.0,3.0,2.0,5256318.0
max,1.0,2.0,12.0,7.5,7.5,7.5,8.0,10.0,10.0,10.0,...,7.5,25.0,15.0,25.0,6.0,24.0,15.0,20.0,8.0,19696940.0


In [15]:
# Print the variance of every column, in column order, as computed by np.var
# (population variance, ddof=0, unlike the sample std shown by describe()).
for column_name in df.columns:
    column_variance = np.var(df[column_name])
    print(column_variance)

0.24284023668639043
0.2605917159763318
7.789408284023673
2.176094674556212
1.0548668639053258
1.0079289940828398
1.484556213017748
4.334023668639053
5.186745562130184
3.329053254437867
3.1784615384615416
3.2317159763313557
3.7200591715976272
3.585562130177513
1.0240236686390534
7.677869822485209
5.272485207100586
11.326863905325444
4.1754437869822425
10.440236686390532
29.385562130177494
13.234082840236669
4.246390532544381
1.5200591715976353
19.467514792899397
5.140591715976328
4.288994082840229
6.493032544378696
4.290961538461533
5.1956360946745574
2.770769230769231
31.698461538461554
11.44023668639053
26.019171597633132
2.9559763313609486
26.95769230769231
12.1171597633136
16.46366863905324
3.178994082840236
6884636848067.838


In [16]:
# checking the pairwise correlation between all columns (features and target)
df.corr()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
City Group,1.0,0.069154,-0.224236,-0.391687,-0.042906,-0.129888,0.030945,-0.40259,-0.17093,-0.026192,...,0.07127,-0.114228,-0.179278,-0.145618,-0.092049,-0.099213,-0.121766,-0.115946,-0.080815,-0.237082
Type,0.069154,1.0,-0.156658,-0.103878,-0.220864,-0.013393,0.00704,-0.098551,-0.1874,-0.088178,...,-0.062463,-0.541176,-0.611721,-0.527108,-0.705742,-0.516601,-0.643672,-0.573861,-0.629958,0.104511
P1,-0.224236,-0.156658,1.0,0.826486,0.6821,0.659252,0.289423,0.74857,0.846637,0.743078,...,0.558331,0.518224,0.459961,0.544892,0.349766,0.582596,0.483412,0.551691,0.276405,0.076455
P2,-0.391687,-0.103878,0.826486,1.0,0.44673,0.503939,0.185775,0.665408,0.763442,0.468905,...,0.282569,0.349265,0.323876,0.366905,0.233632,0.386165,0.323457,0.367593,0.14297,0.196917
P3,-0.042906,-0.220864,0.6821,0.44673,1.0,0.419514,0.161736,0.559357,0.636495,0.674032,...,0.55658,0.48302,0.39219,0.453791,0.316461,0.500188,0.440826,0.480189,0.280714,-0.027988
P4,-0.129888,-0.013393,0.659252,0.503939,0.419514,1.0,0.725542,0.589007,0.692734,0.640787,...,0.42854,0.393025,0.309822,0.428661,0.176956,0.428693,0.281149,0.399774,0.164785,0.03185
P5,0.030945,0.00704,0.289423,0.185775,0.161736,0.725542,1.0,0.349797,0.441789,0.457195,...,0.303802,0.255442,0.128649,0.23577,0.081406,0.257174,0.185134,0.213428,0.042899,-0.031083
P6,-0.40259,-0.098551,0.74857,0.665408,0.559357,0.589007,0.349797,1.0,0.740448,0.70295,...,0.542737,0.471212,0.427644,0.479426,0.299554,0.504208,0.427285,0.490276,0.312529,0.154219
P7,-0.17093,-0.1874,0.846637,0.763442,0.636495,0.692734,0.441789,0.740448,1.0,0.738108,...,0.562474,0.527686,0.441074,0.525999,0.353374,0.567263,0.47963,0.54264,0.311551,0.05593
P8,-0.026192,-0.088178,0.743078,0.468905,0.674032,0.640787,0.457195,0.70295,0.738108,1.0,...,0.742862,0.469061,0.383047,0.492296,0.260229,0.518461,0.423528,0.479126,0.302755,-0.084394


In [17]:
# Standardise the target column `revenue` (zero mean, unit variance).
# Only the target is scaled here; the feature columns are left untouched.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set statistics leak into the scaling -- consider fitting it on the
# training rows only.
from sklearn.preprocessing import StandardScaler
values = df.values  # unscaled snapshot of the frame; kept in case other cells read it
ss = StandardScaler()
# Select the target by name instead of the hard-coded position 39, which would
# silently pick a different column if columns were added, removed or reordered.
y_scaled = ss.fit_transform(df[['revenue']].to_numpy())

# y_scaled has shape (n_rows, 1); flatten it before writing it back as a column.
df['revenue'] = y_scaled.ravel()

In [18]:
# Separate the (standardised) target from the predictor columns.
target_column = 'revenue'
y = df[target_column]
x = df.drop(columns=[target_column])

In [19]:
# Hold out 20% of the rows for the final test; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [20]:
# Sanity-check the sizes of the four pieces produced by the split.
print(*(part.shape for part in (xtrain, xtest, ytrain, ytest)))

(104, 39) (26, 39) (104,) (26,)


In [21]:
# Estimate how a decision tree performs when it only sees the 10 features kept
# by recursive feature elimination (RFE). RFE is a pipeline step, so the
# selection is re-fitted on the training part of every fold.
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
# Both trees are seeded: an unseeded DecisionTreeRegressor can pick different
# splits on identical data when several splits are equally good, so the
# selected features and the scores would change from run to run.
rfe = RFE(estimator=DecisionTreeRegressor(random_state=1), n_features_to_select=10)
model = DecisionTreeRegressor(random_state=1)
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
# evaluate model: 10 folds repeated 3 times gives 30 scores
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# The metric is spelled out so it matches the later sweep over feature counts;
# R^2 is also what a regressor's default scorer reports.
n_scores = cross_val_score(pipeline, xtrain, ytrain, cv=cv, scoring='r2')
# report performance (one R^2 per fold; a negative value is worse than always
# predicting the mean)
n_scores

array([-5.86714681e-01,  8.43141054e-02,  2.47721118e-01, -1.58910440e+01,
       -5.88087131e-01, -3.72598050e+01, -7.13482302e+00, -4.22642999e+00,
       -1.33167076e+00, -4.02629995e-01, -5.67465295e+00, -1.27843793e+00,
       -4.22101872e+00, -2.48521436e+01, -4.62031973e-01, -1.28352352e-01,
       -8.68162267e-01, -1.02777292e-01, -4.81291831e+00, -2.25667779e+00,
       -8.66212056e-01, -7.23562521e+00, -1.40829907e-02, -7.57368808e+00,
        9.16748871e-02, -7.21590478e-01, -3.96338641e-01, -8.72369987e+00,
       -3.76073107e-01, -5.63272541e+00])

In [22]:
# Average of the 30 cross-validation scores (10 folds x 3 repeats).
print(np.mean(n_scores))

-4.77315678343339


In [23]:
# evaluating the same pipeline for different numbers of selected features
n_features = [10,13,15,18,22,25,28,32]
# The splitter does not depend on the loop variable, so it is built once; with
# an integer random_state it produces the same folds for every candidate.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
for i in n_features:
    # Seeded trees keep the comparison between feature counts repeatable;
    # unseeded trees can choose different equally-good splits on each run.
    rfe = RFE(estimator=DecisionTreeRegressor(random_state=1), n_features_to_select=i)
    model = DecisionTreeRegressor(random_state=1)
    pipeline = Pipeline(steps=[('s',rfe),('m',model)])
    # evaluate model
    n_scores = cross_val_score(pipeline, xtrain, ytrain, cv=cv,scoring='r2')
    # report performance: feature count, mean R^2, standard deviation of R^2
    print(i,np.mean(n_scores),np.std(n_scores))

10 -4.7323516590207095 7.746405694305908
13 -3.3893655638139655 4.248527209162901
15 -4.083718937677136 6.743903739832938
18 -4.614822598479967 8.058405573880915
22 -4.972022003286054 7.905962434813753
25 -4.5297242750206514 7.168168447890212
28 -4.7674789792779535 8.445350695795046
32 -3.622576222307143 4.753403564711395


In [24]:
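# n_features_to_select = 22 is used for the feature selection below.
# NOTE: in the run printed above, 13 features gave the highest mean R^2 and
# 22 the lowest, so this choice should be re-checked against the scores.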

In [25]:
from sklearn.ensemble import RandomForestRegressor
# Seeded so that the same features are selected on every run.
rfc = RandomForestRegressor(random_state=1)
# create the RFE selector around the random forest regressor
# and select attributes. n_features_to_select is passed by keyword because
# recent scikit-learn releases no longer accept it as a positional argument.
rfe = RFE(rfc, n_features_to_select=22)
fs_xtrain = rfe.fit_transform(xtrain,ytrain)
# print summaries for the selection of attributes
print(rfe.support_)  # boolean mask over the columns of xtrain: True = kept
print(rfe.ranking_)  # rank 1 = kept; larger ranks were eliminated earlier

[ True False  True  True  True  True  True  True False  True False  True
  True  True  True  True False False  True False  True  True  True  True
  True False  True False False  True  True False False False False False
 False False False]
[ 1  5  1  1  1  1  1  1 16  1 11  1  1  1  1  1  7 15  1  6  1  1  1  1
  1 10  1  2 17  1  1 18 14  4  9 13 12  8  3]


In [26]:
# Candidate regressors for a quick spot check, each with its default settings,
# stored as (short label, estimator) pairs.
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
models = [
    ('LR', LinearRegression()),
    ('knn', KNeighborsRegressor()),
    ('dt', DecisionTreeRegressor()),
    ('svm', SVR()),
]

In [27]:
# Spot check every candidate on all features: mean negative MSE over 3 folds
# (closer to 0 is better).
# KFold is built without random_state: the folds are not shuffled, so a seed
# has no effect, and current scikit-learn raises ValueError when random_state
# is given together with shuffle=False.
kfold = KFold(n_splits=3)
for name, estimator in models:
    results_a = cross_val_score(estimator,xtrain,ytrain,scoring='neg_mean_squared_error',cv=kfold)
    print(name,np.mean(results_a))

LR -4.4116253629415985
knn -1.1126061480845455
dt -3.5970754749021303
svm -1.1177509005627058


In [28]:
# Repeat the spot check on the RFE-selected features only (fs_xtrain): mean
# negative MSE over 3 folds (closer to 0 is better).
# KFold is built without random_state: the folds are not shuffled, so a seed
# has no effect, and current scikit-learn raises ValueError when random_state
# is given together with shuffle=False.
kfold = KFold(n_splits=3)
for name,model in models:
    results_b = cross_val_score(model,fs_xtrain,ytrain,cv=kfold,scoring = 'neg_mean_squared_error')
    print(name,np.mean(results_b))

LR -2.157820371050247
knn -1.1987492804242594
dt -2.5413133409009334
svm -1.1280915676093193


In [29]:
# The best among these are knn and svm (mean negative MSE closest to 0).
# Now let's tune their hyperparameters.

In [30]:
# Try odd neighbourhood sizes for k-nearest-neighbours regression on all
# features and print the mean negative MSE over 10 folds for each size.
param = [1,3,5,7,9,11,13,15,17]
# KFold is built once and without random_state: the folds are not shuffled, so
# a seed has no effect, and current scikit-learn raises ValueError when
# random_state is given together with shuffle=False.
kfold = KFold(n_splits=10)
for i in param:
    model = KNeighborsRegressor(n_neighbors=i)
    results = cross_val_score(model,xtrain,ytrain,cv=kfold,scoring = 'neg_mean_squared_error')
    print(i,np.mean(results))

1 -2.3193620458976163
3 -1.2350684676101558
5 -1.1534204938232917
7 -1.0792102556929322
9 -1.094728331418213
11 -1.0313064883227225
13 -1.0289556569618097
15 -1.0394067965273812
17 -1.0370459990253624


In [31]:
# when n_neighbors = 13 we get a satisfactory result 

In [32]:
# Now let's try ensemble models: bagging, boosting and (later) voting.
# Three ensembles are compared here: one bagging model (random forest) and two
# boosting models (AdaBoost, gradient boosting).
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
ensemble = [('rfc',RandomForestRegressor()),('ab',AdaBoostRegressor()),('gb',GradientBoostingRegressor())]
# KFold is built without random_state: the folds are not shuffled, so a seed
# has no effect, and current scikit-learn raises ValueError when random_state
# is given together with shuffle=False.
kfold = KFold(n_splits=3)
# The loop variables must not be called x and y: that would overwrite the
# feature frame and target series defined when the data was split.
for name, regressor in ensemble:
    ensemble_result = cross_val_score(regressor,fs_xtrain,ytrain,cv=kfold,scoring = 'neg_mean_squared_error')
    print(name,np.mean(ensemble_result))

rfc -1.377923247431404
ab -2.2160731043362403
gb -1.7165935246956072


In [33]:
# tuning the number of boosting stages of the gradient boosting model using
# GridSearchCV (3-fold CV, negative MSE, fitted on all features in xtrain)
from sklearn.model_selection import GridSearchCV
seed=7
scoring = 'neg_mean_squared_error'
param_grid = dict(n_estimators=np.array([50,75,100,125,150,200]))
model = GradientBoostingRegressor(random_state=seed)
# KFold is built without random_state: the folds are not shuffled, so a seed
# has no effect, and current scikit-learn raises ValueError when random_state
# is given together with shuffle=False. The seed still applies to the model.
kfold = KFold(n_splits=3)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(xtrain, ytrain)

In [34]:
# Best mean cross-validated score found by the grid search (negative MSE).
grid.best_score_

-1.5770074785018544

In [35]:
# Parameter settings that the grid search evaluated.
grid.cv_results_['params']

[{'n_estimators': 50},
 {'n_estimators': 75},
 {'n_estimators': 100},
 {'n_estimators': 125},
 {'n_estimators': 150},
 {'n_estimators': 200}]

In [36]:
# Mean cross-validated score (negative MSE) of each evaluated setting, in the
# same order as grid.cv_results_['params'].
grid.cv_results_['mean_test_score']

array([-1.57700748, -1.6714763 , -1.71008544, -1.75451816, -1.7712945 ,
       -1.79392853])

In [37]:
# Let's try a VotingRegressor that averages two of the best models so far:
# svm (SVR) and knn with 13 neighbours. The random forest is not included.
from sklearn.ensemble import VotingRegressor
vr = VotingRegressor([('svm',SVR()),('knn',KNeighborsRegressor(n_neighbors=13))])
# KFold is built without random_state: the folds are not shuffled, so a seed
# has no effect, and current scikit-learn raises ValueError when random_state
# is given together with shuffle=False.
kfold = KFold(n_splits=3)
results_voting = cross_val_score(vr,fs_xtrain,ytrain,cv=kfold,scoring='neg_mean_squared_error')

In [38]:
# Mean negative MSE of the voting ensemble across the folds.
print(np.mean(results_voting))

-1.0962770930300245


In [40]:
from sklearn.metrics import mean_squared_error

In [44]:
# Refit the SVR + 13-nearest-neighbours voting ensemble on the whole training
# split (all columns of xtrain) and predict the held-out test rows.
vr = VotingRegressor(
    [
        ('svm', SVR()),
        ('knn', KNeighborsRegressor(n_neighbors=13)),
    ]
).fit(xtrain, ytrain)
ypred = vr.predict(xtest)

In [45]:
# Test-set mean squared error. The target was standardised earlier in the
# notebook, so this is in standardised units, not raw revenue.
mean_squared_error(ytest,ypred)

0.44156733162719897