In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("CleanedData.csv")

In [3]:
data.head()

Unnamed: 0,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Price range,Aggregate rating
0,3,12.17765,No,No,3,3.9
1,3,10.028653,No,No,2,3.5
2,2,7.163324,No,No,2,3.6
3,1,5.730659,No,No,2,4.0
4,4,14.326648,No,No,3,4.2


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [5]:
data['Has Online delivery'] = le.fit_transform(data['Has Online delivery'])
data['Has Table booking'] = le.transform(data['Has Table booking'])

In [6]:
data.head()

Unnamed: 0,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Price range,Aggregate rating
0,3,12.17765,0,0,3,3.9
1,3,10.028653,0,0,2,3.5
2,2,7.163324,0,0,2,3.6
3,1,5.730659,0,0,2,4.0
4,4,14.326648,0,0,3,4.2


### 0 = No, 1 = Yes

# Predict Table Booking based on other features

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

  from numpy.core.umath_tests import inner1d


In [8]:
GBC = GradientBoostingClassifier()

In [9]:
x = data[['Cuisines','Average Cost for two','Has Online delivery','Price range','Aggregate rating']]

In [10]:
y = data['Has Table booking']

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2)

In [12]:
GBC.fit(x_train, y_train)
pred = GBC.predict(x_test)

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[1040,   54],
       [  47,  162]], dtype=int64)

In [14]:
accuracy_score(pred, y_test)

0.9224865694551037

# Predict Online Delivery based on other features

In [15]:
GBC_2 = GradientBoostingClassifier()
x = data[['Cuisines','Average Cost for two','Has Table booking','Price range','Aggregate rating']]
y = data['Has Online delivery']
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2)
GBC_2.fit(x_train, y_train)
pred = GBC_2.predict(x_test)

In [16]:
confusion_matrix(pred, y_test)

array([[718, 269],
       [126, 190]], dtype=int64)

In [17]:
accuracy_score(pred, y_test)

0.6968534151957022

# Conclusion from above?
The conclusion we can draw is that there is some correlation between 'Has table booking' and the other features,
but 'Has Online delivery' is fairly uncorrelated to the other features

# MAIN CLASSIFIER

# IMPORTING

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Current tools - Standard Scaler, Label Encoder, Train Test Split
# Classifier - Gradient Boosting Classifier

In [19]:
x = data[['Cuisines','Average Cost for two','Has Online delivery','Price range','Has Table booking']]
y = data['Aggregate rating']

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# GETTING DIFF CLASSIFIERS

In [21]:
clfs = []
clfs.append(LogisticRegression())
clfs.append(SVC())
clfs.append(SVC())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier())
clfs.append(GradientBoostingClassifier())

# Building  basic pipeline

In [22]:
pipeline = Pipeline([
    ('normalizer', StandardScaler()), #Step1 - normalize data
    ('clf', LogisticRegression()) #step2 - classifier
])
pipeline.steps

[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('clf',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [23]:
from sklearn.model_selection import cross_validate

scores = cross_validate(pipeline, x_train, y_train)
scores

{'fit_time': array([0.00698066, 0.0079782 , 0.00495505]),
 'score_time': array([0.00102496, 0.        , 0.0010283 ]),
 'test_score': array([0.63154865, 0.62348877, 0.63133641]),
 'train_score': array([0.63345811, 0.62597178, 0.63442717])}

In [24]:
scores['test_score'].mean()

0.6287912754568279

# Testing all classifiers

In [25]:
all_scores = []
for classifier in clfs:
    pipeline.set_params(clf = classifier)
    scores = cross_validate(pipeline, x_train, y_train)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    for key, values in scores.items():
            print(key,' mean ', values.mean())
            print(key,' std ', values.std())
    all_scores.append(scores['test_score'].mean())

---------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
-----------------------------------
test_score  mean  0.6287912754568279
test_score  std  0.0037504359658217415
score_time  mean  0.0009752114613850912
score_time  std  3.242512154364827e-05
fit_time  mean  0.006004174550374349
fit_time  std  3.4596900945266835e-05
train_score  mean  0.6312856869974769
train_score  std  0.003778267710210104
---------------------------------
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
-----------------------------------
test_score  mean  0.6900218608809617
test_score  std  0.0119580

In [26]:
print(all_scores)

[0.6287912754568279, 0.6900218608809617, 0.6900218608809617, 0.6675630556417786, 0.669672979968818, 0.6825333816182889, 0.7028783935436117]


# Found best classifier

In [27]:
# Best classifier is the Gradient Boosting Classifier
from sklearn.model_selection import GridSearchCV

In [28]:
pipeline.set_params(clf = GradientBoostingClassifier())

Pipeline(memory=None,
     steps=[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_dec...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

# Beginning Grid Search

In [29]:
grid = GridSearchCV(pipeline, param_grid = {
    'clf__loss' : ['deviance', 'exponential'],
    'clf__learning_rate': np.linspace(0.5, 1.5, 11),
    'clf__n_estimators' : [80,100,120,140,160,180,200]
    
}, verbose = 1)
grid.fit(x_train, y_train)

Fitting 3 folds for each of 154 candidates, totalling 462 fits


[Parallel(n_jobs=1)]: Done 462 out of 462 | elapsed:  1.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_dec...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__loss': ['deviance', 'exponential'], 'clf__learning_rate': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3, 1.4, 1.5]), 'clf__n_estimators': [80, 100, 120, 140, 160, 180, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [30]:
grid.best_params_

{'clf__learning_rate': 0.5,
 'clf__loss': 'exponential',
 'clf__n_estimators': 120}

In [31]:
grid.best_score_

0.7034548944337812

In [32]:
grid2 = GridSearchCV(pipeline, param_grid = {
    'clf__loss' : ['exponential'],
    'clf__learning_rate': np.linspace(0.1, 0.6, 11),
    'clf__n_estimators' : [80,100,120,140,160,180,200]
    
}, verbose = 1)
grid2.fit(x_train, y_train)

Fitting 3 folds for each of 77 candidates, totalling 231 fits


[Parallel(n_jobs=1)]: Done 231 out of 231 | elapsed:   48.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_dec...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__loss': ['exponential'], 'clf__learning_rate': array([0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 ]), 'clf__n_estimators': [80, 100, 120, 140, 160, 180, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [33]:
grid2.best_params_

{'clf__learning_rate': 0.2,
 'clf__loss': 'exponential',
 'clf__n_estimators': 80}

In [34]:
grid2.best_score_

0.7057581573896353

In [35]:
grid3 = GridSearchCV(pipeline, param_grid = {
    'clf__loss' : ['exponential'],
    'clf__learning_rate': (np.linspace(0.01, 0.15, 15)),
    'clf__n_estimators' : [80,100,120,140,160,180,200, 220, 250, 300, 350]
    
}, verbose = 1)
grid3.fit(x_train, y_train)

Fitting 3 folds for each of 165 candidates, totalling 495 fits


[Parallel(n_jobs=1)]: Done 495 out of 495 | elapsed:  2.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_dec...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__loss': ['exponential'], 'clf__learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15]), 'clf__n_estimators': [80, 100, 120, 140, 160, 180, 200, 220, 250, 300, 350]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [36]:
grid3.best_params_

{'clf__learning_rate': 0.08999999999999998,
 'clf__loss': 'exponential',
 'clf__n_estimators': 180}

In [37]:
grid3.best_score_

0.7059500959692898

In [38]:
grid4 = GridSearchCV(pipeline, param_grid = {
    'clf__loss' : ['exponential'],
    'clf__learning_rate': (np.linspace(0.05, 0.10, 15)),
    'clf__n_estimators' : [160,165, 170, 175, 180, 185, 190, 195, 200]
    
}, verbose = 1)
grid4.fit(x_train, y_train)

Fitting 3 folds for each of 135 candidates, totalling 405 fits


[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:  1.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('normalizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_dec...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__loss': ['exponential'], 'clf__learning_rate': array([0.05   , 0.05357, 0.05714, 0.06071, 0.06429, 0.06786, 0.07143,
       0.075  , 0.07857, 0.08214, 0.08571, 0.08929, 0.09286, 0.09643,
       0.1    ]), 'clf__n_estimators': [160, 165, 170, 175, 180, 185, 190, 195, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [39]:
grid4.best_params_

{'clf__learning_rate': 0.09642857142857143,
 'clf__loss': 'exponential',
 'clf__n_estimators': 195}

In [40]:
grid4.best_score_

0.7071017274472169

# Predictions w/ best classifier

In [41]:
classifier = grid4.best_estimator_

In [42]:
pred = classifier.predict(x_test)

In [43]:
accuracy = accuracy_score(pred,y_test)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

Accuracy of the best classifier after CV is 69.532%


In [44]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test, pred)

array([[699, 145],
       [252, 207]], dtype=int64)

In [45]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       0.74      0.83      0.78       844
          1       0.59      0.45      0.51       459

avg / total       0.68      0.70      0.68      1303

