**Goal:**

The goal of this problem is to predict the Annual Turnover of a restaurant based on the variables provided in the data set. 

**Metric to measure:**

The measure of accuracy will be RMSE (Root mean square error)

The predicted Annual Turnover for each restaurant in the Test dataset will be compared with the actual Annual Turnover to calculate the RMSE value of the entire prediction. The lower the RMSE value, the better the model will be.

**Submission File Format:**
You are to submit a  '.csv' file with exactly 500 entries plus a header row. The file should have exactly two columns

1.    Registration Number
2.    Annual Turnover

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Train_dataset_(2).csv")

In [3]:
df

Unnamed: 0,Registration Number,Annual Turnover,Cuisine,City,Restaurant Location,Opening Day of Restaurant,Facebook Popularity Quotient,Endorsed By,Instagram Popularity Quotient,Fire Audit,...,Overall Restaurant Rating,Live Music Rating,Comedy Gigs Rating,Value Deals Rating,Live Sports Rating,Ambience,Lively,Service,Comfortablility,Privacy
0,60001,42000000,"indian,irish",Bangalore,Near Business Hub,14/02/09,84.30,Not Specific,95.80,1,...,10.0,4.0,,,,8.0,8,6,6,6
1,60002,50000000,"indian,irish",Indore,Near Party Hub,29/09/08,85.40,Tier A Celebrity,85.00,1,...,9.0,,4.0,,,5.0,7,7,3,8
2,60003,32500000,"tibetan,italian",Chennai,Near Business Hub,30/07/11,85.00,Tier A Celebrity,68.20,1,...,8.0,3.0,,,,7.0,10,5,2,8
3,60004,110000000,"turkish,nigerian",Gurgaon,Near Party Hub,30/11/08,85.60,Tier A Celebrity,83.60,0,...,9.0,6.0,,,,7.0,7,4,3,5
4,60005,20000000,"irish,belgian",Manesar,Near Party Hub,22/02/10,,Tier A Celebrity,76.80,1,...,6.0,,2.0,,,,6,2,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3488,63489,40500000,"algerian,belgian",-1,Near Party Hub,20/03/09,69.10,Not Specific,62.11,1,...,9.0,5.0,,,,7.0,7,6,6,8
3489,63490,32500000,"tibetan,greek",Bangalore,Near Party Hub,05/02/12,91.00,Not Specific,96.30,1,...,,4.0,,,,4.0,9,4,0,5
3490,63491,42500000,"indian,irish",Chennai,Near Party Hub,21/05/09,80.83,Not Specific,86.80,1,...,8.0,,,,3.0,6.0,8,3,3,7
3491,63492,53000000,"japanese,thai",Bangalore,Near Party Hub,22/06/08,79.40,Not Specific,86.00,1,...,7.0,3.0,2.0,,,7.0,6,3,3,6


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def preprocess(df):
    
    
    cuisines = ['indian', "irish", "tibetan", "italian", "turkish", "nigerian", "belgian", "greek", "chinese", "salvadorian", 
            "algerian", "welsh", "thai", "peruvian", "cuban", "japanese", "british", "nigerian", "cajun", "polish", "jewish",
           "korean", "swedish", "sapnish", "hawaiian", "latvian"]
    
    cities = ['Bangalore', 'Noida', 'Hyderabad', 'Pune', 'Chennai', 'Gurgaon']
    
    celeb_dict = {'Tier A Celebrity':2, 'Local Celebrity':1, 'Not Specific':0}
    
    type_ = ['Bar', 'Caffee', 'Gastro Bar']
    
    th = ['Arabian', 'Greek', "90's"]
    
    for cuisine in cuisines:
        df["cui_"+cuisine] = df["Cuisine"].apply(lambda x: 1 if cuisine in x.lower() else 0)
        
    for city in cities:
        df["city_"+city] = df["City"].apply(lambda x: 1 if x == city else 0)
        
    df["Restaurant Location"] = df["Restaurant Location"].apply(lambda x: 1 if x=="Near Business Hub" else 0)
    
    df["Endorsed By"].replace(celeb_dict, inplace=True)
    
    df["Opening Day of Restaurant"]=(pd.to_datetime('2016-05-22 00:00:00')-pd.to_datetime(df["Opening Day of Restaurant"])).dt.days
    
    df["Facebook Popularity Quotient"].fillna(0, inplace=True)
    
    df["Instagram Popularity Quotient"].fillna(0, inplace=True)
    
    df["Resturant Tier"].fillna(0, inplace=True)
    
    for t in type_:
        df["ty_"+t] = df["Restaurant Type"].apply(lambda x: 1 if x == t else 0)
        
    for t in th:
        df["th_"+t] = df["Restaurant Theme"].apply(lambda x: 1 if x == t else 0)
        
    ratings = ['Overall Restaurant Rating', 'Live Music Rating', 'Comedy Gigs Rating',
       'Value Deals Rating', 'Live Sports Rating', "Ambience"]

    for r in ratings:
        df[r].fillna(0,inplace=True)
        
    df.drop("Cuisine", axis=1, inplace=True)
    
    df.drop("City", axis=1, inplace=True)
    
    df.drop("Restaurant Type", axis=1, inplace=True)
    
    df.drop("Restaurant Theme", axis=1, inplace=True)  
    
    ids = df["Registration Number"]
    
    df.drop("Registration Number", axis=1, inplace=True)
        
    return df

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop("Annual Turnover", axis=1), df["Annual Turnover"], test_size=0.3, random_state=42)

In [6]:
X_train_pp = preprocess(X_train)
X_test_pp = preprocess(X_test)

In [7]:
def scaling(df, train=0, sk=None):
    if train==1:
        a = sk.fit_transform(df)
        return a
    else:
        a = sk.transform(df)
        return a

In [8]:
sk = StandardScaler()
X_train_scaled = scaling(X_train_pp, 1, sk)
X_test_scaled = scaling(X_test_pp, 0, sk)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score



class model_selection:
    
    def __init__(self, X, y, X_test, y_test):
        self.scores = []
        model_names = ["LR","XGB","DTR","ADB","RIDGE","LASSO","KNN","GBR","SVR","RFR"]
        models = [self.LR(X, y),self.XGB(X, y),self.DTR(X, y),self.ADB(X, y),self.RIDGE(X, y),self.LASSO(X, y),self.KNN(X, y),self.GBR(X, y),self.SVR(X, y),self.RFR(X, y)]
        for i in range(len(models)):
            print(model_names[i], "is starting")
            m = models[i]
            print(model_names[i], "is trained")
            self.scores_cal(m, X, y,X_test,y_test, model_names[i])
            print(model_names[i], "is scored")
            
            
        self.scores_df = pd.DataFrame(self.scores, columns=["model","train acc", "test acc", "train_rmse", "test_rmse"])
        self.models_dict = dict(zip(model_names, models))
    
    def scores_cal(self,m, X, y,X_test,y_test, mn):
        print(m)
        score = self.metric(m, X, y,X_test,y_test)
        score.insert(0, mn)
        print(score)
        self.scores.append(score)
        
    def get_scores(self):
        return self.scores_df
    
           
    def LR(self, X, y):
        lr = LinearRegression()
        lr.fit(X, y)
        return lr

    def XGB(self, X, y):
        xgb = XGBRegressor(random_state=42)
        xgb_params = {
                        'n_estimators': [100, 200, 300, 400, 500],
                        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
                        'booster': ['gbtree', 'gblinear', 'dart']
                    }
        bp = self.random_search_cv(xgb, xgb_params, X, y)
        return bp


    def DTR(self, X, y):
        dtr = DecisionTreeRegressor(random_state=42)
        dtr_params = {
                                'criterion': ['absolute_error', 'poisson', 'squared_error', 'friedman_mse'],
                                'splitter': ['best', 'random'],
                                'max_depth': [None] + list(range(1, 10)),
                                'min_samples_split': list(range(2, 10)),
                                'min_samples_leaf': list(range(1, 10))
                            }
        bp = self.random_search_cv(dtr, dtr_params, X, y)
        return bp


    def SVR(self, X, y):
        svr = SVR()
        svr_params = {
                        'C': [0.1, 1, 10, 100],
                        'kernel': ['rbf'],
                        'degree': [2, 3, 4, 5],
                        'gamma': ['scale', 'auto'],
                        'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5]
                    }
        bp = self.random_search_cv(svr, svr_params, X, y)
        return bp
    
    def ADB(self, X, y):
        adb = AdaBoostRegressor(random_state=42)
        adb_params = {
                        'n_estimators': [50, 100, 150, 200, 250, 300],
                        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
                        'loss': ['linear', 'square', 'exponential']
                    }
        bp = self.random_search_cv(adb, adb_params, X, y)
        return bp

        
    def RIDGE(self, X, y):
        rdg = Ridge()
        rdg.fit(X, y)
        return rdg
        
    def LASSO(self, X, y):
        lso = LassoCV()
        lso.fit(X, y)
        return lso
        
    def KNN(self, X, y):
        knn = KNeighborsRegressor()
        knn_params = {
                        'n_neighbors': list(range(1, 30)),
                        'weights': ['uniform', 'distance'],
                        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                        'p': [1, 2]
                    }
        bp = self.random_search_cv(knn, knn_params, X, y)
        return bp

        
    def GBR(self, X, y):
        gbr = GradientBoostingRegressor()
        
        gbr_params = {
                        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
                        'n_estimators': [100, 200, 300, 400, 500],
                        'max_depth': list(range(1, 10)),
                        'min_samples_split': list(range(2, 10)),
                        'min_samples_leaf': list(range(1, 10)),
                        'max_features': ['sqrt', 'log2']
                    }
        bp = self.random_search_cv(gbr, gbr_params, X, y)
        return bp
    
    def RFR(self, X, y):
        rf = RandomForestRegressor(random_state = 42)
        rf_params = {'bootstrap': [True, False],
                     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'max_features': ['log2', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10],
                     'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
        bp = self.random_search_cv(rf, rf_params, X, y)
        return bp

    
    def random_search_cv(self, model, params, X, y):
        random_search = RandomizedSearchCV(model, param_distributions=params, cv=3, verbose=1,scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
        random_search.fit(X, y)
#         print("Best Parameters: ",random_search.best_params_)
        return random_search.best_estimator_
    
    def metric(self, model, X, y, X_test, y_test):
        print(model)
        train_scores = cross_val_score(model, X, y, cv=3)
        test_scores = cross_val_score(model, X_test, y_test, cv=3)
        train_rmse = mean_squared_error(y, model.predict(X), squared=False)
        test_rmse = mean_squared_error(y_test, model.predict(X_test), squared=False)
        return [np.mean(train_scores), np.mean(test_scores), train_rmse, test_rmse]
    
    def get_models(self):
        return self.models_dict
    
    def get_preds(self, m, df):
        return self.models_dict[m].predict(df)
    
        
    


    

In [10]:
model_selector = model_selection(X_train, y_train, X_test, y_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
LR is starting
LR is trained
LinearRegression()
LinearRegression()
['LR', 0.08509127272728671, 0.04828296055415715, 20043779.51403427, 19504077.462467767]
LR is scored
XGB is starting
XGB is trained
XGBRegressor(base_score=None, booster='gblinear', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_cons

In [11]:
model_selector.get_scores()

Unnamed: 0,model,train acc,test acc,train_rmse,test_rmse
0,LR,0.085091,0.048283,20043780.0,19504080.0
1,XGB,0.078266,0.070679,20333040.0,19552250.0
2,DTR,0.033324,-0.031706,21124130.0,20360390.0
3,ADB,0.066839,0.057013,19948060.0,19983410.0
4,RIDGE,0.087235,0.053483,20044780.0,19499400.0
5,LASSO,0.071409,0.090339,20998190.0,19883430.0
6,KNN,0.001962,-0.009822,20732710.0,20504170.0
7,GBR,0.101674,0.029538,16920100.0,19281240.0
8,SVR,-0.00164,-0.00431,21989240.0,20872580.0
9,RFR,0.114178,0.110008,11842860.0,19255450.0


In [12]:
model_selector_scaled = model_selection(X_train_scaled, y_train, X_test_scaled, y_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
LR is starting
LR is trained
LinearRegression()
LinearRegression()
['LR', -7.666398411174068e+17, 0.048438537372777556, 20062335.2017596, 19570629.830787413]
LR is scored
XGB is starting
XGB is trained
XGBRegressor(base_score=None, booster='gblinear', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_c

In [13]:
model_selector_scaled.get_scores()

Unnamed: 0,model,train acc,test acc,train_rmse,test_rmse
0,LR,-7.666398e+17,0.048439,20062340.0,19570630.0
1,XGB,0.0883729,0.063088,20070840.0,19466310.0
2,DTR,0.03332441,-0.030545,21124130.0,20360390.0
3,ADB,0.06491384,0.056838,19912520.0,19970730.0
4,RIDGE,0.08526794,0.048929,20043800.0,19503540.0
5,LASSO,0.1010667,0.105961,20234420.0,19390290.0
6,KNN,0.05687081,0.091945,20417250.0,20054970.0
7,GBR,0.1039576,0.026477,17073320.0,19377550.0
8,SVR,-0.001618585,-0.004302,21988850.0,20872250.0
9,RFR,0.1144712,0.110323,11841710.0,19259470.0


In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train_scaled)
evr = pca.explained_variance_ratio_

In [15]:
X_test_pca = pca.transform(X_test_scaled)

In [16]:
model_selector_pca = model_selection(X_train_pca, y_train, X_test_pca, y_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
LR is starting
LR is trained
LinearRegression()
LinearRegression()
['LR', 0.08718579061727871, 0.06357632602631243, 20111780.92337125, 19450039.85206186]
LR is scored
XGB is starting
XGB is trained
XGBRegressor(base_score=None, booster='gblinear', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_const

In [17]:
model_selector_pca.get_scores()

Unnamed: 0,model,train acc,test acc,train_rmse,test_rmse
0,LR,0.087186,0.063576,20111780.0,19450040.0
1,XGB,0.08893,0.067505,20119890.0,19451290.0
2,DTR,0.000332,-0.025091,21002430.0,20835650.0
3,ADB,0.011715,-0.670158,19697630.0,20565380.0
4,RIDGE,0.087241,0.063865,20111780.0,19449770.0
5,LASSO,0.093788,0.093988,20153560.0,19385410.0
6,KNN,0.045733,0.087152,20651280.0,20014780.0
7,GBR,0.064929,0.059063,17997850.0,19937410.0
8,SVR,-0.001618,-0.004302,21988840.0,20872230.0
9,RFR,0.095066,0.072827,12519980.0,19506210.0


In [18]:
models_pca = model_selector_pca.get_models()

In [19]:
pca_svr = models_pca["SVR"]

In [30]:
sub_df = pd.read_csv("Test_dataset_(2).csv")

In [21]:
sub_df.rename(columns={"Endoresed By":"Endorsed By"}, inplace=True)

In [22]:
sub_X_pp = preprocess(sub_df)

In [23]:
sub_X_scaled = scaling(sub_X_pp, 0, sk)

In [24]:
sub_X_pca = pca.transform(sub_X_scaled)

In [27]:
sub_preds = model_selector_pca.get_preds("SVR", sub_X_pca)

In [26]:
model_selector_pca.models_dict

{'LR': LinearRegression(),
 'XGB': XGBRegressor(base_score=None, booster='gblinear', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=400, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...),
 'DTR': DecisionTreeRegressor(criterion='absolute_error', max_depth=5,
                       min_samples_leaf=9, min_samples_split=8, random_state=42),
 'ADB': AdaBoostRegressor(learning_rate=0.01, random_sta

In [34]:
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Predictions":sub_preds})

In [36]:
predictions.to_csv("Submission.csv")

In [41]:
sub_preds_lasso = model_selector_pca.get_preds("LASSO", sub_X_pca)

In [42]:
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_lasso})

In [43]:
predictions.to_csv("Submission_lasso.csv",index=False)

In [44]:
sub_preds_dtr = model_selector_pca.get_preds("DTR", sub_X_pca)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_dtr})
predictions.to_csv("Submission_dtr.csv",index=False)

In [45]:
sub_preds_rfr = model_selector_pca.get_preds("RFR", sub_X_pca)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_rfr})
predictions.to_csv("Submission_rfr.csv",index=False)

In [46]:
sub_preds_ridge = model_selector_pca.get_preds("RIDGE", sub_X_pca)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_ridge})
predictions.to_csv("Submission_ridge.csv",index=False)

In [47]:
sub_preds_sc_lasso = model_selector_scaled.get_preds("LASSO", sub_X_scaled)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_sc_lasso})
predictions.to_csv("Submission_sc_lasso.csv",index=False)

In [48]:
sub_preds_sc_ridge = model_selector_scaled.get_preds("RIDGE", sub_X_scaled)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_sc_ridge})
predictions.to_csv("Submission_sc_ridge.csv",index=False)

In [49]:
sub_preds_sc_xgb = model_selector_scaled.get_preds("XGB", sub_X_scaled)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_sc_xgb})
predictions.to_csv("Submission_sc_xgb.csv",index=False)

In [50]:
sub_preds_ac_lasso = model_selector.get_preds("LASSO", sub_X_pp)
predictions= pd.DataFrame({"Registration Number":list(sub_df["Registration Number"]),"Annual Turnover":sub_preds_ac_lasso})
predictions.to_csv("Submission_ac_lasso.csv",index=False)

In [51]:
sub_X_pp.shape

(500, 65)

In [69]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Assuming you have 65 features
input_dim = 65

# Create a Sequential model
model = Sequential()

# Add the first layer (input layer)
model.add(Dense(128, input_dim=input_dim, activation='relu'))

# Add three hidden layers
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))

# Add the output layer
# Since it's a regression problem, we use linear activation function in the output layer
model.add(Dense(1, activation='linear'))

# Compile the model
# We use mean squared error loss function for regression problems
model.compile(loss='mean_squared_error', optimizer='adam')

# Now your model is ready to be trained with your data using the fit method
model.fit(X_train_pca, y_train, epochs=50, batch_size=32)


ImportError: cannot import name 'enum_type_wrapper' from 'google.protobuf.internal' (unknown location)