In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.linear_model import RidgeCV,LassoCV,LinearRegression,ElasticNet
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import r2_score
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize']=14,16
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')

In [0]:
train.head()

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300


In [0]:
train.isnull().sum()

TITLE               0
RESTAURANT_ID       0
CUISINES            0
TIME                0
CITY              112
LOCALITY           98
RATING              2
VOTES            1204
COST                0
dtype: int64

In [0]:
train['CITY'].value_counts()

Chennai                  2174
Bangalore                2149
Hyderabad                1819
Mumbai                   1722
New Delhi                1321
                         ... 
Old Railway Road            1
Chennai 600034.             1
Kadavanthra                 1
Naharpar                    1
Gurgaon Haryana India       1
Name: CITY, Length: 359, dtype: int64

In [0]:
combined = train.append(test)

In [0]:
combined.isnull().sum()

CITY              147
COST             4231
CUISINES            0
LOCALITY          128
RATING              4
RESTAURANT_ID       0
TIME                0
TITLE               0
VOTES            1606
dtype: int64

In [0]:
combined.reset_index(inplace=True,drop=True)

In [0]:
combined.head(20)

Unnamed: 0,CITY,COST,CUISINES,LOCALITY,RATING,RESTAURANT_ID,TIME,TITLE,VOTES
0,Thane,1200.0,"Malwani, Goan, North Indian",Dombivali East,3.6,9438,"11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",CASUAL DINING,49 votes
1,Chennai,1500.0,"Asian, Modern Indian, Japanese",Ramapuram,4.2,13198,6pm – 11pm (Mon-Sun),"CASUAL DINING,BAR",30 votes
2,Chennai,800.0,"North Indian, Chinese, Biryani, Hyderabadi",Saligramam,3.8,10915,"11am – 3:30pm, 7pm – 11pm (Mon-Sun)",CASUAL DINING,221 votes
3,Mumbai,800.0,"Tibetan, Chinese",Bandra West,4.1,6346,11:30am – 1am (Mon-Sun),QUICK BITES,24 votes
4,Mumbai,300.0,Desserts,Lower Parel,3.8,15387,11am – 1am (Mon-Sun),DESSERT PARLOR,165 votes
5,Mumbai,800.0,"North Indian, Chinese, Seafood, Biryani",Vile Parle West,4.0,5618,12noon – 12:30AM (Mon-Sun),CASUAL DINING,550 votes
6,Bangalore,600.0,"Cafe, Chinese, Pizza, North Indian, Burger",Koramangala 6th Block,4.1,4383,"12noon – 11:30pm (Mon, Tue, Thu, Fri, Sat, Sun...",CAFÉ,509 votes
7,Gurgaon,1600.0,Bar Food,Sector 29,4.3,1249,1pm – 1am (Mon-Sun),MICROBREWERY,1612 votes
8,Bangalore,300.0,"South Indian, North Indian",Bannerghatta Road,3.9,7062,7am – 10:30pm (Mon-Sun),QUICK BITES,101 votes
9,Hyderabad,200.0,North Indian,Dilsukhnagar,3.3,10109,"11am – 2pm, 4pm – 10:30pm (Mon-Sun)",QUICK BITES,219 votes


In [0]:
combined = combined.assign(No_of_Cuisines=np.nan)
for i in range(len(combined)):
  combined["No_of_Cuisines"][i] = len(str(combined.iloc[i,2]).split(','))

In [0]:
combined['Cuisine1'] = combined['CUISINES'].str.split(',').str[0]
combined['Cuisine2'] = combined['CUISINES'].str.split(',').str[1]
combined['Cuisine3'] = combined['CUISINES'].str.split(',').str[2]
combined['Cuisine4'] = combined['CUISINES'].str.split(',').str[3]
combined['Cuisine5'] = combined['CUISINES'].str.split(',').str[4]
combined['Cuisine6'] = combined['CUISINES'].str.split(',').str[5]
combined['Cuisine7'] = combined['CUISINES'].str.split(',').str[6]
combined['Cuisine8'] = combined['CUISINES'].str.split(',').str[7]

In [0]:
combined['Cuisine1'].fillna("None",inplace=True)
combined['Cuisine2'].fillna("None",inplace=True)
combined['Cuisine3'].fillna("None",inplace=True)
combined['Cuisine4'].fillna("None",inplace=True)
combined['Cuisine5'].fillna("None",inplace=True)
combined['Cuisine6'].fillna("None",inplace=True)
combined['Cuisine7'].fillna("None",inplace=True)
combined['Cuisine8'].fillna("None",inplace=True)

In [0]:
combined['VOTES'].fillna('0 votes',inplace=True)

In [0]:
combined['VOTES'] = combined['VOTES'].str.split().str[0].astype(int)

In [0]:
combined['RATING'].value_counts()

3.9    1643
3.8    1604
4.0    1498
3.7    1437
3.6    1261
4.1    1217
3.5     988
4.2     968
NEW     930
3.4     777
4.3     740
-       672
3.3     485
4.4     484
3.2     349
4.5     274
3.1     263
2.9     242
3.0     229
2.8     191
4.6     184
2.7     129
4.7      90
2.6      85
2.5      51
4.8      49
4.9      28
2.4      24
2.3      15
2.1       4
2.2       4
2.0       2
Name: RATING, dtype: int64

In [0]:
combined['RATING'] = combined['RATING'].replace({"-":"2.5","NEW":"0.0"},regex=True)

In [0]:
combined['TIME'] = combined["TIME"].str.rstrip('.')

In [0]:
combined["TIME"] = combined['TIME'].replace({"noon":"pm","midnight":"am","AM":"am","PM":"pm"},regex=True)

In [0]:
combined['TIME'] = combined['TIME'].str.replace(r"\(.*?\)","").str.strip()

In [0]:
ls = []
for i in range(len(combined)):
  ls.append(len(str(combined.iloc[i,6]).split(',')))
max(ls)

6

In [0]:
combined['Slot1'] = combined['TIME'].str.split(',').str[0]
combined['Slot2'] = combined['TIME'].str.split(',').str[1]
combined['Slot3'] = combined['TIME'].str.split(',').str[2]
combined['Slot4'] = combined['TIME'].str.split(',').str[3]
combined['Slot5'] = combined['TIME'].str.split(',').str[4]
combined['Slot6'] = combined['TIME'].str.split(',').str[5]

In [0]:
combined.head(10)

In [0]:
ls = []
for i in range(len(combined)):
  ls.append(len(str(combined.iloc[i,7]).split(',')))

max(ls)

2

In [0]:
combined["TITLE1"] = combined["TITLE"].str.split(",").str[0]
combined["TITLE2"] = combined["TITLE"].str.split(",").str[1]

In [0]:
combined.dtypes

CITY               object
COST              float64
CUISINES           object
LOCALITY           object
RATING             object
RESTAURANT_ID       int64
TIME               object
TITLE              object
VOTES               int64
No_of_Cuisines    float64
Cuisine1           object
Cuisine2           object
Cuisine3           object
Cuisine4           object
Cuisine5           object
Cuisine6           object
Cuisine7           object
Cuisine8           object
Slot1              object
Slot2              object
Slot3              object
Slot4              object
Slot5              object
Slot6              object
TITLE1             object
TITLE2             object
dtype: object

In [0]:
cuisines = ['Cuisine1','Cuisine2','Cuisine3','Cuisine4','Cuisine5','Cuisine6','Cuisine7','Cuisine8','TITLE1','TITLE2']
combined = pd.get_dummies(combined,columns = cuisines)
combined["RATING"] = combined["RATING"].astype(float)

In [0]:
combined = combined.select_dtypes(exclude=['object'])

In [0]:
combined.drop(['RESTAURANT_ID'],axis=1,inplace=True)

In [0]:
combined

Unnamed: 0,COST,RATING,VOTES,No_of_Cuisines,Cuisine1_Afghan,Cuisine1_African,Cuisine1_American,Cuisine1_Andhra,Cuisine1_Arabian,Cuisine1_Armenian,Cuisine1_Asian,Cuisine1_Assamese,Cuisine1_Awadhi,Cuisine1_BBQ,Cuisine1_Bakery,Cuisine1_Bar Food,Cuisine1_Bengali,Cuisine1_Beverages,Cuisine1_Bihari,Cuisine1_Biryani,Cuisine1_Bohri,Cuisine1_Brazilian,Cuisine1_British,Cuisine1_Bubble Tea,Cuisine1_Burger,Cuisine1_Burmese,Cuisine1_Cafe,Cuisine1_Cafe Food,Cuisine1_Cantonese,Cuisine1_Charcoal Chicken,Cuisine1_Chettinad,Cuisine1_Chinese,Cuisine1_Coffee,Cuisine1_Continental,Cuisine1_Desserts,Cuisine1_Drinks Only,Cuisine1_Egyptian,Cuisine1_Ethiopian,Cuisine1_European,Cuisine1_Fast Food,...,TITLE1_CASUAL DINING,TITLE1_CLUB,TITLE1_COCKTAIL BAR,TITLE1_CONFECTIONERY,TITLE1_DESSERT PARLOR,TITLE1_DHABA,TITLE1_FINE DINING,TITLE1_FOOD COURT,TITLE1_FOOD TRUCK,TITLE1_IRANI CAFE,TITLE1_KIOSK,TITLE1_LOUNGE,TITLE1_MEAT SHOP,TITLE1_MESS,TITLE1_MICROBREWERY,TITLE1_None,TITLE1_PAAN SHOP,TITLE1_PUB,TITLE1_QUICK BITES,TITLE1_SWEET SHOP,TITLE2_BAKERY,TITLE2_BAR,TITLE2_BEVERAGE SHOP,TITLE2_CAFÉ,TITLE2_CASUAL DINING,TITLE2_CLUB,TITLE2_CONFECTIONERY,TITLE2_DESSERT PARLOR,TITLE2_FINE DINING,TITLE2_FOOD COURT,TITLE2_FOOD TRUCK,TITLE2_IRANI CAFE,TITLE2_KIOSK,TITLE2_LOUNGE,TITLE2_MEAT SHOP,TITLE2_MICROBREWERY,TITLE2_PAAN SHOP,TITLE2_PUB,TITLE2_QUICK BITES,TITLE2_SWEET SHOP
0,1200.0,3.6,49,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1500.0,4.2,30,3.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,800.0,3.8,221,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,800.0,4.1,24,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,300.0,3.8,165,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16916,,3.9,287,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16917,,4.3,469,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16918,,3.7,53,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16919,,2.5,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
combined.shape

(16921, 688)

In [0]:
combined.isnull().sum()

COST                   4231
RATING                    0
VOTES                     0
No_of_Cuisines            0
Cuisine1_Afghan           0
                       ... 
TITLE2_MICROBREWERY       0
TITLE2_PAAN SHOP          0
TITLE2_PUB                0
TITLE2_QUICK BITES        0
TITLE2_SWEET SHOP         0
Length: 688, dtype: int64

In [0]:
combined["RATING"].fillna(0.0,inplace=True)

In [0]:
train_data = combined[0:12690]
test_data = combined[12690:]
test_data.drop(['COST'],axis = 1,inplace = True)
train_cost = train_data['COST']
train_data.drop(['COST'],axis=1,inplace=True)

In [0]:
train_data.shape

(12690, 687)

In [0]:
inner_cv = KFold(n_splits=5,shuffle=True,random_state=42)
outer_cv = KFold(n_splits=20,shuffle=True,random_state=42)

In [0]:
def rmse(model):
  rmse = np.sqrt(-cross_val_score(model,train_data,train_cost,scoring="neg_mean_squared_error",cv=outer_cv))
  return(rmse)

In [0]:
X_train,X_val,y_train,y_val = train_test_split(train_data,train_cost,test_size=0.2,shuffle=True)

In [0]:
gbr = make_pipeline(MinMaxScaler(),GradientBoostingRegressor())
Adbr = make_pipeline(MinMaxScaler(),AdaBoostRegressor())
br = make_pipeline(MinMaxScaler(),BaggingRegressor())
dtr = make_pipeline(MinMaxScaler(),DecisionTreeRegressor())
rfr = make_pipeline(MinMaxScaler(),RandomForestRegressor())
xgbr = make_pipeline(MinMaxScaler(),XGBRegressor())
model_names = ["Gradient Boosting Regressor","AdaBoost Regressor","Bagging Regressor","Decision Tree Regressor","Random Forest Regressor","XGB Regressor"]
models = [gbr,Adbr,br,dtr,rfr,xgbr]
for name,model in zip(model_names,models):
  print("RMSE score for ",name," is : ",rmse(model).mean())
  model.fit(X_train,y_train)
  pred_views = model.predict(X_val)
  print("R^2 Score for",name,"is : ",r2_score(y_val,pred_views),"\n\n")

RMSE score for  Gradient Boosting Regressor  is :  336.62107428938737
R^2 Score for Gradient Boosting Regressor is :  0.7361135456250265 


RMSE score for  AdaBoost Regressor  is :  765.749938422067
R^2 Score for AdaBoost Regressor is :  -0.48749491795558986 


RMSE score for  Bagging Regressor  is :  349.9985925788683
R^2 Score for Bagging Regressor is :  0.7005448662153176 


RMSE score for  Decision Tree Regressor  is :  449.8938386013391
R^2 Score for Decision Tree Regressor is :  0.557574613154977 


RMSE score for  Random Forest Regressor  is :  339.720625695333
R^2 Score for Random Forest Regressor is :  0.7289332413597132 


RMSE score for  XGB Regressor  is :  335.46566439207055
R^2 Score for XGB Regressor is :  0.7357832425174846 




In [0]:
gbr.fit(train_data,train_cost)
pred_cost = gbr.predict(test_data)

(4231,)

In [0]:
My_submission = pd.DataFrame({"COST":pred_cost.astype(int)})
My_submission["COST"] = My_submission["COST"].astype(int)
My_submission.to_excel('my_submission.xlsx',index=False)

In [0]:
My_submission.columns

Index(['COST'], dtype='object')