In [None]:
! pip install Plotly  #Import the Plotly library

In [None]:
! pip install cufflinks #Import the cufflinks library

In [None]:
# Import all the required libraries
import numpy as np
import pandas as pd

import cufflinks as cf
import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb

import sklearn
from sklearn.preprocessing import StandardScaler,scale
from sklearn.linear_model import LinearRegression
from collections import Counter

In [None]:
# Store the Plotly credentials used by the online plotting calls (py.iplot).
# Real values are read from the PLOTLY_USERNAME / PLOTLY_API_KEY environment
# variables so that secrets never live in the notebook; the original
# placeholder strings remain as fallbacks when the variables are not set.
import os

tls.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'username'),
    api_key=os.environ.get('PLOTLY_API_KEY', 'api-key'),
)

In [None]:
# Load the country lookup table and the restaurant listing (decoded as
# Latin-1), then attach the lookup columns to every restaurant through the
# shared 'Country Code' key.
file = "zomato.csv"
country = pd.read_excel('Country-Code.xlsx')
zomato = pd.read_csv(file, encoding='latin-1').merge(country, on='Country Code')

# Preview the first six merged rows
zomato.head(6)

In [None]:
# Keep only the restaurants located in the United States.
# .copy() makes `df` an independent frame: later cells add and rewrite columns
# on it, which on a plain slice of `zomato` triggers SettingWithCopyWarning and
# can silently lose the writes.
df = zomato.loc[zomato['Country'] == 'United States'].copy()
df.shape

In [None]:
# Bar chart of the total number of votes per rating category.
# Votes are summed per category first: passing the raw rows would draw one bar
# per restaurant at the same x position instead of one bar per category.
votes_by_rating = zomato.groupby('Rating text')['Votes'].sum()
data = [go.Bar(x=votes_by_rating.index.tolist(), y=votes_by_rating.tolist())]
layout = dict(title='Total votes by rating category',
              xaxis=dict(title='Rating text'),
              yaxis=dict(title='Votes'))

# The layout has to be part of the figure itself; a separate `layout=` keyword
# to py.iplot is handled as a plot option, not as figure layout.
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='basic')

In [None]:
# Histogram of how often each rating category occurs
Rating = zomato.loc[:, 'Rating text']
Rating.iplot(filename='histogram', kind='histogram')

In [None]:
# The geographical distribution of the aggregate ratings around the world:
# one marker per restaurant, coloured by its aggregate rating.
#
# Attributes that are not part of the Plotly schema were removed
# (`location='World'` on the trace, `colorscale='custom-colorscale'`,
# `colorbar=True` on the layout) and the unknown projection 'albers world' was
# replaced by a valid world projection. With only valid attributes left, the
# `validate=False` escape hatch is no longer needed, so mistakes are reported.
data = [dict(type='scattergeo',
             lon=zomato['Longitude'],
             lat=zomato['Latitude'],
             marker=dict(size=12,
                         autocolorscale=False,
                         color=zomato['Aggregate rating'],
                         colorbar=dict(title='Rating')))]
layout = dict(title='Rating for restaurants',
              geo=dict(scope='world',
                       projection=dict(type='equirectangular'),
                       showland=True,
                       landcolor="rgb(250,250,250)",
                       subunitcolor="rgb(217,217,217)",
                       countrycolor="rgb(217,217,217)",
                       countrywidth=0.5,
                       subunitwidth=0.5))
fig = dict(data=data, layout=layout)

py.iplot(fig, filename='Map')

In [None]:
# Grid of pairwise plots over the attributes of the US subset `df`
sb.pairplot(data=df)

In [None]:
# Selecting the data for Linear Regression.
# `.ix` has been removed from pandas; because the column labels are strings it
# resolved these integers by position, which `.iloc` does explicitly.
# NOTE(review): positions 17 and 20 are labelled 'agg rating' / 'Votes' below
# and position 16 is used as the target - confirm against the merged column order.
zomato_data = zomato.iloc[:, [17, 20]].values
zomato_target = zomato.iloc[:, 16].values
zomato_data_names = ['agg rating', 'Votes']

# Standardise the two predictors; the target is left unscaled
x, y = scale(zomato_data), zomato_target

In [None]:
# Linear Regression.
# `normalize=True` was dropped: the argument has been removed from current
# scikit-learn releases, and the predictors in `x` were already standardised
# with `scale`, so it did not change the fitted predictions.
LinReg = LinearRegression()
LinReg.fit(x, y)

# Linear Regression R^2 score, measured on the same data the model was fitted on
print(LinReg.score(x, y))

In [None]:
# Number of restaurants for every distinct entry of the 'Cuisines' column
# (an entry is the complete value of that column, not a single cuisine).
cuisines_data = zomato.groupby(['Cuisines'], as_index=False)['Restaurant ID'].count()
cuisines_data.columns = ['Popular Cusinies', 'Number of Restaurants']

# Show the 20 most frequent entries. The former `reindex(axis="index")` call
# was removed: it requested no new labels and its result was discarded.
cuisines_data.sort_values(by='Number of Restaurants', ascending=False).head(20).reset_index(drop=True)

In [None]:
# Changing the textual input for Yes and No to 1 and 0.
# The replacement is done on the frame and assigned back. Calling
# `df[col].replace(..., inplace=True)` works on a temporary column selection
# and does not reliably write through to `df` (it has no effect under pandas
# copy-on-write).
yes_no_columns = ['Has Table booking', 'Has Online delivery', 'Switch to order menu']
df = df.replace({column: {'Yes': 1, 'No': 0} for column in yes_no_columns})
df.head()

In [None]:
# Separating the Cuisines.
# Every 'Cuisines' entry is a comma-separated list. Each name is trimmed at its
# ends only; deleting every space (as before) glued multi-word names together
# ("North Indian" -> "NorthIndian"), so they could no longer be found in the
# original column. Empty fragments are skipped.
cuisine_names = df['Cuisines'].str.cat(sep=',').split(',')
cuisines = sorted({name.strip() for name in cuisine_names if name.strip()})
cuisines

In [None]:
# Creating a separate 0/1 indicator column for every cuisine in `cuisines`.
#
# Differences from the previous substring search:
# * a cuisine is flagged only when it is one of the comma-separated names of
#   the row, so 'Indian' no longer fires for 'North Indian', and names are
#   never interpreted as regular expressions;
# * spaces are ignored on both sides of the comparison, so the names in
#   `cuisines` match whether or not they still contain spaces;
# * rows without a 'Cuisines' value get 0 instead of NaN;
# * all indicator columns are built first and attached in one step instead of
#   being inserted one by one with a chained in-place replace.
cuisine_sets = (
    df['Cuisines']
    .fillna('')
    .str.replace(' ', '', regex=False)
    .str.split(',')
    .apply(set)
)
cuisine_flags = pd.DataFrame(
    {
        cuisine: [int(cuisine.replace(' ', '') in names) for names in cuisine_sets]
        for cuisine in cuisines
    },
    index=df.index,
)

# Dropping same-named columns first keeps the cell safe to re-run
df = pd.concat(
    [df.drop(columns=cuisine_flags.columns, errors='ignore'), cuisine_flags],
    axis=1,
)
df.head()

In [None]:
# Finding the attributes most correlated with 'Average Cost for two'.
# Only numeric/boolean columns are passed to corr(): recent pandas versions
# raise on text columns instead of silently skipping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = (
    numeric_df.corr()[['Average Cost for two']]
    .sort_values('Average Cost for two', ascending=False)
)

# Keep the attributes whose correlation with the cost exceeds 0.2
corr[corr['Average Cost for two'] > 0.2]

In [None]:
# Selecting the subset of columns used for modelling and dropping rows with
# missing values. The preview is now the last expression of the cell; in the
# middle of the cell `df.head()` produced no output at all.
df = df[['Price range','Has Table booking','Aggregate rating','Steak','Votes','Average Cost for two']].dropna()
df.head()

In [None]:
# Rescale every column of `df` (the target included) with a min-max scaler
# and rebuild a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next cell, so test rows influence the scaling - consider fitting on
# the training rows only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df.head()

In [None]:
# Hold out a test set (fixed seed) and separate the predictors from the
# 'Average Cost for two' target in both parts
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=50)

X_train, y_train = train.drop(columns='Average Cost for two'), train['Average Cost for two']
X_test, y_test = test.drop(columns='Average Cost for two'), test['Average Cost for two']

for label, part in (('Training set size - ', X_train), ('Testing set size - ', X_test)):
    print(label, part.shape)

In [None]:
# Fit a linear regression on the training split and report R^2 on the test split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
score

In [None]:
#Import modules for validation curve and define a function
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

def plot_validation_curve(model,param_name,x_label,param_range=np.arange(1,7)):
    """Print and plot mean training/validation R^2 over a range of one hyper-parameter.

    The scores come from ``validation_curve`` run with 3-fold cross-validation
    on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``validation_curve``.
    param_name : str
        Name of the estimator parameter that is varied.
    x_label : str
        Label for the x-axis of the plot.
    param_range : array-like, default ``np.arange(1, 7)``
        Values tried for ``param_name``.

    Returns
    -------
    None
        The mean scores are printed and the figure is shown.
    """
    train_scores, validation_scores = validation_curve(
        model, X_train, y_train,
        param_name=param_name, param_range=param_range,
        scoring='r2', cv=3,
    )

    # Negative fold scores are floored at zero *before* averaging, so they
    # never show up as negative values in the printout or on the plot.
    validation_scores = np.where(validation_scores < 0, 0, validation_scores)

    mean_train = train_scores.mean(axis=1)
    mean_validation = validation_scores.mean(axis=1)
    print('Training scores  ', mean_train)
    print('Validation scores  ', mean_validation)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(param_range, mean_validation, lw=2, label='validation')
    ax.plot(param_range, mean_train, lw=2, label='training')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Score')
    ax.set_title('Validation curve')
    ax.legend(loc='best')
    plt.show()

In [None]:
# Validation curve over the polynomial degree of a
# PolynomialFeatures -> LinearRegression pipeline
model = make_pipeline(PolynomialFeatures(), LinearRegression())
plot_validation_curve(
    model,
    param_name='polynomialfeatures__degree',
    x_label='Degree of polynomial',
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Validation curve over the maximum depth of a decision-tree regressor (fixed seed)
plot_validation_curve(
    DecisionTreeRegressor(random_state=42),
    param_name='max_depth',
    x_label='Max Depth',
)