In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Load the labelled training set and the unlabelled test set from the
# competition input directory.
train, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/restaurant-revenue-prediction/train.csv.zip',
        '../input/restaurant-revenue-prediction/test.csv.zip',
    )
)

In [None]:
# Quick overview of both datasets. Adjacent f-strings are used instead of a
# backslash continuation inside the literal, because the continuation leaked
# the next source line's indentation into the printed text.
print(
    f"Shape of training data: {train.shape}\n"
    f"Shape of test dataset: {test_data.shape}\n"
    f"Features available: {train.columns}"
)

In [None]:
# Lift the column limit so wide frames are displayed in full.
# (The analogous option for rows is 'display.max_rows'.)
pd.set_option('display.max_columns', None)

train.head()

In [None]:
# True if at least one cell anywhere in the training frame is missing.
train.isna().any().any()

There are no missing values.<br>
The last column, 'revenue', is our target column.<br>
Features P1 to P37 are all numerical, and we know nothing specific about what they represent, so I will feed them directly into my model.<br>
The Id column is redundant, so I will drop it. Alternatively, it can be used as the index column when reading the data, i.e., using pd.read_csv('filepath', index_col=col_name_or_positional_no).<br>
From the feature Open Date, I will keep only the month and year and drop the day values.<br>
City, City Group and Type are categorical columns.

In [None]:
# 'Id' is only a row identifier. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell safe to re-run once 'Id' is already gone.
train = train.drop(columns='Id', errors='ignore')

In [None]:
# Density curve of the target. sns.distplot is deprecated, so the equivalent
# kernel density plot is drawn with sns.kdeplot instead.
ax = sns.kdeplot(train['revenue'])
ax.set_title('Distribution of Target variable')
sns.despine();  # remove top and right spines

If we remove the outliers, our target variable will follow an approximately normal distribution (with a slight skew).

In [None]:
# Keep only the rows whose revenue lies below the outlier cut-off.
revenue_cutoff = 8e+06
train = train.loc[train['revenue'] < revenue_cutoff].copy()

In [None]:
sns.set_style('darkgrid')

# Side-by-side counts of each restaurant 'Type' in the train and test sets.
fig, axes = plt.subplots(1, 2, figsize=(9, 5))
panels = [(train, 'Train set'), (test_data, 'Test set')]
for ax, (frame, panel_title) in zip(axes, panels):
    sns.countplot(data=frame, x='Type', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# Stack train and test so that both receive identical preprocessing.
# Test rows have no 'revenue' column, so their revenue is NaN after the concat.
# ignore_index=True gives every row a unique label instead of repeating the
# row labels of the two source frames.
df = pd.concat([train, test_data], axis=0, ignore_index=True)

# Extract launch month and year with the vectorised .dt accessor, then drop
# the raw date and the identifier column.
df['Open Date'] = pd.to_datetime(df['Open Date'])
df['launch_Month'] = df['Open Date'].dt.month
df['launch_year'] = df['Open Date'].dt.year
df = df.drop(columns=['Id', 'Open Date'])

In [None]:
# Number of restaurant launches per calendar month (train and test combined).
month_ax = sns.countplot(data=df, x='launch_Month')
month_ax.set_title('Month-wise no of launches')
plt.show()

Second half of the year seems to witness more restaurant launches.

In [None]:
# Derive the bar order from the data itself. A hard-coded list of years
# silently leaves out any launch year that is not in the list.
launch_years = sorted(df['launch_year'].unique())

plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='launch_year', order=launch_years)
plt.title('Year-wise no of launches', fontsize=15)
plt.show()

Over time, the number of restaurant launches in the area has been increasing, due to several social factors.

# Preprocessing The Data

In [None]:
# Frequency of each restaurant type across the combined frame.
df.loc[:, 'Type'].value_counts()

In [None]:
# Frequency of each city group across the combined frame.
df.loc[:, 'City Group'].value_counts()

In [None]:
# Frequency of each city across the combined frame.
df.loc[:, 'City'].value_counts()

There are 63 different City values. I could create dummy variables to handle this feature, but that would lead to a very large number of columns. A better approach would be to reduce the number of categories in this column; for example, instead of individual cities, the categories could be zones, or tier-1, tier-2 and so on. Here, I am dropping this column because there is not much information about which countries are involved. Also, the feature City Group already covers the effect of this feature.

In [None]:
# Drop the high-cardinality 'City' column. Rebinding with errors='ignore'
# keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns='City', errors='ignore')

In [None]:
# Encode the remaining categorical columns as integers.
type_codes = {'FC': 0, 'IL': 1, 'DT': 2, 'MB': 3}
df['Type'] = df['Type'].map(type_codes)

from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
# The encoder expects a 2-D input, so the column is reshaped to (n_rows, 1);
# its float output is flattened and cast back to integers.
city_group_2d = df['City Group'].to_numpy().reshape(-1, 1)
df['City Group'] = encoder.fit_transform(city_group_2d).ravel().astype('int64')

In [None]:
# Mean revenue per launch month, computed on rows without any missing value
# (this excludes the test rows, whose revenue is NaN).
complete_rows = df.dropna()
complete_rows.groupby('launch_Month')['revenue'].mean()

Average revenue is a bit higher for restaurants launched in January, September and October.

In [None]:
# Mean revenue per launch year. Rows without a revenue value (the test rows)
# are excluded first, consistent with the month-wise summary, so that years
# present only in the test set do not show up as NaN entries.
df.dropna(subset=['revenue']).groupby('launch_year')['revenue'].mean()

In [None]:
# One-hot encode launch year and month. Both columns are cast to strings
# first; get_dummies then replaces them with indicator columns, dropping the
# first level of each to avoid a redundant column.
date_columns = ['launch_year', 'launch_Month']
df = df.astype({column: str for column in date_columns})
df = pd.get_dummies(df, columns=date_columns, drop_first=True)

In [None]:
# Re-split the combined frame into labelled (train) and unlabelled (test) rows.
# Test rows are exactly those whose 'revenue' is NaN after the concat, so the
# split uses that mask instead of a hard-coded row offset, which breaks as soon
# as the outlier filter or the input data changes.
has_revenue = df['revenue'].notna()
processed_df = df[has_revenue]
processed_test_data = df[~has_revenue].drop('revenue', axis=1)

# Sanity check: every original test row must end up in the test split.
assert len(processed_test_data) == len(test_data), "train/test re-split is misaligned"

In [None]:
# Shapes of the labelled and unlabelled frames after preprocessing.
tuple(frame.shape for frame in (processed_df, processed_test_data))

In [None]:
# Preview the first five preprocessed training rows.
processed_df.head(n=5)

# Regression Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Features and target are both taken from the labelled frame, so their rows
# always line up; a hard-coded row-count slice of the combined frame would
# silently misalign them if the number of training rows changed.
X = processed_df.drop('revenue', axis=1)
y = processed_df['revenue']

# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12345)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression  # kept importable; deliberately not benchmarked below
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Seed for the randomised estimators so the comparison table is repeatable
# (same value as the train/test split).
SEED = 12345

# Candidate regression models. LogisticRegression is a classification model,
# so it does not belong in a comparison on a continuous revenue target and is
# left out.
regressors = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=SEED),
    'Random Forest': RandomForestRegressor(random_state=SEED),
    'Support Vector Machines': SVR(),
    'K-nearest Neighbors': KNeighborsRegressor(),
    'XGBoost': XGBRegressor()
}

# Fit each model on the training split and score it on the held-out split.
# Scores are collected in a dict and turned into a frame once, instead of
# growing a DataFrame one row at a time.
scores = {}
for method, func in regressors.items():
    func.fit(X_train, y_train)
    pred = func.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    scores[method] = {
        'MAE': mean_absolute_error(y_test, pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R2-score': r2_score(y_test, pred),
    }

results = pd.DataFrame.from_dict(scores, orient='index')
results

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the XGBoost regressor.
parameters = dict(
    learning_rate=[.03, 0.05, .07, .09],  # so called `eta` value
    max_depth=[6, 7, 8, 9],
    min_child_weight=[4],
    subsample=[0.7],
    colsample_bytree=[0.7],
    n_estimators=[500, 700],
)

# 3-fold cross-validated search over the grid, run on all labelled rows.
xgb_grid = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=parameters,
    cv=3,
    n_jobs=5,
    verbose=True,
)
xgb_grid.fit(X, y)

# Best cross-validation score, then the parameter combination that achieved it.
print(xgb_grid.best_score_, xgb_grid.best_params_, sep='\n')

In [None]:
# Final XGBoost model, fit on all labelled rows. The hyper-parameters are
# hard-coded in this cell.
xgb_params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.05,
    'max_depth': 7,
    'min_child_weight': 4,
    'n_estimators': 500,
    'subsample': 0.7,
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X, y)

# Predict on the preprocessed test rows and write the submission file.
predicted_test_values = xgb.predict(processed_test_data)
submission1 = pd.DataFrame({
    'Id': test_data['Id'],
    'Prediction': predicted_test_values,
})
submission1.to_csv('submission_xgb.csv', index=False)

In [None]:
# Final KNN model for submission. It is fit on all labelled rows (X, y), the
# same data the XGBoost submission model is fit on, rather than only on the
# 80% training split: the hold-out split is for evaluation, and leaving it out
# of the final fit discards labelled data.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X, y)

# Predict on the preprocessed test rows and write the submission file.
predicted_test_values2 = knn.predict(processed_test_data)
submission2 = pd.DataFrame({
    'Id': test_data['Id'],
    'Prediction': predicted_test_values2,
})
submission2.to_csv('submission_knn.csv', index=False)