In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory once and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both archives sit in the same competition folder. Build both paths from one
# absolute root so that neither read depends on the notebook's working directory
# (the test set was previously addressed through a relative '../input' path).
DATA_DIR = "/kaggle/input/restaurant-revenue-prediction"

# 'Id' is used as the row index of both frames.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv.zip", index_col='Id')
X_test = pd.read_csv(f"{DATA_DIR}/test.csv.zip", index_col='Id')

# Show (rows, columns) for each split.
df_train.shape, X_test.shape


In [None]:
# Names of the columns that hold at least one null value, for each split.
train_with_missing = df_train.columns[df_train.isnull().any()].tolist()
test_with_missing = X_test.columns[X_test.isnull().any()].tolist()
train_with_missing, test_with_missing

In [None]:
# Separate the target column from the features; df_train itself is not modified.
y_train = df_train['revenue']
X_train = df_train.drop(columns='revenue')

In [None]:
# (rows, columns) of the feature frames for train and test.
(X_train.shape, X_test.shape)

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing how many rows each split contains.
d_names = ('train.csv.zip', 'test.csv.zip')
row_counts = (len(X_train), len(X_test))
y_pos = range(len(d_names))

plt.bar(y_pos, row_counts, align='center', alpha=0.8)
plt.xticks(y_pos, d_names)
plt.title('😱 Wow!')
plt.ylabel('Number of rows')
plt.show()

In [None]:
# Columns that exist in the training features but not in the test features.
train_only_columns = set(X_train.columns) - set(X_test.columns)
bad_label_cols = list(train_only_columns)
bad_label_cols

In [None]:
# Collect the columns whose dtype is 'object' (treated as categorical here).
is_object = X_train.dtypes == 'object'
object_cols = [column for column, flagged in is_object.items() if flagged]

print("Categorical variables:", object_cols, sep="\n")

In [None]:
# Frequency of each value in the City column.
X_train.City.value_counts()

In [None]:
# Frequency of each value in the Type column.
X_train['Type'].value_counts()

In [None]:
# Keep only the calendar year of the opening date, then discard the raw
# 'Open Date' and 'City' columns from the training features.
train_open_dates = pd.DatetimeIndex(X_train['Open Date'])
X_train['year'] = train_open_dates.year
X_train = X_train.drop(columns=['Open Date', 'City'])

In [None]:
# Same transformation as for the training features: derive 'year' from the
# opening date, then discard the raw 'Open Date' and 'City' columns.
test_open_dates = pd.DatetimeIndex(X_test['Open Date'])
X_test['year'] = test_open_dates.year
X_test = X_test.drop(columns=['Open Date', 'City'])

In [None]:
# Recompute the object-dtype (categorical) columns from the current X_train.
object_mask = X_train.dtypes == 'object'
object_cols = [name for name, is_obj in object_mask.items() if is_obj]

print("Categorical variables:")
print(object_cols)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data.
# scikit-learn renamed the `sparse` keyword to `sparse_output` in 1.2 and removed
# the old spelling in 1.4, so try the current keyword first and fall back to the
# legacy one on older installations. Either way a dense array is requested.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder is fitted on the training split only and then reused for the test
# split; categories unseen during fit are ignored (handle_unknown='ignore').
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

# One-hot encoding removed index; put it back so the concat below aligns rows.
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [None]:
# Peek at the first five encoded training rows.
OH_X_train.head(n=5)

In [None]:
# Peek at the first five encoded test rows.
OH_X_test.head(n=5)

In [None]:
# Summary statistics of the encoded training features, transposed so that each
# feature is a row and each statistic ('mean', 'std', ...) is a column.
train_stats = OH_X_train.describe().T
train_stats

In [None]:
def norm(x, stats=None):
    """Standardise the columns of ``x`` as ``(x - mean) / std``.

    Parameters
    ----------
    x : DataFrame
        Frame whose columns are matched by label against the index of ``stats``.
    stats : DataFrame, optional
        Table with 'mean' and 'std' columns, indexed by feature name. When
        omitted, the module-level ``train_stats`` is used.

    Returns
    -------
    DataFrame
        The standardised frame. Columns whose std is zero or NaN are only
        centred (divided by 1), so they come out as zeros wherever the value
        equals the mean instead of NaN/inf from a 0-division.
    """
    if stats is None:
        stats = train_stats
    spread = stats['std']
    # A constant column has std == 0 (and a single-row frame has std NaN);
    # dividing by that would fill the feature with NaN/inf.
    safe_spread = spread.where(spread > 0, 1.0)
    return (x - stats['mean']) / safe_spread
# Standardise both splits with norm(); train first, then test.
normed_train_data, normed_test_data = (
    norm(frame) for frame in (OH_X_train, OH_X_test)
)

In [None]:
# Peek at the first five standardised test rows.
normed_test_data.head(n=5)

In [None]:
# Hyper-parameter grid handed to the grid search.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; searching
# both probably makes one of the two axes redundant - confirm which one wins.
parameters = dict(
    n_estimators=list(range(10, 300, 20)),
    learning_rate=[percent / 100 for percent in range(5, 100, 20)],
    max_depth=list(range(1, 20, 3)),
    gamma=[2, 3],
    eta=[0.8, 0.9],
    reg_alpha=[0.5, 0.6, 0.7, 0.8],
    reg_lambda=[0.5, 0.6, 0.7, 0.8],
)
parameters

In [None]:
# Seed passed as random_state to the XGBoost estimators in later cells.
my_randome_state = 70

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# Exhaustive search over every combination in `parameters`, using cv=5 and
# scoring each candidate by negative root-mean-squared error.
base_regressor = XGBRegressor(random_state=my_randome_state)
gsearch = GridSearchCV(
    estimator=base_regressor,
    param_grid=parameters,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=4,
    verbose=7,
)
gsearch.fit(normed_train_data, y_train)

In [None]:
# Pull each tuned value out of the search result; .get yields None for a key
# that is absent from best_params_.
(
    best_n_estimators,
    best_learning_rate,
    best_max_depth,
    best_max_gamma,
    best_max_eta,
    best_max_reg_alpha,
    best_max_reg_lambda,
) = map(
    gsearch.best_params_.get,
    ('n_estimators', 'learning_rate', 'max_depth', 'gamma', 'eta', 'reg_alpha', 'reg_lambda'),
)

# Display the winners (depth first, matching the original ordering).
(
    best_max_depth,
    best_n_estimators,
    best_learning_rate,
    best_max_gamma,
    best_max_eta,
    best_max_reg_alpha,
    best_max_reg_lambda,
)

In [None]:
# Rebuild a regressor from the tuned values and fit it on the full
# standardised training set.
tuned_settings = dict(
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
    max_depth=best_max_depth,
    gamma=best_max_gamma,
    eta=best_max_eta,
    reg_alpha=best_max_reg_alpha,
    reg_lambda=best_max_reg_lambda,
)
final_model = XGBRegressor(random_state=my_randome_state, **tuned_settings)
final_model.fit(normed_train_data, y_train)

In [None]:
# Predict on the standardised test features with the fitted final model.
preds_test = final_model.predict(normed_test_data)

In [None]:
# Assemble the submission table: the test index as 'Id' next to its prediction.
submission_columns = {'Id': X_test.index, 'Prediction': preds_test}
output = pd.DataFrame(submission_columns)
output

In [None]:
# Write the submission to the working directory; index=False keeps the frame's
# positional index out of the file.
output.to_csv(path_or_buf='submission.csv', index=False)
print('done!')