<a href="https://colab.research.google.com/github/PashaIanko/Kaggle.Restaurant-Revenue-Prediction/blob/main/model_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Stretch tasks:
    - CV
    - best_params_
    - Short list of models
    - sklearn.metrics module

# Imports

In [15]:
from google.colab import drive
import pandas as pd
import numpy as np

In [72]:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR



# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Setup paths

In [16]:
# Mount Google Drive into the Colab filesystem; every path below lives under it.
drive.mount('/content/gdrive')

# Both prefixes end with '/', so file names are appended without a separator.
gdrive_path = '/content/gdrive/MyDrive/'
df_path = 'ML/Datasets/5.RestaurantRevenue/'

# CSV files without the "_processed" suffix.
train_path = f'{gdrive_path}{df_path}train.csv'
test_path = f'{gdrive_path}{df_path}test.csv'

# Processed splits that the cells below read.
train_processed_path = f'{gdrive_path}{df_path}trainval_processed.csv'
test_processed_path = f'{gdrive_path}{df_path}test_processed.csv'

# Submission-related files.
submission_path = f'{gdrive_path}{df_path}submission.csv'
sample_submission_path = f'{gdrive_path}{df_path}sampleSubmission.csv'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Load the data

In [28]:
# Read the processed test CSV; its first column is used as the row index.
df_test = pd.read_csv(
    test_processed_path,
    index_col=[0],
)
df_test.head(2)  # preview the first two rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,-1.032819,-1.653875,0.688913,-1.382555,-0.807152,-0.633614,-2.019553,-0.062452,-0.217343,-0.244523,...,-0.606797,-0.556056,-0.652386,2008.0,0.0,0.0,1.0,0.0,1.134031,14.84228
1,0.015056,0.397991,-0.26814,-1.382555,-0.807152,-0.159426,-0.182156,-0.062452,-0.217343,-0.244523,...,0.655776,0.253599,0.511872,2007.0,0.0,0.0,0.0,1.0,1.769,15.312391


In [26]:
# Read the processed train+validation CSV; its first column is used as the row index.
df_trainval = pd.read_csv(
    train_processed_path,
    index_col=[0],
)
df_trainval.head(2)  # preview the first two rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,-0.334236,-0.285964,-1.225193,-0.383325,0.014161,-0.633614,-0.182156,-0.062452,-0.217343,-0.244523,...,0.655776,0.253599,1.094,2009.0,0.0,0.0,0.0,1.0,2.069,15.278005
1,-0.683527,-0.285964,-1.225193,0.615905,1.656785,0.788952,-0.182156,-0.619713,-0.790338,-0.811816,...,-0.606797,-0.556056,-0.652386,2007.0,1.0,0.0,1.0,0.0,5.663,15.255171


In [29]:
# Split the train+validation array column-wise: everything except the last
# column goes to X, the last column goes to Y.
X_train_val, Y_train_val = (
    df_trainval.values[:, :-1],
    df_trainval.values[:, -1],
)

In [31]:
# Same column-wise split for the test array: all columns but the last go to X,
# the last column goes to Y (presumably the target -- confirm against the
# preprocessing notebook).
X_test, Y_test = (
    df_test.values[:, :-1],
    df_test.values[:, -1],
)

# Select and train models

## Preliminary look (shortlisting 2-5 models)

In [67]:
def plot_model_scores(models_dict_, X_, Y_, cv_, scoring_):
    """Cross-validate every model and print a summary ordered by mean score.

    Despite the name, nothing is plotted: one text line per model is printed.

    Args:
        models_dict_: mapping of display name -> estimator handed to
            ``cross_val_score``.
        X_: feature data forwarded to ``cross_val_score``.
        Y_: target data forwarded to ``cross_val_score``.
        cv_: forwarded as the ``cv`` argument.
        scoring_: forwarded as the ``scoring`` argument.

    Returns:
        dict mapping each model name to its array of scores, with entries
        ordered by ascending mean score.
    """
    collected = {
        label: cross_val_score(estimator, X_, Y_, cv=cv_, scoring=scoring_)
        for label, estimator in models_dict_.items()
    }

    # Ascending order by mean score; dicts preserve insertion order.
    sorted_res = dict(
        sorted(collected.items(), key=lambda entry: np.mean(entry[1]))
    )

    for model_name, scores in sorted_res.items():
        print(f'Model: {model_name}, mean: {np.mean(scores)}, std: {np.std(scores)}')

    return sorted_res

In [73]:
# Baseline comparison of the candidate regressors with 7-fold cross-validation.
# Scores are negated MSE, so values closer to zero are better; the summary is
# sorted by ascending mean, which puts the best model on the last line.
# The randomized estimators (forest, tree) get a fixed seed so that a fresh
# "Restart & Run All" reproduces the same scores and ranking.
_ = plot_model_scores(
    {
        'RF': RandomForestRegressor(random_state=42),
        'LinReg': LinearRegression(),
        'Tree': DecisionTreeRegressor(random_state=42),
        'Lasso': Lasso(),
        'SVM': SVR(),
        'Elastic': ElasticNet()
    },
    X_=X_train_val,
    Y_=Y_train_val,
    cv_=7,
    scoring_='neg_mean_squared_error'
)

Model: LinReg, mean: -0.3002927795320557, std: 0.0841214493207769
Model: Tree, mean: -0.2939816454239583, std: 0.07160641010456664
Model: SVM, mean: -0.23975755066794463, std: 0.10254520924351965
Model: Lasso, mean: -0.2331971333481854, std: 0.101789341582441
Model: Elastic, mean: -0.21569093211351922, std: 0.10074172443016455
Model: RF, mean: -0.19458775007617748, std: 0.06551366374469633


# Fine tuning

In [78]:
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class Model:
    """Bundle an estimator with its display name and hyper-parameter grid.

    Fields are positional in the order ``model, name, param_grid``.
    """

    # Estimator instance, e.g. ``RandomForestRegressor()``. Typed as ``Any``
    # because no common estimator base class is imported in this notebook.
    model: Any
    # Short label for the model; used as the key when models are collected
    # into a dict.
    name: str
    # List of grids, each mapping a parameter name to its candidate values
    # (presumably consumed by a grid search later -- confirm at the call site).
    param_grid: List[Dict[str, List[Any]]]

In [80]:
# Random forest candidate for fine tuning. The estimator is seeded so that
# repeated runs of the notebook give the same results for each grid point.
RFModel = Model(
    RandomForestRegressor(random_state=42),
    'RF',
    [
        {'n_estimators': [100, 200]}
    ]
)

# Elastic net candidate for fine tuning: 3 alpha values x 2 l1_ratio values.
ElasticNetModel = Model(
    model=ElasticNet(),
    name='Elastic',
    param_grid=[
        {
            'alpha': [0.1, 0.3, 0.5],
            'l1_ratio': [0.2, 0.4],
        },
    ],
)

In [81]:
# Index the shortlisted candidates by their display name.
shortlisted_models = {
    RFModel.name: RFModel,
    ElasticNetModel.name: ElasticNetModel,
}

# Saving best models (pickle)