In [1]:
# *** NOTE: Because of hyper-parameter tuning over a large search space, running this notebook may take considerable time. The final models are saved as pickle files. ***
import os
import pickle
from tqdm import tqdm # progress bar

import pandas as pd
import numpy as np
import time, copy

# visuals.
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# models
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV # hyper-parameters tuning

# metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# number of CPU workers made available to parallel routines
WORKERS = 4

# one fixed seed shared by every source of randomness, for reproducibility
SEED = 111
np.random.seed(SEED)  # seeds numpy's global random generator
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only reaches processes launched after this point.
os.environ['PYTHONHASHSEED'] = f"{SEED}"

# functions:
def modelEvaluation(y_val, preds):
    """Score a series of predictions against the true target values.

    Inputs:
        y_val - true target values
        preds - predicted values, aligned with ``y_val``

    Output:
        Tuple ``(mse, mae, r2)``: mean squared error, mean absolute error
        and R-squared score, in that order.
    """
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    mse, mae, r2 = (scorer(y_val, preds) for scorer in scorers)
    return mse, mae, r2

def measureParams(model, paramsDict, data=None):
    """
    Measures a model's validation performance while sweeping one
    parameter at a time (all other parameters stay at their defaults).

    Inputs:
        model - SKlearn model class (not an instance); a fresh estimator is
            built for every value tried
        paramsDict - Dictionary mapping a parameter name to the list of
            values to evaluate for it
        data - Optional ``(X_train, y_train, X_val, y_val)`` tuple. When
            omitted, the notebook-level variables with those names are used,
            so they must already be defined at call time.

    Output:
        List with one dictionary per parameter, each holding:
            "title" - "<parameter>_plot"
            "Xs"    - the values tried, in the order given
            "Ys"    - dict of lists aligned with "Xs" under the keys
                      "MSE", "MAE", "R2" and "Time" (fit duration in seconds)
    """
    if data is None:
        # fall back to the splits loaded at notebook level
        data = (X_train, y_train, X_val, y_val)
    fit_X, fit_y, val_X, val_y = data

    results = []
    for k in tqdm(paramsDict.keys()):
        Xs = []
        # fresh metric lists for every swept parameter
        Ys = {"MSE": [], "MAE": [], "R2": [], "Time": []}
        for v in paramsDict[k]:
            reg = model()
            reg.set_params(**{k: v})

            # perf_counter is monotonic, so the elapsed time cannot be
            # distorted by system clock adjustments
            start_time = time.perf_counter()
            reg.fit(fit_X, fit_y)
            delta = time.perf_counter() - start_time  # training time only

            mse, mae, r2 = modelEvaluation(val_y, reg.predict(val_X))

            Xs.append(v)
            Ys["MSE"].append(mse)
            Ys["MAE"].append(mae)
            Ys["R2"].append(r2)
            Ys["Time"].append(delta)

        results.append({
            "title": f"{k}_plot",
            "Xs": Xs,
            "Ys": Ys
        })
    return results

  import pandas.util.testing as tm


In [2]:
# load the pre-processed train/validation splits, one pickle file per split
modeling_data = dict.fromkeys(("X_train", "X_val", "y_train", "y_val"))
for k in modeling_data:
    with open(f'./modeling_data/{k}.pickle', 'rb') as handle:
        modeling_data[k] = pickle.load(handle)

# expose each split under its own notebook-level name
X_train, X_val, y_train, y_val = (
    modeling_data[name] for name in ("X_train", "X_val", "y_train", "y_val")
)

# model under study: "alg" holds the sklearn estimator class itself,
# not an instance (i.e. no parentheses), so fresh models can be built on demand
model_obj = dict(name="SVM", alg=SVR)

In [3]:
# collects one metrics dictionary per evaluated model
models_perfs = []

# baseline: estimator with default parameters, fitted on the training split
model = model_obj["alg"]().fit(X_train, y_train)

# score the baseline on the validation split
preds = model.predict(X_val)
mse, mae, r2 = modelEvaluation(y_val, preds)
models_perfs.append(dict(zip(
    ("model", "MSE", "MAE", "R-Squared"),
    (f"Basic {model_obj['name']}", mse, mae, r2),
)))
print("MSE: ", mse)
print("MAE: ", mae)
print("R-Squared: ", r2)

# persist the fitted baseline so it can be reloaded without retraining
with open(f"./models/{model_obj['name']}_basic.pickle", 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

MSE:  0.04633360782250399
MAE:  0.1543086073229059
R-Squared:  0.5952389686726494


NameError: name 'model_tuned' is not defined

In [None]:
# determining optimal range for tuning search space:
results = measureParams(
    model_obj["alg"],

    # a dictionary whose keys are parameters of the model and whose values are
    # the list of settings to evaluate for that parameter.
    # NOTE: numeric ranges are listed in ascending order; non-numeric settings
    # (e.g. kernel) are drawn as categories on the x-axis.
    {
        "C":[1, 5, 10],
        "epsilon":[0.001, 0.1, 1],
        "gamma":[1e-2, 1e-1, 1],
        "kernel":["rbf", "linear"]
    }
)

# squeeze=False always returns an array of axes, so the loop below also works
# when only a single parameter is swept; ravel() keeps `axs` one-dimensional.
fig, axs = plt.subplots(len(results), squeeze=False)
axs = axs.ravel()
for ax, result in zip(axs, results):
    ax.plot(result["Xs"], result["Ys"]['MAE'])
    ax.set_title(result["title"])
    ax.set_ylabel("MAE")  # label every subplot, not only the last one

fig.tight_layout(pad=1.4)

In [None]:
# Hyper-Parameters tuning via Random Search:
# SVR exposes no `random_state` argument, so the estimator is built with its
# defaults; only the random search itself is seeded.
alg = model_obj["alg"]()

# search space limited to parameters SVR actually accepts, using the same
# values as the exploratory sweep in the previous cell
params = {
    'C': [1, 5, 10],
    'epsilon': [0.001, 0.1, 1],
    'gamma': [1e-2, 1e-1, 1],
    'kernel': ["rbf", "linear"]
}

# 10 sampled candidates x 3-fold cross validation, spread over WORKERS processes
tuning = RandomizedSearchCV(alg, params, random_state=SEED, n_iter=10, cv=3, n_jobs=WORKERS)

# fit() returns the fitted search object; with the default refit=True its
# predict() delegates to the best candidate refitted on the full training split
model_tuned = tuning.fit(X_train, y_train)

# save tuned model
with open(f"./models/{model_obj['name']}_tuned.pickle", 'wb') as handle:
    pickle.dump(model_tuned, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# reload the tuned model saved by the tuning cell, so this cell can be run
# without repeating the search
with open(f'./models/{model_obj["name"]}_tuned.pickle', 'rb') as handle:
    model_tuned = pickle.load(handle)

# score the tuned model on the validation split and record it next to the
# models evaluated earlier
tuned_preds = model_tuned.predict(X_val)
mse, mae, r2 = modelEvaluation(y_val, tuned_preds)
models_perfs.append(dict(zip(
    ("model", "MSE", "MAE", "R-Squared"),
    (f"Tuned {model_obj['name']}", mse, mae, r2),
)))
print("MSE: ", mse)
print("MAE: ", mae)
print("R-Squared: ", r2)

In [None]:
# one row per evaluated model, one column per metric
models_perf_df = pd.DataFrame(models_perfs)

# one horizontal bar chart per metric, stacked vertically
metric_names = ["MSE", "MAE", "R-Squared"]
fig = make_subplots(rows=len(metric_names), cols=1, subplot_titles=metric_names)

for row_idx, metric in enumerate(metric_names, start=1):
    bars = go.Bar(
        y=models_perf_df["model"],
        x=models_perf_df[metric],
        orientation='h',
        name=metric,
    )
    fig.add_trace(bars, row=row_idx, col=1)

# optional: zoom the x-axes to make small differences visible
# fig.update_xaxes(range=[0.028, 0.033], row=1, col=1)
# fig.update_xaxes(range=[0.11, 0.13], row=2, col=1)
# fig.update_xaxes(range=[0.7, 0.73], row=3, col=1)

fig.update_layout(title=f'Basic VS Tuned version of {model_obj["name"]}')
fig.show()

### Appendix
----
Learning Curve:

In [None]:
# import matplotlib.pyplot as plt
# import matplotlib.ticker as mticker
# import matplotlib.dates as mdates
# from sklearn.model_selection import ShuffleSplit, learning_curve


# print(__doc__)

# def plot_learning_curve(estimator, title, X,y ,axes=None, ylim=None, cv=None,
#                         n_jobs=None, train_sizes=np.linspace(0.1,0.9,num=9)):
#     """
#     Generate 3 plots: the test and training learning curve, the training
#     samples vs fit times curve, the fit times vs score curve.

#     Parameters
#     ----------
#     estimator : object type that implements the "fit" and "predict" methods
#         An object of that type which is cloned for each validation.

#     title : string
#         Title for the chart.

#     X : array-like, shape (n_samples, n_features)
#         Training vector, where n_samples is the number of samples and
#         n_features is the number of features.

#     y : array-like, shape (n_samples) or (n_samples, n_features), optional
#         Target relative to X for classification or regression;
#         None for unsupervised learning.

#     axes : array of 3 axes, optional (default=None)
#         Axes to use for plotting the curves.

#     ylim : tuple, shape (ymin, ymax), optional
#         Defines minimum and maximum yvalues plotted.

#     cv : int, cross-validation generator or an iterable, optional
#         Determines the cross-validation splitting strategy.
#         Possible inputs for cv are:
#           - None, to use the default 5-fold cross-validation,
#           - integer, to specify the number of folds.
#           - :term:`CV splitter`,
#           - An iterable yielding (train, test) splits as arrays of indices.

#         For integer/None inputs, if ``y`` is binary or multiclass,
#         :class:`StratifiedKFold` used. If the estimator is not a classifier
#         or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

#         Refer :ref:`User Guide <cross_validation>` for the various
#         cross-validators that can be used here.

#     n_jobs : int or None, optional (default=None)
#         Number of jobs to run in parallel.
#         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
#         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
#         for more details.

#     train_sizes : array-like, shape (n_ticks,), dtype float or int
#         Relative or absolute numbers of training examples that will be used to
#         generate the learning curve. If the dtype is float, it is regarded as a
#         fraction of the maximum size of the training set (that is determined
#         by the selected validation method), i.e. it has to be within (0, 1].
#         Otherwise it is interpreted as absolute sizes of the training sets.
#         Note that for classification the number of samples usually have to
#         be big enough to contain at least one sample from each class.
#         (default: np.linspace(0.1, 0.9, num=9))
#     """
#     if axes is None:
#         _, axes = plt.subplots(1, 3, figsize=(20, 5))

#     axes[0].set_title(title)
#     if ylim is not None:
#         axes[0].set_ylim(*ylim)
#     axes[0].set_xlabel("Training examples")
#     axes[0].set_ylabel("Score")

#     train_sizes, train_scores, test_scores, fit_times, _ = \
#         learning_curve(estimator, X,y,cv=cv, n_jobs=n_jobs,
#                        train_sizes=train_sizes,
#                        return_times=True)
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#     fit_times_mean = np.mean(fit_times, axis=1)
#     fit_times_std = np.std(fit_times, axis=1)

#     # Plot learning curve
#     axes[0].grid()
#     axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
#                          train_scores_mean + train_scores_std, alpha=0.1,
#                          color="r")
#     axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
#                          test_scores_mean + test_scores_std, alpha=0.1,
#                          color="g")
#     axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
#                  label="Training score")
#     axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
#                  label="Cross-validation score")
#     axes[0].legend(loc="best")

#     # Plot n_samples vs fit_times
#     axes[1].grid()
#     axes[1].plot(train_sizes, fit_times_mean, 'o-')
#     axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
#                          fit_times_mean + fit_times_std, alpha=0.1)
#     axes[1].set_xlabel("Training examples")
#     axes[1].set_ylabel("fit_times")
#     axes[1].set_title("Scalability of the model")

#     # Plot fit_time vs score
#     axes[2].grid()
#     axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
#     axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
#                          test_scores_mean + test_scores_std, alpha=0.1)
#     axes[2].set_xlabel("fit_times")
#     axes[2].set_ylabel("Score")
#     axes[2].set_title("Performance of the model")

#     return plt

# # read data
# new_df = pd.read_csv('./processed_data/new_TRAIN_DF.csv')

# # features
# X = new_df.drop("price", axis=1).values

# # label
# y = new_df["price"].values
# y = np.log10(y) # transform label

# # model
# estimator = model_obj["alg"]()

# # cross-validation
# cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=SEED)

# # plot
# fig, axes = plt.subplots(3,1 , figsize=(15, 15))
# title = f"Learning Curves ({model_obj['name']})"

# # learning curve
# plot_learning_curve(estimator, title, X, y, axes=axes[:], ylim=(0.1, 1.01), cv=cv, n_jobs=WORKERS)
# plt.show()