# 0.0 Imports

In [17]:
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn import ensemble
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import KFold
from scipy.stats import yeojohnson
from sklearn import preprocessing

## 0.1 Load data

In [2]:
train_feature_selection = catalog.load("train_feature_selection")
validation_feature_selection = catalog.load("validation_feature_selection")
yeojohnson_transformation = catalog.load("yeojohnson_transformation")
exploratory_data_analysis = catalog.load("exploratory_data_analysis")

## 0.2 Helper Functions

- For the next cycle we can modularize the cross-validation function

In [3]:
def notebook_settings():
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 30)
    pd.set_option('display.float_format', lambda x: '%.3f' % x)

    warnings.filterwarnings('ignore')
    return None


def ml_error(model_name, y, yhat):
    mae = mean_absolute_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    
    return pd.DataFrame({'Model Name': model_name,
                         'MAE' : mae,
                         'RMSE': rmse}, index=[0])


def yeojohnson_inverse(y):
    lambda_val = yeojohnson_transformation.value
    
    y_positive_mask = y >= 0
    y_negative_mask = ~y_positive_mask

    result = np.empty_like(y)

    if lambda_val == 0:
        result[y_positive_mask] = np.exp(y[y_positive_mask]) - 1
        result[y_negative_mask] = -np.exp(-y[y_negative_mask]) - 1
    elif lambda_val != 0:
        result[y_positive_mask] = (lambda_val * y[y_positive_mask] + 1) ** (1 / lambda_val) - 1
        result[y_negative_mask] = -((-lambda_val * y[y_negative_mask] + 1) ** (1 / -lambda_val) - 1)

    return result


def cross_validation(df, model, model_name, n_splits=5):
    # Create a KFold object
    kf = KFold(n_splits=n_splits)
    # Step 1: Split data
    X = df.drop('rating', axis=1)
    y = df['rating'].copy()

    mae_list_train = []
    rmse_list_train = []
    mae_list_val = []
    rmse_list_val = []

    ss = preprocessing.StandardScaler()
    mms = preprocessing.MinMaxScaler()
    rs = preprocessing.RobustScaler()
    le = preprocessing.LabelEncoder()

    # Iterate over each train-test split
    for train_indices, val_indices in kf.split(X, y):
        train_data = df.iloc[train_indices]
        validation_data = df.iloc[val_indices]

        # Step 2: data preparation
        cols_to_standard = ['minutes']
        cols_to_min_max = ['number_of_listed_ins', 'films_per_country']
        cols_to_robust = ['release_year', 'years_since_release', 'description_length',
                        'number_of_directors', 'number_of_casts', 'number_of_countrys',
                        'films_by_director', 'films_by_cast_member', 'films_per_genre']

        for col in cols_to_standard:
            train_data[col] = ss.fit_transform(train_data[[col]].values)
            validation_data[col] = ss.transform(
                validation_data[[col]].values
            )

        for col in cols_to_min_max:
            train_data[col] = mms.fit_transform(train_data[[col]].values)
            validation_data[col] = mms.transform(
                validation_data[[col]].values
            )

        for col in cols_to_robust:
            train_data[col] = rs.fit_transform(train_data[[col]].values)
            validation_data[col] = rs.transform(
                validation_data[[col]].values
            )

        # Label - Doesn't respect an order, works well with a lot of single values
        cols_to_label = ['title', 'cast', 'description', 'movie_stage']

        for col in cols_to_label:
            train_data[col] = le.fit_transform(train_data[col])
            # Check if there is something not mapped in validation data
            validation_data[col] = validation_data[col].map(
                lambda s: "unknown" if s not in le.classes_ else s
            )
            # Add the new class 'unknown' if it is not present
            if "unknown" not in list(le.classes_):
                le.classes_ = np.append(le.classes_, "unknown")
            else:
                # The class 'unknown' is already present, ensure that it occurs only once
                le.classes_ = np.unique(le.classes_)
            validation_data[col] = le.transform(validation_data[col])

        # Ordinal - preserve the order
        duration_dict = {'short' : 1, 'medium' : 2, 'long' : 3}
        train_data['duration_bins'] = train_data['duration_bins'].map(duration_dict)
        validation_data['duration_bins'] = validation_data['duration_bins'].map(duration_dict)

        # Frequency encoding - useful for high cardinality columns
        cols_to_frequency = ['director', 'country', 'listed_in']

        for col in cols_to_frequency:
            fe_encoder = train_data.groupby(col).size() / len(train_data)
            train_data.loc[:, col] = train_data[col].map(fe_encoder)
            train_data[col] = train_data[col].astype('float64')
            
            # For validation data, replace values with their respective frequencies.
            # If the value is not found in the mapping (from train_data), replace with the average frequency from train_data.
            validation_data.loc[:, col] = validation_data[col].map(fe_encoder).fillna(fe_encoder.mean())
            validation_data[col] = validation_data[col].astype('float64')

        train_data['rating'], _ = yeojohnson(train_data['rating'])

        # Step 3: Feature selection
        cols_feature_importance = ['minutes', 'release_year', 'listed_in', 'description_length', 
                                'films_by_cast_member', 'films_per_genre', 'description', 
                                'number_of_casts', 'cast']

        train_data[list(cols_feature_importance) + ['show_id', 'date_added', 'rating']]
        validation_data[list(cols_feature_importance) + ['show_id', 'date_added', 'rating']]

        # Step 4: Model
        cols_drop = ['show_id', 'date_added', 'rating']

        # training dataset
        y_train = train_data['rating']
        x_train = train_data.drop(cols_drop, axis=1)

        # validation dataset
        y_val = validation_data['rating']
        x_val = validation_data.drop(cols_drop, axis=1)

        model.fit(x_train, y_train)

        y_hat_train = model.predict(x_train)
        y_hat_val = model.predict(x_val)

        # performance
        lr_result_train = ml_error(model_name + ' Train', yeojohnson_inverse(y_train), 
                                yeojohnson_inverse(y_hat_train))
        lr_result_val = ml_error(model_name + ' Val', y_val, 
                                yeojohnson_inverse(y_hat_val))
        
        # Appending results
        mae_list_train.append(lr_result_train['MAE'])
        rmse_list_train.append(lr_result_train['RMSE'])
        mae_list_val.append(lr_result_val['MAE'])
        rmse_list_val.append(lr_result_val['RMSE'])

    results = [
        {
            'Model name': model_name + ' Train',
            'MAE CV': np.round(np.mean(mae_list_train), 2).astype(str) + ' +/- ' + np.round(np.std(mae_list_train), 2).astype(str),
            'RMSE CV': np.round(np.mean(rmse_list_train), 2).astype(str) + ' +/- ' + np.round(np.std(rmse_list_train), 2).astype(str)
        },
        {
            'Model name': model_name + ' Val',
            'MAE CV': np.round(np.mean(mae_list_val), 2).astype(str) + ' +/- ' + np.round(np.std(mae_list_val), 2).astype(str),
            'RMSE CV': np.round(np.mean(rmse_list_val), 2).astype(str) + ' +/- ' + np.round(np.std(rmse_list_val), 2).astype(str)
        }
    ]

    # Return the final dataframe
    return pd.DataFrame(results)


notebook_settings()

# 6.0 Machine Learning Modeling

Metrics:
- For general contexts the MAPE would be an excellent metric due to its percentage calculation, however the characteristics of the data do not allow a good interpretation (typically because it contains 0 data), in this situation we will use the MAE to report to the business and the RMSE to monitor the model in order to identify the impact of outliers.

In [4]:
cols_drop = ['show_id', 'date_added', 'rating']

# training dataset
y_train = train_feature_selection['rating']
x_train = train_feature_selection.drop(cols_drop, axis=1)

# validation dataset
y_val = validation_feature_selection['rating']
x_val = validation_feature_selection.drop(cols_drop, axis=1)

## 6.1 Average model - Baseline

In [5]:
aux1 = validation_feature_selection.copy()
aux1['rating'] = y_val.copy()

# prediction - We'll do a weighted average by media category
aux2 = aux1[['listed_in', 'rating']].groupby('listed_in').mean().reset_index().rename(columns={'rating': 'predictions'})
aux1 = pd.merge(aux1, aux2, how='left', on='listed_in')
yhat_baseline = aux1['predictions']

# performance
baseline_result = ml_error('Average Model', y_val, yhat_baseline)

## 6.2 Linear Regression Model

In [6]:
# model
lr = LinearRegression().fit(x_train, y_train)

# prediction
yhat_lr_train = lr.predict(x_train)
yhat_lr = lr.predict(x_val)

# performance
lr_result_train = ml_error('Linear Regression Train', yeojohnson_inverse(y_train), 
                           yeojohnson_inverse(yhat_lr_train))
lr_result_val = ml_error('Linear Regression Val', y_val, 
                         yeojohnson_inverse(yhat_lr))

### 6.2.1 Linear Regression Model  - Cross Validation

In [7]:
lr = LinearRegression()
lr_result_cv = cross_validation(exploratory_data_analysis, lr, 'Linear Regression')

## 6.3 Linear Regression Regularized Model

In [10]:
# model
lrr = Lasso(alpha=0.01).fit(x_train, y_train)

# prediction
yhat_lrr_train = lrr.predict(x_train)
yhat_lrr = lrr.predict(x_val)

# performance
lrr_result_train = ml_error('Linear Regression Regularized Train', yeojohnson_inverse(y_train), 
                           yeojohnson_inverse(yhat_lrr_train))
lrr_result_val = ml_error('Linear Regression Regularized Val', y_val, 
                         yeojohnson_inverse(yhat_lrr))

### 6.2.1 Linear Regression Model  - Cross Validation

In [11]:
lrr = Lasso(alpha=0.01)
lrr_result_cv = cross_validation(exploratory_data_analysis, lrr, 'Linear Regression Regularized')

## 6.3 Random Forest Regressor

In [11]:
# model
rf = ensemble.RandomForestRegressor(n_jobs=-1, random_state=42).fit(x_train, y_train)

# prediction
yhat_rf_train = rf.predict(x_train)
yhat_rf = rf.predict(x_val)

# performance
rf_result_train = ml_error('Random Forest Regressor Train', yeojohnson_inverse(y_train), yeojohnson_inverse(yhat_rf_train))
rf_result_val = ml_error('Random Forest Regressor Val', y_val, yeojohnson_inverse(yhat_rf))

### 6.3.1 Random Forest Model - Cross Validation

In [12]:
rf = ensemble.RandomForestRegressor(n_jobs=-1, random_state=42)
rf_result_cv = cross_validation(exploratory_data_analysis, rf, 'Random Forest')

## 6.4 XGBoost Model

In [13]:
# model
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=42).fit(x_train, y_train)

# prediction
yhat_xgb_train = model_xgb.predict(x_train)
yhat_xgb_val = model_xgb.predict(x_val)

# performance
xgb_result_train = ml_error('XGBoost Regressor Train', yeojohnson_inverse(y_train), yeojohnson_inverse(yhat_xgb_train))
xgb_result_val = ml_error('XGBoost Regressor Val', y_val, yeojohnson_inverse(yhat_xgb_val))

### 6.4.1 XGBoost Model - Cross Validation

In [14]:
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=42)
xgb_result_cv = cross_validation(exploratory_data_analysis, model_xgb, 'XGBoost')

## 6.5 Compare Model's Performance

### 6.5.1 Single Performance

In [15]:
result = pd.concat([baseline_result, lr_result_train, lr_result_val, lrr_result_train, 
                    lrr_result_val, rf_result_train, rf_result_val, 
                    xgb_result_train, xgb_result_val])
result

Unnamed: 0,Model Name,MAE,RMSE
0,Average Model,11.322,16.58
0,Linear Regression Train,10.691,16.4
0,Linear Regression Val,10.672,16.985
0,Linear Regression Regularized Train,10.692,16.4
0,Linear Regression Regularized Val,10.673,16.986
0,Random Forest Regressor Train,4.235,7.689
0,Random Forest Regressor Val,10.83,16.277
0,XGBoost Regressor Train,3.583,6.412
0,XGBoost Regressor Val,12.005,17.879


In principle, our models are similar, with an emphasis on linear regression and random forest, but we will not be making any decisions at single performance metrics as they may contain randomization bias, we will evaluate through a technique that addresses this, cross-validation.

### 6.5.2 Real Performance - Cross Validation

In [16]:
result_cv = pd.concat([lr_result_cv, lrr_result_cv, rf_result_cv, xgb_result_cv])
result_cv

Unnamed: 0,Model name,MAE CV,RMSE CV
0,Linear Regression Train,10.33 +/- 0.33,15.91 +/- 0.41
1,Linear Regression Val,10.97 +/- 0.4,16.57 +/- 1.39
0,Linear Regression Regularized Train,10.33 +/- 0.33,15.92 +/- 0.41
1,Linear Regression Regularized Val,10.97 +/- 0.36,16.51 +/- 1.31
0,Random Forest Train,4.05 +/- 0.14,7.4 +/- 0.26
1,Random Forest Val,12.54 +/- 0.57,17.19 +/- 0.48
0,XGBoost Train,2.95 +/- 0.11,5.41 +/- 0.15
1,XGBoost Val,12.98 +/- 0.75,18.25 +/- 0.85


- We can draw the following conclusions:
1) We managed to beat our baseline, which is a weighted average
2) The best models are linear, which demonstrates the linearity of the data.
3) There is a technical tie and so we will choose linear regression to preserve the idea of simplicity. This also means that the model doesn't need Fine Tunning, so we'll skip this step.

For future cycles we can test other models that explore the data space in different ways and even use an ensemble of models.