# 0.0 Imports

In [1]:
import warnings
import numpy as np
import pandas as pd

from sklearn import ensemble
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
import xgboost as xgb

## 0.1 Load data

In [2]:
train_feature_selection = catalog.load("train_feature_selection")
validation_feature_selection = catalog.load("validation_feature_selection")
yeojohnson_transformation = catalog.load("yeojohnson_transformation")

## 0.2 Helper Functions

In [3]:
def notebook_settings():
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 30)
    pd.set_option('display.float_format', lambda x: '%.3f' % x)

    warnings.filterwarnings('ignore')
    return None


def ml_error(model_name, y, yhat):
    mae = mean_absolute_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    
    return pd.DataFrame({'Model Name': model_name,
                         'MAE' : mae,
                         'RMSE': rmse}, index=[0])


def yeojohnson_inverse(y):
    lambda_val = yeojohnson_transformation.value
    
    y_positive_mask = y >= 0
    y_negative_mask = ~y_positive_mask

    result = np.empty_like(y)

    if lambda_val == 0:
        result[y_positive_mask] = np.exp(y[y_positive_mask]) - 1
        result[y_negative_mask] = -np.exp(-y[y_negative_mask]) - 1
    elif lambda_val != 0:
        result[y_positive_mask] = (lambda_val * y[y_positive_mask] + 1) ** (1 / lambda_val) - 1
        result[y_negative_mask] = -((-lambda_val * y[y_negative_mask] + 1) ** (1 / -lambda_val) - 1)

    return result


notebook_settings()

# 6.0 Machine Learning Modeling

Metrics:
- For general contexts the MAPE would be an excellent metric due to its percentage calculation, however the characteristics of the data do not allow a good interpretation (typically because it contains 0 data), in this situation we will use the MAE to report to the business and the RMSE to monitor the model in order to identify the impact of outliers.

In [4]:
cols_drop = ['show_id', 'rating']

# training dataset
y_train = train_feature_selection['rating']
x_train = train_feature_selection.drop(cols_drop, axis=1)

# validation dataset
y_val = validation_feature_selection['rating']
x_val = validation_feature_selection.drop(cols_drop, axis=1)

## 6.1 Average model - Baseline

In [5]:
aux1 = validation_feature_selection.copy()
aux1['rating'] = y_val.copy()

# prediction - We'll do a weighted average by media category
aux2 = aux1[['listed_in', 'rating']].groupby('listed_in').mean().reset_index().rename(columns={'rating': 'predictions'})
aux1 = pd.merge(aux1, aux2, how='left', on='listed_in')
yhat_baseline = aux1['predictions']

# performance
baseline_result = ml_error('Average Model', y_val, yhat_baseline)

### 6.1.1 Average model  - Cross Validation

In [6]:
#TODO

## 6.2 Linear Regression Model

In [7]:
# model
lr = LinearRegression().fit(x_train, y_train)

# prediction
yhat_lr_train = lr.predict(x_train)
yhat_lr = lr.predict(x_val)

# performance
lr_result_train = ml_error('Linear Regression Train', yeojohnson_inverse(y_train), 
                           yeojohnson_inverse(yhat_lr_train))
lr_result_val = ml_error('Linear Regression Val', y_val, 
                         yeojohnson_inverse(yhat_lr))

### 6.2.1 Linear Regression Model  - Cross Validation

In [8]:
#TODO

## 6.3 Random Forest Regressor

In [9]:
# model
rf = ensemble.RandomForestRegressor(n_jobs=-1, random_state=42).fit(x_train, y_train)

# prediction
yhat_rf_train = rf.predict(x_train)
yhat_rf = rf.predict(x_val)

# performance
rf_result_train = ml_error('Random Forest Regressor Train', yeojohnson_inverse(y_train), yeojohnson_inverse(yhat_rf_train))
rf_result_val = ml_error('Random Forest Regressor Val', y_val, yeojohnson_inverse(yhat_rf))

### 6.3.1 Random Forest Model - Cross Validation

In [10]:
#TODO

## 6.4 XGBoost Model

In [11]:
# model
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=42).fit(x_train, y_train)

# prediction
yhat_xgb_train = model_xgb.predict(x_train)
yhat_xgb_val = model_xgb.predict(x_val)

# performance
xgb_result_train = ml_error('XGBoost Regressor Train', yeojohnson_inverse(y_train), yeojohnson_inverse(yhat_xgb_train))
xgb_result_val = ml_error('XGBoost Regressor Val', y_val, yeojohnson_inverse(yhat_xgb_val))

### 6.4.1 XGBoost Model - Cross Validation

In [12]:
# TODO

## 6.5 Compare Model's Performance

### 6.5.1 Single Performance

In [13]:
result = pd.concat([baseline_result, lr_result_train, lr_result_val, rf_result_train, rf_result_val, 
                    xgb_result_train, xgb_result_val])
result

Unnamed: 0,Model Name,MAE,RMSE
0,Average Model,9.952,14.929
0,Linear Regression Train,11.481,17.4
0,Linear Regression Val,11.156,17.141
0,Random Forest Regressor Train,4.495,8.007
0,Random Forest Regressor Val,15.271,19.274
0,XGBoost Regressor Train,4.814,8.366
0,XGBoost Regressor Val,13.914,19.098


### 6.5.2 Real Performance - Cross Validation

In [14]:
# TODO