In [9]:
import json
import warnings
import pandas as pd
from helpers import Preprocessor, ModelWorker
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore')

# I. Acquire Training Data

In [2]:
train_df = pd.read_csv("output/Processed_Train.csv")
test_df = pd.read_csv("output/Processed_Test.csv")
print("Dimension of training data: ", train_df.shape)
print("Dimension of test data: ", test_df.shape)

train_df.head()

Dimension of training data:  (78458, 17)
Dimension of test data:  (19615, 16)


Unnamed: 0,Unique_ID,BC_Views,BC_Comments,BC_Likes,BC_Popularity,BC_Followers,Categorical_Genre_0,Categorical_Genre_1,Categorical_Genre_2,Categorical_Genre_3,Categorical_Year_0,Categorical_Year_1,Categorical_Year_2,Categorical_Month_0,Categorical_Month_1,Categorical_Month_2,Categorical_Month_3
0,413890,12.526992,1.503982,7.03809,4.683402,20.989877,0,1,0,0,0,0,1,0,1,0,0
1,249453,9.355383,2.562027,4.228329,2.929286,11.139083,0,1,0,0,0,1,0,0,0,1,0
2,681116,11.908196,2.239518,6.445757,4.617511,16.425116,0,1,0,0,1,0,0,0,1,0,0
3,387253,16.247662,1.048771,9.103484,4.432092,24.679608,1,0,0,0,0,1,0,0,0,1,0
4,1428029,16.196768,3.674061,9.421646,6.753267,22.549205,0,1,0,0,0,1,0,0,0,0,1


# II. Fit Machine Learning Models

In [3]:
with open('output/boxcox_lambdas.json', 'r') as f:
    boxcox_lambdas = json.load(f)
print(boxcox_lambdas)

# scores = ['neg_root_mean_squared_error']
scores = ['neg_root_mean_squared_error']
features = list(test_df.columns)
features.remove('Unique_ID')
print(features)

{'Views': 0.054519416937160475, 'Comments': -0.08518700650406102, 'Likes': 0.03924831111411457, 'Popularity': 0.009233293336317148, 'Followers': 0.09194242326253005}
['BC_Comments', 'BC_Likes', 'BC_Popularity', 'BC_Followers', 'Categorical_Genre_0', 'Categorical_Genre_1', 'Categorical_Genre_2', 'Categorical_Genre_3', 'Categorical_Year_0', 'Categorical_Year_1', 'Categorical_Year_2', 'Categorical_Month_0', 'Categorical_Month_1', 'Categorical_Month_2', 'Categorical_Month_3']


## 2.1 Linear Regression

In [4]:
linear_reg_worker = ModelWorker(LinearRegression(), scores, boxcox_lambdas)
linear_reg_worker.fit(train_df[features], train_df['BC_Views'], features)

RMSE:  1.364793


## 2.2 Lasso

In [5]:
lasso_worker = ModelWorker(Lasso(), scores, boxcox_lambdas)
lasso_worker.fit(train_df[features], train_df['BC_Views'], features)

RMSE:  1.495735


In [8]:
# lasso_params = {'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1]}
lasso_params = {'alpha': [0, 0.05, 0.1, 0.15, 0.2]}
lasso_grid, best_score = lasso_worker.tune(lasso_params, train_df[features], train_df['BC_Views'], 5)

# Tuning hyper-parameters for neg_root_mean_squared_error

Best parameters set found on development set:

{'alpha': 0}

Grid scores on development set:

-1.363 (+/-0.121) for {'alpha': 0}
-1.386 (+/-0.110) for {'alpha': 0.05}
-1.427 (+/-0.093) for {'alpha': 0.1}
-1.458 (+/-0.087) for {'alpha': 0.15}
-1.464 (+/-0.087) for {'alpha': 0.2}
Elapsed time: 7.6929 seconds



## 2.3 Random Forest

In [10]:
rf_worker = ModelWorker(RandomForestRegressor(), scores, boxcox_lambdas)
rf_worker.fit(train_df[features], train_df['BC_Views'], features)

RMSE:  0.726572


# III. Predict on Test Data

In [11]:
result = rf_worker.predict(train_df, train_df['BC_Views'], test_df, features)
result['Views'] = result['Views'].round().astype(int)
result.head()

Fitted model:  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


Unnamed: 0,Unique_ID,BC_Comments,BC_Likes,BC_Popularity,BC_Followers,Categorical_Genre_0,Categorical_Genre_1,Categorical_Genre_2,Categorical_Genre_3,Categorical_Year_0,Categorical_Year_1,Categorical_Year_2,Categorical_Month_0,Categorical_Month_1,Categorical_Month_2,Categorical_Month_3,BC_Views,Views
0,562546,2.873587,10.228781,4.886136,11.510119,1,0,0,0,0,0,1,1,0,0,0,18.139937,300261
1,907584,2.604197,7.540543,5.408842,21.77202,0,1,0,0,1,0,0,0,1,0,0,13.977131,32536
2,213013,2.604197,6.712778,4.672703,17.137847,0,1,0,0,1,0,0,0,0,0,1,12.583654,14497
3,340312,0.67308,6.719291,3.037549,27.072515,1,0,0,0,1,0,0,0,0,1,0,13.828404,29896
4,41854,0.0,4.781418,0.69537,1.156013,0,0,0,1,0,0,1,0,0,1,0,10.700846,4580


In [12]:
result[['Unique_ID', 'Views']].to_excel('output/submission_20200112_1.xlsx', index=False)

In [13]:
import sklearn

sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']