In [1]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.neural_network import MLPRegressor

import torch

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time
from pathlib import Path


In [2]:
X1_pca = torch.load('X1_pca')
X1_tsne = torch.load('X1_tsne')
X1_ica = torch.load('X1_ica')
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue '])
X1_pca.head()

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year,Action,Animation,Crime,Drama,Family,...,emb_189,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198
0,0.606742,0.8092,1.111354,0.39846,0.933839,0.0,0.0,0.0,1.0,0.0,...,-0.020153,-0.015117,0.237029,-0.02805,-0.023873,0.079219,-0.031364,-0.071154,-0.091889,-0.235259
1,0.764045,-0.271776,-0.073389,0.079814,1.366959,1.0,0.0,1.0,1.0,0.0,...,-0.194736,0.081447,0.099483,-0.115929,0.10783,0.129517,-0.144803,-0.000834,-0.156755,0.159004
2,0.539326,-0.256258,-1.596629,1.099479,-2.531122,0.0,0.0,0.0,1.0,1.0,...,0.092746,0.02676,0.241836,0.224205,0.007772,0.136998,0.067925,-0.050262,0.116298,0.057964
3,0.617978,-0.215474,-0.242638,0.39846,-0.798642,0.0,0.0,0.0,1.0,0.0,...,0.22216,-0.061716,0.269019,-0.029332,0.159303,0.049171,0.240762,-0.297771,-0.147129,0.116596
4,0.337079,-0.265518,-1.258132,0.494053,-2.098002,0.0,0.0,1.0,1.0,0.0,...,-0.053639,-0.288431,0.096173,-0.151598,0.179133,0.025855,0.22104,0.155626,-0.151701,-0.016146


In [3]:
X1_tsne.head()

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year,Action,Animation,Crime,Drama,Family,...,Mystery,Romance,Short,Thriller,War,Western,studio_freq,emb_0,emb_1,emb_2
0,0.606742,0.8092,1.111354,0.39846,0.933839,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001695,1.008297,27.463465,-26.949211
1,0.764045,-0.271776,-0.073389,0.079814,1.366959,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000565,34.157619,-4.052936,-32.597328
2,0.539326,-0.256258,-1.596629,1.099479,-2.531122,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025141,-1.671483,-5.202857,11.800252
3,0.617978,-0.215474,-0.242638,0.39846,-0.798642,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,-20.894047,-3.049792,7.202212
4,0.337079,-0.265518,-1.258132,0.494053,-2.098002,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025141,3.110168,-13.494096,-25.801357


In [4]:
X_train_pca, X_train_tsne, X_train_ica, y_train = X1_pca.to_numpy(), X1_tsne.to_numpy(), X1_ica.to_numpy(), Y1.to_numpy().ravel()

## 5. Models

### 5.1 Linear regression

In [10]:
LR_model = LinearRegression()

In [28]:
# np.random.seed(42)
scores = cross_val_score(LR_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
print("R^2 of the Linear Regression based on the PCA embeddings: {:.6} %".format(scores.mean()*100))
scores = cross_val_score(LR_model, X_train_pca, np.log(1 + y_train), scoring='neg_root_mean_squared_error', cv=10)
print("RMSE of the Linear Regression based on the PCA embeddings: {:.4}".format(scores.mean()))

R^2 of the Linear Regression based on the PCA embeddings: 48.4111 %
RMSE of the Linear Regression based on the PCA embeddings: -2.475


In [29]:
# np.random.seed(42)
scores = cross_val_score(LR_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
print("R^2 of the Linear Regression based on the tSNE embeddings: {:.6} %".format(scores.mean()*100))
scores = cross_val_score(LR_model, X_train_tsne, np.log(1 + y_train), scoring='neg_root_mean_squared_error', cv=10)
print("RMSE of the Linear Regression based on the tSNE embeddings: {:.4}".format(scores.mean()))

R^2 of the Linear Regression based on the tSNE embeddings: 46.7319 %
RMSE of the Linear Regression based on the tSNE embeddings: -2.514


In [30]:
scores = cross_val_score(LR_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
print("R^2 of the Linear Regression based on the ICA embeddings: {:.6} %".format(scores.mean()*100))
scores = cross_val_score(LR_model, X_train_ica, np.log(1 + y_train), scoring='neg_root_mean_squared_error', cv=10)
print("RMSE of the Linear Regression based on the ICA embeddings: {:.4} ".format(scores.mean()))

R^2 of the Linear Regression based on the ICA embeddings: 48.4111 %
RMSE of the Linear Regression based on the ICA embeddings: -2.475 


### 5.2 KNN regressor

In [38]:
KNN_model = KNeighborsRegressor(n_neighbors=5)

In [39]:
scores = cross_val_score(KNN_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.2634309997284042

In [40]:
# np.random.seed(42)
scores = cross_val_score(KNN_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.09042428732665879

In [41]:
scores = cross_val_score(KNN_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.44347836955066217

### 5.3 MLP regressor

In [None]:
mlp = MLPRegressor(random_state=42, max_iter=1000, verbose=0)
hidden_layer_lists = [
    (200,),
    (200, 175,),
    (200, 175, 150,),
    (200, 175, 150, 125,),
    (200, 175, 150, 125, 100,),
    (200, 175, 150, 125, 100, 75,),
    (200, 175, 150, 125, 100, 75, 50,),
    (200, 175, 150, 125, 100, 75, 50, 25,),
    (200, 175, 150, 125, 100, 75, 50, 25, 10)
]
mlp_parameter_grid = {
    'hidden_layer_sizes': hidden_layer_lists,
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'batch_size': [177],
    'learning_rate': ['constant','adaptive'],
}

In [None]:
mlp_grid_pca = GridSearchCV(mlp, mlp_parameter_grid, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True,verbose=2)
mlp_grid_ica = GridSearchCV(mlp, mlp_parameter_grid, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True,verbose=2)
mlp_grid_tsne = GridSearchCV(mlp, mlp_parameter_grid, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True,verbose=2)

In [None]:
## Find the best parameters for mlp regressor using three different features
# mlp_grid_pca.fit(X_train_pca, np.log(1 + y_train))
# mlp_grid_ica.fit(X_train_ica, np.log(1 + y_train))
# mlp_grid_tsne.fit(X_train_tsne, np.log(1 + y_train))

## Save the grid search results
# torch.save(mlp_grid_pca, "../models/mlp_grid_pca")
# torch.save(mlp_grid_ica, "../models/mlp_grid_ica")
# torch.save(mlp_grid_tsne, "../models/mlp_grid_tsne")

In [None]:
## Load the model selection results
mlp_grid_pca = torch.load("../models/mlp_grid_pca")
mlp_grid_ica = torch.load("../models/mlp_grid_ica")
mlp_grid_tsne = torch.load("../models/mlp_grid_tsne")

In [1]:
print("[mlp_grid_pca] The best parameters are:", mlp_grid_ica.best_params_)
print("[mlp_grid_pca] The best RMSE is:", mlp_grid_ica.best_score_)
print()
print("[mlp_grid_ica] The best parameters are:", mlp_grid_ica.best_params_)
print("[mlp_grid_ica] The best RMSE is:", mlp_grid_ica.best_score_)
print()
print("[mlp_grid_tsne] The best parameters are:", mlp_grid_ica.best_params_)
print("[mlp_grid_tsne] The best RMSE is:", mlp_grid_ica.best_score_)

NameError: name 'mlp_grid_ica' is not defined

### 5.4 Random Forest Regressor

In [5]:
RFR_model = RandomForestRegressor(n_jobs=-1)

In [6]:
scores = cross_val_score(RFR_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.701956181677617

In [7]:
scores = cross_val_score(RFR_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.7291422912052102

In [8]:
scores = cross_val_score(RFR_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.7037980851176935