In [1]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.neural_network import MLPRegressor

import torch

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time
from pathlib import Path


In [2]:
X1_pca = torch.load('X1_pca')
X1_tsne = torch.load('X1_tsne')
X1_ica = torch.load('X1_ica')
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue '])
X1_pca.head()

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year,Action,Animation,Crime,Drama,Family,...,emb_189,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198
0,0.606742,0.8092,1.111354,0.39846,0.933839,0.0,0.0,0.0,1.0,0.0,...,-0.020153,-0.015117,0.237029,-0.02805,-0.023873,0.079219,-0.031364,-0.071154,-0.091889,-0.235259
1,0.764045,-0.271776,-0.073389,0.079814,1.366959,1.0,0.0,1.0,1.0,0.0,...,-0.194736,0.081447,0.099483,-0.115929,0.10783,0.129517,-0.144803,-0.000834,-0.156755,0.159004
2,0.539326,-0.256258,-1.596629,1.099479,-2.531122,0.0,0.0,0.0,1.0,1.0,...,0.092746,0.02676,0.241836,0.224205,0.007772,0.136998,0.067925,-0.050262,0.116298,0.057964
3,0.617978,-0.215474,-0.242638,0.39846,-0.798642,0.0,0.0,0.0,1.0,0.0,...,0.22216,-0.061716,0.269019,-0.029332,0.159303,0.049171,0.240762,-0.297771,-0.147129,0.116596
4,0.337079,-0.265518,-1.258132,0.494053,-2.098002,0.0,0.0,1.0,1.0,0.0,...,-0.053639,-0.288431,0.096173,-0.151598,0.179133,0.025855,0.22104,0.155626,-0.151701,-0.016146


In [3]:
X1_tsne.head()

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year,Action,Animation,Crime,Drama,Family,...,Mystery,Romance,Short,Thriller,War,Western,studio_freq,emb_0,emb_1,emb_2
0,0.606742,0.8092,1.111354,0.39846,0.933839,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001695,1.008297,27.463465,-26.949211
1,0.764045,-0.271776,-0.073389,0.079814,1.366959,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000565,34.157619,-4.052936,-32.597328
2,0.539326,-0.256258,-1.596629,1.099479,-2.531122,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025141,-1.671483,-5.202857,11.800252
3,0.617978,-0.215474,-0.242638,0.39846,-0.798642,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,-20.894047,-3.049792,7.202212
4,0.337079,-0.265518,-1.258132,0.494053,-2.098002,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025141,3.110168,-13.494096,-25.801357


In [4]:
X_train_pca, X_train_tsne, X_train_ica, y_train = X1_pca.to_numpy(), X1_tsne.to_numpy(), X1_ica.to_numpy(), Y1.to_numpy().ravel()

## 5. Models

### 5.1 Linear regression

In [10]:
LR_model = LinearRegression()

In [28]:
# np.random.seed(42)
scores = cross_val_score(LR_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
print("R^2 of the Linear Regression based on the PCA embeddings: {:.6} %".format(scores.mean()*100))
scores = cross_val_score(LR_model, X_train_pca, np.log(1 + y_train), scoring='neg_root_mean_squared_error', cv=10)
print("RMSE of the Linear Regression based on the PCA embeddings: {:.4}".format(scores.mean()))

R^2 of the Linear Regression based on the PCA embeddings: 48.4111 %
RMSE of the Linear Regression based on the PCA embeddings: -2.475


In [29]:
# np.random.seed(42)
scores = cross_val_score(LR_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
print("R^2 of the Linear Regression based on the tSNE embeddings: {:.6} %".format(scores.mean()*100))
scores = cross_val_score(LR_model, X_train_tsne, np.log(1 + y_train), scoring='neg_root_mean_squared_error', cv=10)
print("RMSE of the Linear Regression based on the tSNE embeddings: {:.4}".format(scores.mean()))

R^2 of the Linear Regression based on the tSNE embeddings: 46.7319 %
RMSE of the Linear Regression based on the tSNE embeddings: -2.514


In [30]:
scores = cross_val_score(LR_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
print("R^2 of the Linear Regression based on the ICA embeddings: {:.6} %".format(scores.mean()*100))
scores = cross_val_score(LR_model, X_train_ica, np.log(1 + y_train), scoring='neg_root_mean_squared_error', cv=10)
print("RMSE of the Linear Regression based on the ICA embeddings: {:.4} ".format(scores.mean()))

R^2 of the Linear Regression based on the ICA embeddings: 48.4111 %
RMSE of the Linear Regression based on the ICA embeddings: -2.475 


### 5.2 KNN regressor

In [38]:
KNN_model = KNeighborsRegressor(n_neighbors=5)

In [39]:
scores = cross_val_score(KNN_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.2634309997284042

In [40]:
# np.random.seed(42)
scores = cross_val_score(KNN_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.09042428732665879

In [41]:
scores = cross_val_score(KNN_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.44347836955066217

### 5.3 MLP regressor

In [51]:
MLP_model = MLPRegressor(hidden_layer_sizes=(1000, 500, 100, 50),
                         max_iter = 500, activation = 'relu',
                         learning_rate='constant', learning_rate_init=0.001, random_state=42, verbose=2)

In [52]:
scores = cross_val_score(MLP_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

Iteration 1, loss = 36.95693924
Iteration 2, loss = 6.36688252
Iteration 3, loss = 4.03406057
Iteration 4, loss = 3.26344611
Iteration 5, loss = 2.76465178
Iteration 6, loss = 2.44208194
Iteration 7, loss = 2.16474717
Iteration 8, loss = 1.94335152
Iteration 9, loss = 1.72330647
Iteration 10, loss = 1.51754187
Iteration 11, loss = 1.33099203
Iteration 12, loss = 1.13690471
Iteration 13, loss = 0.96343873
Iteration 14, loss = 0.78672599
Iteration 15, loss = 0.62291965
Iteration 16, loss = 0.47338870
Iteration 17, loss = 0.34972080
Iteration 18, loss = 0.25160270
Iteration 19, loss = 0.18772211
Iteration 20, loss = 0.13250593
Iteration 21, loss = 0.09431538
Iteration 22, loss = 0.07553445
Iteration 23, loss = 0.06035883
Iteration 24, loss = 0.05113180
Iteration 25, loss = 0.03963203
Iteration 26, loss = 0.03093835
Iteration 27, loss = 0.02924439
Iteration 28, loss = 0.02894786
Iteration 29, loss = 0.02377310
Iteration 30, loss = 0.02107017
Iteration 31, loss = 0.01857009
Iteration 32, lo

0.38995859253524556

In [53]:
scores = cross_val_score(MLP_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

Iteration 1, loss = 25.99586203
Iteration 2, loss = 15.00317863
Iteration 3, loss = 13.05305599
Iteration 4, loss = 11.49492224
Iteration 5, loss = 10.09445265
Iteration 6, loss = 8.78628079
Iteration 7, loss = 7.45274258
Iteration 8, loss = 6.50768487
Iteration 9, loss = 5.64853689
Iteration 10, loss = 5.15634433
Iteration 11, loss = 4.69205623
Iteration 12, loss = 4.22036156
Iteration 13, loss = 3.93889647
Iteration 14, loss = 3.75991282
Iteration 15, loss = 3.87188854
Iteration 16, loss = 3.40348841
Iteration 17, loss = 3.33145918
Iteration 18, loss = 3.20530702
Iteration 19, loss = 3.24766458
Iteration 20, loss = 3.27457144
Iteration 21, loss = 3.09637055
Iteration 22, loss = 2.95440829
Iteration 23, loss = 3.28190708
Iteration 24, loss = 2.96884913
Iteration 25, loss = 2.88866545
Iteration 26, loss = 2.82081028
Iteration 27, loss = 2.78579233
Iteration 28, loss = 2.87726547
Iteration 29, loss = 2.97705752
Iteration 30, loss = 2.78311250
Iteration 31, loss = 2.69097253
Iteration 32

0.5324820262425003

In [54]:
scores = cross_val_score(MLP_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

Iteration 1, loss = 54.01237199
Iteration 2, loss = 15.25618722
Iteration 3, loss = 6.94990339
Iteration 4, loss = 5.43376774
Iteration 5, loss = 4.30958484
Iteration 6, loss = 3.76846885
Iteration 7, loss = 3.39227551
Iteration 8, loss = 3.10585076
Iteration 9, loss = 2.89208684
Iteration 10, loss = 2.72889639
Iteration 11, loss = 2.56161795
Iteration 12, loss = 2.40269887
Iteration 13, loss = 2.28792865
Iteration 14, loss = 2.18403278
Iteration 15, loss = 2.04992104
Iteration 16, loss = 1.95218342
Iteration 17, loss = 1.85504620
Iteration 18, loss = 1.74110051
Iteration 19, loss = 1.64309967
Iteration 20, loss = 1.56446562
Iteration 21, loss = 1.44458933
Iteration 22, loss = 1.35935979
Iteration 23, loss = 1.29193164
Iteration 24, loss = 1.16706629
Iteration 25, loss = 1.10281916
Iteration 26, loss = 1.00738947
Iteration 27, loss = 0.90917337
Iteration 28, loss = 0.85539478
Iteration 29, loss = 0.78334872
Iteration 30, loss = 0.71308040
Iteration 31, loss = 0.64083704
Iteration 32, l

0.57037215181562

### 5.4 Random Forest Regressor

In [5]:
RFR_model = RandomForestRegressor(n_jobs=-1)

In [6]:
scores = cross_val_score(RFR_model, X_train_pca, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.701956181677617

In [7]:
scores = cross_val_score(RFR_model, X_train_tsne, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.7291422912052102

In [8]:
scores = cross_val_score(RFR_model, X_train_ica, np.log(1 + y_train), scoring='r2', cv=10)
scores.mean()

0.7037980851176935