In [1]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

import torch

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time
from pathlib import Path

In [19]:
X1_ready = torch.load('X1_ready_for_model')
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue '])
X1_ready.head()

Unnamed: 0,ratings,n_votes,production_year,runtime,release_year,Action,Animation,Crime,Drama,Family,...,emb_189,emb_190,emb_191,emb_192,emb_193,emb_194,emb_195,emb_196,emb_197,emb_198
0,0.606742,0.8092,1.111354,0.39846,0.933839,0.0,0.0,0.0,1.0,0.0,...,-0.020153,-0.015117,0.237029,-0.02805,-0.023873,0.079219,-0.031364,-0.071154,-0.091889,-0.235259
1,0.764045,-0.271776,-0.073389,0.079814,1.366959,1.0,0.0,1.0,1.0,0.0,...,-0.194736,0.081447,0.099483,-0.115929,0.10783,0.129517,-0.144803,-0.000834,-0.156755,0.159004
2,0.539326,-0.256258,-1.596629,1.099479,-2.531122,0.0,0.0,0.0,1.0,1.0,...,0.092746,0.02676,0.241836,0.224205,0.007772,0.136998,0.067925,-0.050262,0.116298,0.057964
3,0.617978,-0.215474,-0.242638,0.39846,-0.798642,0.0,0.0,0.0,1.0,0.0,...,0.22216,-0.061716,0.269019,-0.029332,0.159303,0.049171,0.240762,-0.297771,-0.147129,0.116596
4,0.337079,-0.265518,-1.258132,0.494053,-2.098002,0.0,0.0,1.0,1.0,0.0,...,-0.053639,-0.288431,0.096173,-0.151598,0.179133,0.025855,0.22104,0.155626,-0.151701,-0.016146


In [20]:
X_train, y_train = X1_ready.to_numpy(), Y1.to_numpy().ravel()

## 5. Models

### 5.1 Linear regression

In [21]:
LR_model = LinearRegression().fit(X_train, y_train)

In [23]:
LR_model.coef_

array([ 3.91796732e+07,  2.86716314e+07, -1.03671547e+06,  3.37001652e+06,
       -8.13546090e+06,  2.31374011e+07,  7.72336090e+06, -9.27714523e+06,
       -4.27171746e+06,  1.18321188e+07,  9.70757095e+06,  2.59074368e+06,
       -2.11575857e+06, -2.74855718e+06,  2.14840743e+06,  5.28482315e+07,
       -3.98231722e+06, -1.87282586e+06, -1.23125694e+06,  1.02758311e+09,
       -5.00127460e+05, -8.56010694e+05,  2.04044113e+04,  1.68138421e+06,
       -1.80015036e+06, -9.80176343e+05,  1.30901008e+06,  3.30675224e+06,
        1.24559574e+06, -3.35816962e+06,  8.50655814e+05,  1.00104568e+06,
        3.09361471e+06,  1.05416105e+06,  5.90275461e+06, -3.17572343e+06,
        1.60649340e+06, -1.45119924e+06,  1.22458586e+06,  5.22678015e+06,
        4.72006545e+06, -1.82312758e+06,  3.43960096e+06,  2.34060334e+06,
       -3.66932373e+06, -5.88853617e+06, -1.24679174e+06,  3.28693740e+06,
       -5.03087547e+06,  1.85766010e+06, -8.53125781e+05,  9.85930017e+05,
       -3.49554248e+06,  

In [38]:
# np.random.seed(42)
scores = cross_val_score(LR_model, X_train, np.log(1 + y_train), scoring='r2', cv=10)
scores

array([0.48648144, 0.52817494, 0.5093083 , 0.44898358, 0.49262462,
       0.4385946 , 0.49478958, 0.47264384, 0.45113356, 0.51837985])