In [2]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

import torch
# from keras.models import Sequential
# from keras.layers import Activation, Dense
# from keras import optimizers
# from keras.layers import BatchNormalization

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time
from pathlib import Path

## 5. Models

In [3]:
X1_pca = torch.load('X1_pca').to_numpy()
X1_ica = torch.load('X1_ica').to_numpy()
X1_tsne = torch.load('X1_tsne').to_numpy()
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue ']).to_numpy().ravel()
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X1_pca, Y1, random_state=42, test_size=0.1)
X_train_ica, X_test_ica, _, _ = train_test_split(X1_ica, Y1, random_state=42, test_size=0.1)
X_train_tsne, X_test_tsne, _, _ = train_test_split(X1_tsne, Y1, random_state=42, test_size=0.1)

### 5.3 MLP

In [4]:
mlp_pca = MLPRegressor(
    hidden_layer_sizes=(200,),
    learning_rate='constant',
    solver='adam',
    activation='tanh',
    alpha=0.0001,
    batch_size=177,
    random_state=42, max_iter=1000, verbose=0
)

In [9]:
mlp_pca.fit(X_train_pca, np.log(1 + y_train))
y_pred_mlp_pca = mlp_pca.predict(X_test_pca)
np.sqrt(mean_squared_error(y_test, y_pred_mlp_pca)) / 1000000

106166387.97631922

In [10]:
np.sqrt(mean_squared_error(y_test, y_pred_mlp_pca)) / 1000000

106.16638797631923

In [11]:
y_pred_mlp_pca

array([14.91392403,  9.94466574, 12.23200243,  9.54432502, 20.08570449,
       16.4579653 , 12.55734542, 14.574897  , 13.69900683, 13.36786788,
        8.05379825, 12.15328267, 11.97458867, 13.72653915, 19.52500995,
       13.4031393 , 16.31331209, 10.13958826, 13.15031006, 14.75320118,
       14.36289887,  7.90752594, 11.60838722, 14.80380389,  8.88485612,
        9.4586587 ,  8.5550648 , 14.6955398 , 18.816076  , 11.79554518,
       12.78723447, 17.22797037, 15.93752824, 13.74643888,  8.68977534,
       14.67886839, 19.02382745, 15.59249882, 16.55198908, 10.57305712,
       10.06769406, 11.00349079, 16.08123733, 11.35744803, 15.97923442,
        8.96872276, 12.85142091,  9.84687765, 18.17262348, 19.06807919,
       14.0550641 , 11.55196052, 10.06692335, 12.43944572, 12.08596239,
       10.166642  , 15.18635738, 11.82603874, 21.01921304, 20.17560316,
        9.75621586, 11.12564806, 11.89408707, 11.87958991, 19.89459772,
        8.74841509, 11.81569479,  9.67147265, 12.30509841,  9.26

In [18]:
y_pred_mlp_pca = np.exp(mlp_pca.predict(X_test_pca)) - 1
np.vstack([y_pred_mlp_pca, y_test]).T

array([[2.99940260e+06, 1.20116163e+07],
       [2.08397553e+04, 3.33241486e+04],
       [2.05252776e+05, 2.38523981e+06],
       [1.39642171e+04, 9.38601857e+03],
       [5.28579874e+08, 4.92727657e+07],
       [1.40476437e+07, 7.38885174e+06],
       [2.84173970e+05, 2.54351662e+04],
       [2.13696367e+06, 3.05710839e+07],
       [8.90025777e+05, 6.63959823e+07],
       [6.39132017e+05, 2.10656915e+07],
       [3.14472054e+03, 3.23314286e+03],
       [1.89714844e+05, 1.36014586e+07],
       [1.58670082e+05, 1.54136935e+05],
       [9.14870729e+05, 1.45145923e+06],
       [3.01719985e+08, 1.38923527e+08],
       [6.62077428e+05, 7.79718400e+04],
       [1.21557396e+07, 2.45338577e+08],
       [2.53250367e+04, 3.74244540e+04],
       [5.14169429e+05, 1.52487329e+05],
       [2.55407529e+06, 2.88024710e+07],
       [1.72873120e+06, 2.00413946e+05],
       [2.71665850e+03, 2.64205128e+03],
       [1.10015674e+05, 7.83838358e+06],
       [2.68664434e+06, 2.08384060e+07],
       [7.220775

In [20]:
np.sqrt(mean_squared_error(y_test, y_pred_mlp_pca))

179912768.69029018

In [13]:
np.sqrt(mean_squared_error(y_test, y_pred_mlp_pca)) / 1000000

179.9127686902902

In [17]:
y_test

array([1.20116163e+07, 3.33241486e+04, 2.38523981e+06, 9.38601857e+03,
       4.92727657e+07, 7.38885174e+06, 2.54351662e+04, 3.05710839e+07,
       6.63959823e+07, 2.10656915e+07, 3.23314286e+03, 1.36014586e+07,
       1.54136935e+05, 1.45145923e+06, 1.38923527e+08, 7.79718400e+04,
       2.45338577e+08, 3.74244540e+04, 1.52487329e+05, 2.88024710e+07,
       2.00413946e+05, 2.64205128e+03, 7.83838358e+06, 2.08384060e+07,
       8.00313133e+03, 1.33452812e+04, 5.03613512e+03, 1.01475701e+08,
       1.70959440e+08, 1.33998569e+05, 5.01905352e+04, 4.47800417e+07,
       5.67358080e+06, 6.49168000e+04, 6.01183939e+03, 1.02739592e+04,
       7.79730116e+06, 4.58846617e+06, 9.45002040e+05, 3.68237419e+04,
       2.44507913e+04, 5.98407459e+04, 2.28288389e+08, 1.29727408e+07,
       7.14495278e+07, 6.56229387e+04, 2.93736411e+05, 1.89995781e+04,
       9.24897676e+07, 3.67741633e+08, 7.15205066e+06, 9.97219740e+04,
       8.86482509e+04, 5.36664603e+05, 7.65330608e+03, 2.70600951e+04,
      