In [None]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error

import torch
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras import optimizers
from keras.layers import BatchNormalization

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time
from pathlib import Path

## 5 Models

In [4]:
X1_pca = torch.load('X1_pca').to_numpy()
X1_ica = torch.load('X1_ica').to_numpy()
X1_tsne = torch.load('X1_tsne').to_numpy()
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue ']).to_numpy().ravel()
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X1_pca, Y1, random_state=42, test_size=0.1)
X_train_ica, X_test_ica, _, _ = train_test_split(X1_ica, Y1, random_state=42, test_size=0.1)
X_train_tsne, X_test_tsne, _, _ = train_test_split(X1_tsne, Y1, random_state=42, test_size=0.1)

### 5.2 KNN

In [5]:
knn = KNeighborsRegressor()
knn_k_range = list(range(1, 41))
knn_param_grid = {
    'n_neighbors': knn_k_range,
    'leaf_size': list(range(1, 101))
}

In [6]:
knn_grid_pca = GridSearchCV(knn, knn_param_grid, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True,verbose=2)
knn_grid_ica = GridSearchCV(knn, knn_param_grid, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True,verbose=2)
knn_grid_tsne = GridSearchCV(knn, knn_param_grid, cv=10, scoring='neg_root_mean_squared_error', return_train_score=True,verbose=2)

In [None]:
knn_grid_pca.fit(X_train_pca, np.log(1 + y_train))

Fitting 10 folds for each of 4000 candidates, totalling 40000 fits
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=1; total time=   0.0s
[CV] END .........................leaf_size=1, n_neighbors=2; total time=   0.0s
[CV] END .........................leaf_siz

In [None]:
print("The best parameter K is:", knn_grid_pca.best_params_)
print("The best RMSE is:", knn_grid_pca.best_score_)