## 1. Load the packages

In [1]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector, RFE, SelectPercentile, chi2, mutual_info_regression, SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import torch

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time

## 2. Read the data

In [30]:
# Feature sets were serialized to disk and are restored with torch.load.
# NOTE(review): presumably X1 is the training set and X2 the set to predict on — confirm.
# NOTE(review): torch.load unpickles its input, so only load files produced by this project.
X1 = torch.load('X1_ready')
X2 = torch.load("X2_ready")

# Target values: the CSV is read without a header row, then flattened to a 1-D array.
Y1 = (
    pd.read_csv("Y1.csv", header=None, names=['revenue '])
    .values
    .ravel()
)

## 4. Feature Selection

### 4.1 PCA

In [20]:
# PCA is driven by variance, so first bring every feature onto a common [0, 1] range.
scaler = MinMaxScaler()
X1_scaled = scaler.fit(X1).transform(X1)

# A fractional n_components keeps just enough components to explain 90% of the variance.
pca = PCA(n_components=0.9)
X1_dr = pd.DataFrame(data=pca.fit_transform(X1_scaled))

In [21]:
# Display the PCA-reduced training features (one column per retained component).
X1_dr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,202,203,204,205,206,207,208,209,210,211
0,1.074424,-2.161530,-0.936288,-0.053778,1.190090,0.375195,1.125037,-0.947954,1.417951,0.023848,...,-0.101279,-0.026644,0.033659,-0.027399,0.055939,-0.115115,0.015425,-0.179017,-0.026631,-0.200034
1,2.144452,-1.407100,0.007022,0.448588,0.909266,0.338555,-2.144590,1.463919,-1.829458,0.032558,...,0.026538,0.094841,0.127622,0.007805,0.106799,-0.076922,-0.088961,-0.075707,0.057985,-0.147128
2,-0.865928,1.190855,-0.894807,-1.536223,-0.991711,0.593091,0.408147,-1.477073,-0.345129,-0.341638,...,0.022256,0.178123,0.004219,0.111407,0.118543,-0.032966,0.192540,-0.087592,-0.059001,0.183067
3,-2.740416,-0.882269,-0.098248,0.246708,0.697753,-0.663753,1.717218,0.148337,1.399702,-0.198154,...,0.069559,0.053045,-0.047770,0.197331,-0.027902,-0.163518,0.316982,0.145173,0.046951,-0.206452
4,-0.184803,-1.428510,1.580334,0.032370,0.421992,1.285152,-0.544388,0.046470,-0.835224,0.042241,...,0.210758,0.132003,-0.134942,-0.170740,0.079117,-0.346379,0.004720,0.049966,-0.030792,0.065447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3535,2.193360,-1.654529,-1.687835,-0.999234,-0.554242,0.728890,0.072859,-0.743913,0.554547,0.498505,...,0.021576,0.158498,-0.208196,0.310086,-0.039214,0.130943,0.042784,-0.132897,0.208451,0.138824
3536,2.303812,0.570052,-1.945312,1.475385,-1.034361,0.345514,-0.405876,0.615252,-0.698974,0.581836,...,-0.082416,-0.030385,0.083231,0.167702,0.032957,-0.275276,0.290022,-0.011803,0.288836,-0.507725
3537,-0.856999,0.828311,-2.190585,-0.892538,0.265036,1.040558,-0.211797,-0.141693,0.558447,-0.101539,...,0.304015,-0.158117,-0.199135,0.205695,-0.454384,0.135780,0.258396,-0.141307,-0.073912,0.072626
3538,-2.042392,-1.089467,1.048786,-1.226227,0.602062,1.934884,-0.200589,1.390398,2.460879,0.633317,...,-0.152947,-0.036885,0.124885,-0.011633,0.108943,0.222807,0.344395,-0.003310,0.157921,-0.258083


## 5. Models

In [32]:
# Fit a random forest on the training features to rank them by importance.
# - random_state pins the bootstrap / feature sampling so the importances are
#   reproducible across kernel restarts (the forest is stochastic otherwise).
# - n_jobs=-1 grows the trees on all CPU cores; the default single-threaded fit
#   was slow enough here that the run had to be interrupted by hand.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X1, Y1)

# One importance score per input feature, in the column order of X1.
importance = model.feature_importances_

KeyboardInterrupt: 

In [31]:
# Inspect the target values. Y1 is already flattened with .ravel() when it is
# loaded, so the extra ravel() here does not change the values shown.
Y1.ravel()

[71585301.0692343,
 89648.34474017745,
 31649074.650306743,
 38981968.84210526,
 9697023.469430052,
 52094.439024390245,
 76951369.58885017,
 192351.09410864572,
 9949811.762090048,
 63056.40348464007,
 6747.143968871595,
 22050233.500311527,
 19035828.512820512,
 53234585.6,
 23117122.49350649,
 2304.0,
 295950.81910274964,
 309978.22950819664,
 116851917.30631934,
 126768383.52883434,
 2512.0,
 40671957.88904969,
 551577.5012059816,
 68879035.21259841,
 8182.323513062812,
 15824.85462145843,
 102550.04341534009,
 87051712.38554217,
 57226353.96934509,
 358684.78649789025,
 246888584.7123543,
 16070362.220264316,
 309.12,
 3627700.1904761903,
 127053.55330438976,
 58645242.52252253,
 379908.2842732691,
 7259.476971942826,
 67475.25333333333,
 959367.9750778816,
 5753922.986993114,
 130800716.4134727,
 7040.713423130515,
 668334.3622047243,
 2449.902097902098,
 35322835.2,
 236228.66184274,
 2942.363535297387,
 38464.42685638061,
 6828.799999999999,
 2206.83112758073,
 9071461.43779777