In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
from data_helper import *

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [4]:

# Baseline runs: call check_on_all_data once per regressor class, in the same
# order as before (linear regression first, then random forest).
# NOTE(review): check_on_all_data presumably comes from the data_helper star
# import; the meaning of the trailing -1 is not visible here — confirm.
for estimator_cls, run_label in (
    (LinearRegression, "linreg"),
    (RandomForestRegressor, "rfr"),
):
    check_on_all_data(estimator_cls, run_label, -1)

In [9]:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

# Run check_on_dataset for SVR once per kernel name.
# NOTE(review): kernel / max_iter are presumably forwarded to the SVR
# constructor by check_on_dataset — confirm against data_helper.
kernels = ("linear", "poly", "rbf", "sigmoid")
for k in kernels:
    check_on_dataset(
        SVR,
        "svr",
        -1,
        kernel=k,
        max_iter=10000,
    )

def rfe_model(final_model = LinearRegression, n_features=5, *args, **kwargs):
    """Build a pipeline that applies recursive feature elimination, then a model.

    Parameters
    ----------
    final_model : callable
        Estimator class (or factory). It is called as
        ``final_model(*args, **kwargs)`` to create both the estimator that RFE
        uses to rank features and the estimator used as the final step.
    n_features : int
        Passed to ``RFE`` as ``n_features_to_select``.
    *args, **kwargs
        Forwarded unchanged to ``final_model``.

    Returns
    -------
    Pipeline
        A pipeline with the steps ``"rfe"`` and ``"ml_model"``.
    """
    # Build two independent, identically-configured instances. Sharing a single
    # object between RFE(estimator=...) and the final step would mean that
    # fitting the final step (or changing its parameters) also mutates the
    # estimator held by the selector.
    ranking_estimator = final_model(*args, **kwargs)
    model = final_model(*args, **kwargs)
    rfe = RFE(estimator = ranking_estimator, n_features_to_select = n_features)
    return Pipeline(steps = [('rfe', rfe), ("ml_model", model)])

# Sweep the RFE feature budget from 1 to 14, using rfe_model with a
# RandomForestRegressor as the wrapped estimator class.
for n in range(1, 15):
    check_on_dataset(
        rfe_model,
        "rfe_random_forest",
        -1,
        final_model=RandomForestRegressor,
        n_features=n,
    )
    

In [6]:
# NOTE(review): attrs is assigned here but never used within this cell.
attrs = []

# For each feature budget 1..14: fit RFE around a default RandomForestRegressor
# on the training split, score it on the validation split, and append the
# selected column names plus the score to the rfe_rfr.json log.
for n in range(1, 15):
    print(f"fitting {n} features")
    rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=n)

    train_split = dataset_in["train"]
    rfe.fit(train_split["full"], train_split["y"])

    val_split = dataset_in["val"]
    val_score = rfe.score(val_split["full"], val_split["y"])

    # Keep the column names whose entry in the RFE support mask is truthy,
    # preserving the original column order.
    features = [
        column
        for column, kept in zip(train_split["full"].columns, rfe.get_support())
        if kept
    ]

    output = {"n": n, "features": features, "val_score": val_score}
    append_to_json(output, project_path + "/Logs/rfe_rfr.json")

fitting 1 features
fitting 2 features
fitting 3 features
fitting 4 features
fitting 5 features
fitting 6 features
fitting 7 features
fitting 8 features
fitting 9 features
fitting 10 features
fitting 11 features
fitting 12 features
fitting 13 features
fitting 14 features


In [7]:
from sklearn.preprocessing import StandardScaler

def scaled_RFR(*args, **kwargs):
    """Return a pipeline of StandardScaler followed by RandomForestRegressor.

    All positional and keyword arguments are forwarded to
    ``RandomForestRegressor``. The steps are named ``"scale"`` and ``"rfr"``.
    """
    scaler = StandardScaler()
    forest = RandomForestRegressor(*args, **kwargs)
    steps = [("scale", scaler), ("rfr", forest)]
    return Pipeline(steps)

# Run the scaled random-forest pipeline factory through check_on_dataset.
check_on_dataset(
    scaled_RFR,
    "rfr_scaled",
    -1,
)
    

In [10]:
from sklearn.decomposition import PCA

def PCA_model(final_model = LinearRegression, n_components=5, *args, **kwargs):
    """Build a pipeline that projects the inputs with PCA, then fits a model.

    Parameters
    ----------
    final_model : callable
        Estimator class (or factory); instantiated as
        ``final_model(*args, **kwargs)`` for the last pipeline step.
    n_components : int
        Passed to ``PCA`` as ``n_components``.
    *args, **kwargs
        Forwarded unchanged to ``final_model``.

    Returns
    -------
    Pipeline
        A pipeline with the steps ``"pca"`` and ``"ml_model"``.
    """
    estimator = final_model(*args, **kwargs)
    reducer = PCA(n_components=n_components)
    steps = [("pca", reducer), ("ml_model", estimator)]
    return Pipeline(steps=steps)

# Sweep the number of PCA components from 1 to 14 with a RandomForestRegressor
# as the final estimator class, calling fit_and_evaluate_ML_model on dataset_in.
for n in range(1, 15):
    fit_and_evaluate_ML_model(
        PCA_model,
        "pca_random_forest",
        dataset_in,
        log_name=-1,
        final_model=RandomForestRegressor,
        n_components=n,
    )

Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model
Deleted final_model


In [11]:
# Sweep the number of PCA components from 1 to 14; final_model is left at the
# PCA_model default (LinearRegression).
for n in range(1, 15):
    check_on_dataset(
        PCA_model,
        "pca_lin_reg",
        -1,
        n_components=n,
    )

In [12]:
# Fit PCA_model() with its default arguments on the training split; the fitted
# pipeline returned by fit() is the cell's displayed output.
model = PCA_model()
train_split = dataset_in["train"]
model.fit(train_split["full"], train_split["y"])

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('ml_model', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [13]:
# Score the fitted pipeline on the validation split (displayed as cell output).
val_split = dataset_in["val"]
model.score(val_split["full"], val_split["y"])

0.8958601722625366

In [19]:
# Introspection: show, as a string, the type (metaclass) of the
# RandomForestRegressor class object itself.
str(RandomForestRegressor.__class__)

"<class 'abc.ABCMeta'>"