In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [12]:
import os
def find_project_root(start, marker="Code"):
    """Return the parent of the nearest directory named `marker` on the path `start`.

    The path is walked upwards one component at a time with `os.path.split`.
    `start` itself counts: if its last component is `marker`, its parent is
    returned.

    Parameters
    ----------
    start : str
        Path to start the upward walk from.
    marker : str, default "Code"
        Name of the path component to look for.

    Returns
    -------
    str or None
        The directory that contains `marker`, or None when no component of
        `start` is named `marker`.
    """
    current = start
    while True:
        parent, name = os.path.split(current)
        if name == marker:
            return parent
        # os.path.split stops making progress at the filesystem root (or on an
        # empty path); without this check the walk would loop forever.
        if parent == current:
            return None
        current = parent


cwd = os.getcwd()
project_path = find_project_root(cwd)
if project_path is None:
    # Fall back to the working directory instead of hanging the kernel; later
    # cells that build paths from project_path will then fail on the bad path.
    warnings.warn(
        "No 'Code' directory found in " + repr(cwd) + "; using the current "
        "working directory as project_path.",
        RuntimeWarning,
    )
    project_path = cwd

In [13]:
# Read the three fare tables from <project_path>/Datasets, in the order
# data_dz, data_in, data_ca.
_dataset_files = (
    "/Datasets/data_dz.csv",
    "/Datasets/data_in.csv",
    "/Datasets/data_ca.csv",
)
data_dz, data_in, data_ca = (
    pd.read_csv(project_path + relative_path) for relative_path in _dataset_files
)

In [14]:
def generate_train_val_test_set(df, name, random_state=None):
    """Split `df` into train/validation/test parts and build the feature subsets.

    Rows containing missing values are dropped first. 70 % of the remaining
    rows are sampled for training; the leftover 30 % is split 70/30 into
    validation and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "FARE" and "Distance". "ORIGIN_CODE" and
        "DESTINATION_CODE" are excluded from the feature sets when present.
    name : str
        Label stored under the "name" key of the result.
    random_state : optional
        Forwarded to both `DataFrame.sample` calls. The default (None) keeps
        the original unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    dict
        {"train": {...}, "val": {...}, "test": {...}, "name": name}; each split
        holds
        - "y":    1-D array of the "FARE" column,
        - "dist": DataFrame with only "Distance",
        - "simd": DataFrame with every feature column except "Distance",
        - "full": DataFrame with every feature column.
    """
    df = df.dropna()

    train = df.sample(frac=0.7, random_state=random_state)
    val_test = df.drop(train.index)
    val = val_test.sample(frac=0.7, random_state=random_state)
    test = val_test.drop(val.index)

    # Keep the frame's own column order. Going through set() would give an
    # arbitrary order that can change between interpreter sessions, which
    # makes positional results (e.g. per-feature coefficients) unreproducible.
    non_features = {"ORIGIN_CODE", "DESTINATION_CODE", "FARE"}
    full_columns = [column for column in df.columns if column not in non_features]
    simd_columns = [column for column in full_columns if column != "Distance"]

    output = {"train": {}, "val": {}, "test": {}, "name": name}
    for split_name, part in (("train", train), ("val", val), ("test", test)):
        output[split_name]["y"] = np.ravel(part[["FARE"]])
        output[split_name]["dist"] = part[["Distance"]]
        output[split_name]["simd"] = part[simd_columns]
        output[split_name]["full"] = part[full_columns]

    return output

In [15]:
# Build one train/val/test bundle per table, in the same order as before.
_frames_and_labels = (
    (data_dz, "datazone"),
    (data_in, "intermediate zone"),
    (data_ca, "council area"),
)
dataset_dz, dataset_in, dataset_ca = (
    generate_train_val_test_set(frame, label) for frame, label in _frames_and_labels
)

# Iterated over by check_on_all_data.
datasets = (dataset_dz, dataset_in, dataset_ca)
subsets = ("dist", "simd", "full")

In [16]:
import json

def append_to_json(data, path):
    """Append `data` to the JSON list stored in the file at `path`.

    The file is created when it does not exist. An existing file that is empty
    (or whitespace only) is treated as an empty list. The new content is
    serialised before anything is written and then swapped in with
    `os.replace`, so a record that cannot be serialised, or an interrupt
    during the write, leaves the previous file contents intact.

    Parameters
    ----------
    data : JSON-serialisable object
        Record to append.
    path : str
        Location of the JSON file.

    Raises
    ------
    TypeError
        If `data` is not JSON-serialisable, or the existing file does not hold
        a list.
    json.JSONDecodeError
        If the existing file holds malformed JSON.
    """
    import os

    try:
        with open(path, 'r') as f:
            raw = f.read()
    except FileNotFoundError:
        existing = []
    else:
        existing = json.loads(raw) if raw.strip() else []
    new_data = existing + [data]

    # Serialise up front: a failure here must not touch the file on disk.
    payload = json.dumps(new_data)

    # Write next to the target and rename over it, so the log is never left
    # half-written.
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, 'w') as f:
            f.write(payload)
        os.replace(tmp_path, path)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [17]:
def fit_and_evaluate_ML_model(method, method_name, dataset, subset = "full", log_name = None, *args, **kwargs):
    """Fit one model on a dataset's training split and record its validation score.

    Parameters
    ----------
    method : callable
        Called as `method(*args, **kwargs)`; the result must provide
        `fit(X, y)` and `score(X, y)`.
    method_name : str
        Label stored under "method" in the record.
    dataset : dict
        Must provide `dataset["name"]`, `dataset["train"][subset]`,
        `dataset["train"]["y"]`, `dataset["val"][subset]` and
        `dataset["val"]["y"]`.
    subset : str, default "full"
        Key of the feature set to use.
    log_name : str, -1 or None, default None
        None prints the record. -1 logs to a file named after `method_name`.
        Any other value is used as the log file name (under
        `<project_path>/Logs/`, with a ".json" extension).
    *args, **kwargs
        Forwarded to `method`. The keyword arguments are also stored in the
        record.

    Returns
    -------
    The fitted model.
    """
    def _json_safe(value):
        # Convert a keyword-argument value into something json can serialise.
        if value is None or isinstance(value, (str, bool, int, float)):
            return value
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, (list, tuple)):
            return [_json_safe(item) for item in value]
        if isinstance(value, dict):
            return {str(key): _json_safe(item) for key, item in value.items()}
        # Classes and functions (e.g. an estimator class passed as a keyword)
        # are recorded by name; anything else by its repr.
        return getattr(value, "__name__", None) or repr(value)

    model = method(*args, **kwargs)
    model.fit(dataset["train"][subset], dataset["train"]["y"])
    val_score = model.score(dataset["val"][subset], dataset["val"]["y"])

    if log_name == -1:
        log_name = method_name

    # Sanitise the kwargs: a non-serialisable value would otherwise make the
    # JSON write fail after the (possibly expensive) fit has finished.
    output = {key: _json_safe(value) for key, value in kwargs.items()}
    output["method"] = method_name
    output["dataset"] = dataset["name"]
    output["subset"] = subset
    output["val_score"] = _json_safe(val_score)

    if log_name is not None:
        append_to_json(output, project_path + "/Logs/" +log_name + ".json")
    else:
        print(output)

    return model

In [18]:
def check_on_all_data(method, method_name, log_name, *args, **kwargs):
    """Run `fit_and_evaluate_ML_model` for every dataset / feature-subset pair.

    Iterates over the notebook-level `datasets` and `subsets`. A failure in
    one combination does not stop the sweep: it is appended as one line
    ("method_name,dataset name,subset,message") to
    `<project_path>/Logs/errors.txt` and also raised as a RuntimeWarning so it
    is visible in the cell output.

    Parameters
    ----------
    method, method_name, log_name
        Passed through to `fit_and_evaluate_ML_model`.
    *args, **kwargs
        Extra arguments for `method`, passed through unchanged.
    """
    import warnings

    for dataset in datasets:
        for subset in subsets:
            try:
                # subset and log_name are passed positionally: supplying them
                # as keywords alongside *args would bind args[0] to `subset`
                # and raise "multiple values for argument 'subset'".
                fit_and_evaluate_ML_model(method, method_name, dataset, subset, log_name, *args, **kwargs)
            except Exception as e:
                record = ",".join((method_name, dataset["name"], subset, str(e)))
                with open(project_path + "/Logs/errors.txt", 'a') as log:
                    # One record per line; without the newline successive
                    # errors run together.
                    log.write(record + "\n")
                warnings.warn("check_on_all_data failure: " + record, RuntimeWarning)

In [29]:


# Imported in this cell because it sits above the notebook's sklearn import
# cell; without it a top-to-bottom run fails here with a NameError.
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on every dataset / feature subset, logged to a
# file named after the method (log_name = -1).
check_on_all_data(LinearRegression, "linreg", -1)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [30]:


# Random forest with default settings on every dataset / feature subset;
# log_name = -1 logs to a file named after the method.
check_on_all_data(method=RandomForestRegressor, method_name="rfr", log_name=-1)

In [19]:


# Sweep the SVR kernels; every run is capped at 10000 solver iterations.
kernels = ("linear", "poly", "rbf", "sigmoid")
for k in kernels:
    svr_options = {"kernel": k, "max_iter": 10000}
    check_on_all_data(SVR, "svr", -1, **svr_options)



In [23]:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

def rfe_model(final_model = LinearRegression, n_features=5, *args, **kwargs):
    """Build a two-step pipeline: recursive feature elimination, then the model.

    Parameters
    ----------
    final_model : callable, default LinearRegression
        Called as `final_model(*args, **kwargs)` to create the estimator. The
        same instance is handed to RFE as its estimator and used as the
        pipeline's last step.
    n_features : int, default 5
        Passed to RFE as `n_features_to_select`.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps "rfe" and "ml_model", in that order.
    """
    estimator = final_model(*args, **kwargs)
    selector = RFE(estimator=estimator, n_features_to_select=n_features)
    steps = [("rfe", selector), ("ml_model", estimator)]
    return Pipeline(steps=steps)

# Try RFE + random forest for 3 to 14 selected features on every dataset.
for n in range(3, 15):
    rfe_options = {"final_model": RandomForestRegressor, "n_features": n}
    check_on_all_data(rfe_model, "rfe_random_forest", -1, **rfe_options)
    

KeyboardInterrupt: 

In [27]:
# Preview only the first rows; displaying the whole training frame bloats the
# saved notebook.
dataset_in["train"]["full"].head()


In [42]:
attrs = []

# RFE with a default random forest on the intermediate-zone "full" features:
# for each target feature count, log the selected columns and the validation
# score to Logs/rfe_rfr.json.
train_part = dataset_in["train"]
val_part = dataset_in["val"]
for n in range(3, 15):
    print(f"fitting {n} features")
    rfe = RFE(estimator = RandomForestRegressor(), n_features_to_select = n)
    rfe.fit(train_part["full"], train_part["y"])
    val_score = rfe.score(val_part["full"], val_part["y"])
    # get_support() is a boolean mask aligned with the training columns.
    features = [
        column
        for column, kept in zip(train_part["full"].columns, rfe.get_support())
        if kept
    ]
    output = {"n": n, "features": features, "val_score": val_score}
    append_to_json(output, project_path + "/Logs/rfe_rfr.json")

fitting 3 features
fitting 4 features
fitting 5 features
fitting 6 features
fitting 7 features
fitting 8 features
fitting 9 features
fitting 10 features
fitting 11 features
fitting 12 features
fitting 13 features
fitting 14 features


In [39]:
from sklearn.preprocessing import StandardScaler

def scaled_RFR(*args, **kwargs):
    """Return a pipeline that standardises the features, then fits a random forest.

    All positional and keyword arguments are forwarded to
    `RandomForestRegressor`. The pipeline steps are named "scale" and "rfr".
    """
    forest = RandomForestRegressor(*args, **kwargs)
    steps = [("scale", StandardScaler()), ("rfr", forest)]
    return Pipeline(steps)

# Scaled random forest on every dataset / feature subset; log_name = -1 logs
# to a file named after the method.
check_on_all_data(method=scaled_RFR, method_name="rfr_scaled", log_name=-1)
    

In [43]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps all components on the intermediate-zone "full" features.
# NOTE(review): no scaling step is applied in this cell; confirm that fitting
# PCA on unscaled features is intended.
pca = PCA()
pca.fit(X=dataset_in["train"]["full"])

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [48]:
# Cumulative share of variance explained by the first k components.
pca.explained_variance_ratio_.cumsum()

array([0.69252126, 0.86339337, 0.94631817, 0.9626189 , 0.97372533,
       0.98061898, 0.98595923, 0.98999985, 0.99277191, 0.99517099,
       0.99660473, 0.9975907 , 0.99829402, 0.99879811, 0.99924563,
       0.99949771, 0.99972225, 0.99987542, 0.99996104, 0.99998371,
       0.99999169, 0.99999577, 0.99999768, 0.99999903, 0.99999966,
       0.99999982, 0.99999991, 0.99999997, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        ])

In [47]:
# Loadings of the first principal component, one value per input feature.
pca.components_[0, :]

array([ 4.06688755e-02,  4.31673166e-04,  4.80154142e-04, -4.95626470e-04,
        3.93083204e-06,  1.63347368e-02,  3.06149015e-02,  4.60951671e-03,
       -4.91695638e-04,  1.46756952e-02,  1.29587525e-02, -2.35196926e-04,
        4.50900829e-03,  8.42893701e-06,  1.37922196e-04,  5.35586498e-03,
        4.12925183e-04,  2.68233657e-02,  2.32576956e-02, -3.44793017e-04,
        2.87570064e-06,  4.54010211e-04,  9.95386622e-01,  4.46815648e-07,
        5.21794279e-03,  2.35434650e-02,  1.67887470e-02,  4.01887213e-02,
        2.72068815e-02, -2.32321225e-04,  3.42636340e-05,  1.33716777e-02,
        3.47104497e-05, -3.36396685e-06,  5.06497016e-06,  1.00508421e-04,
        3.83515796e-04, -3.48163972e-04,  1.08072927e-03,  3.16956308e-02,
       -3.37095447e-06,  1.51073684e-02,  2.85769383e-04])