In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  from collections import Sequence


In [2]:
import os
def _find_project_root(start, marker="Code"):
    """Return the directory that contains the nearest folder named ``marker``.

    Walks upward from ``start`` (``start`` itself included) one path component
    at a time. On the first component equal to ``marker`` the part of the path
    to its left is returned, which is what the original loop produced.

    Args:
        start: path to begin the upward search from.
        marker: name of the folder whose parent is wanted.

    Returns:
        The path of the directory containing ``marker``.

    Raises:
        FileNotFoundError: if no component of ``start`` is named ``marker``.
    """
    current = start
    while True:
        parent, name = os.path.split(current)
        if name == marker:
            return parent
        if parent == current:
            # os.path.split() returns its input unchanged at the filesystem
            # root (and for an empty path), so without this check the search
            # would spin forever when the marker folder is absent.
            raise FileNotFoundError(
                "No folder named " + repr(marker) + " found in or above " + repr(start)
            )
        current = parent


cwd = os.getcwd()
project_path = _find_project_root(cwd)

In [3]:
# Read the three fare tables (one per geography) from <project_path>/Datasets.
data_dz, data_in, data_ca = map(
    pd.read_csv,
    (
        project_path + "/Datasets/data_dz.csv",
        project_path + "/Datasets/data_in.csv",
        project_path + "/Datasets/data_ca.csv",
    ),
)

In [4]:
# Column roles used when carving feature subsets out of a fare table.
_ID_COLUMNS = ("ORIGIN_CODE", "DESTINATION_CODE")
_TARGET_COLUMN = "FARE"
_DISTANCE_COLUMN = "Distance"
_REMOTENESS_COLUMNS = (
    "Dist_from_Ed.origin",
    "Dist_from_Gls.origin",
    "Dist_from_Ed.destination",
    "Dist_from_Gls.destination",
)


def generate_train_val_test_set(df, name, random_state=None):
    """Split ``df`` into train/val/test and expose four feature subsets of each.

    Rows containing any missing value are dropped first. 70% of the remaining
    rows are sampled for training; 70% of what is left becomes the validation
    set and the rest the test set.

    Args:
        df: DataFrame holding the target column "FARE", the identifier columns
            "ORIGIN_CODE" and "DESTINATION_CODE", "Distance", the four
            "Dist_from_*" columns, and any further feature columns.
        name: label stored under the "name" key of the result.
        random_state: forwarded to ``DataFrame.sample`` for both sampling
            steps. The default ``None`` keeps the original unseeded behaviour;
            pass an int for a reproducible split.

    Returns:
        dict with keys "train", "val", "test" and "name". Each split maps
        "y" to a 1-D array of fares and "dist", "dist_remoteness", "simd",
        "full" to DataFrames with the corresponding columns.
    """
    df = df.dropna()

    train = df.sample(frac=0.7, random_state=random_state)
    holdout = df.drop(train.index)
    val = holdout.sample(frac=0.7, random_state=random_state)
    test = holdout.drop(val.index)

    never_features = set(_ID_COLUMNS) | {_TARGET_COLUMN}
    # "simd" is everything that is neither distance nor remoteness. The
    # original exclusion list left out "Dist_from_Ed.origin", which let one
    # remoteness column slip into this subset.
    not_simd = never_features | {_DISTANCE_COLUMN} | set(_REMOTENESS_COLUMNS)

    # Iterate df.columns (not a set) so the column order is the same on every
    # run; set iteration order for strings changes between interpreter runs.
    feature_sets = {
        "dist": [_DISTANCE_COLUMN],
        "dist_remoteness": [_DISTANCE_COLUMN, *_REMOTENESS_COLUMNS],
        "simd": [col for col in df.columns if col not in not_simd],
        "full": [col for col in df.columns if col not in never_features],
    }

    output = {"train": {}, "val": {}, "test": {}, "name": name}
    for split_name, frame in (("train", train), ("val", val), ("test", test)):
        output[split_name]["y"] = np.ravel(frame[[_TARGET_COLUMN]])
        for subset_name, columns in feature_sets.items():
            output[split_name][subset_name] = frame[columns]

    return output

In [5]:
# One split bundle per geography, generated in the order datazone ->
# intermediate zone -> council area.
datasets = tuple(
    generate_train_val_test_set(frame, label)
    for frame, label in (
        (data_dz, "datazone"),
        (data_in, "intermediate zone"),
        (data_ca, "council area"),
    )
)
dataset_dz, dataset_in, dataset_ca = datasets

# Feature-subset keys available in every split bundle.
subsets = ("dist", "simd", "full", "dist_remoteness")

In [6]:
import json

def append_to_json(data, path):
    """Append ``data`` to the JSON array stored in the file at ``path``.

    The file is created if it does not exist. An existing file that is empty
    (or whitespace only) is treated as an empty array.

    Args:
        data: JSON-serializable record to append.
        path: location of the JSON log file.

    Raises:
        TypeError: if ``data`` cannot be serialized, or if the file holds
            valid JSON that is not an array. The file is left untouched in
            both cases.
        json.JSONDecodeError: if the file holds non-empty, invalid JSON.
    """
    try:
        with open(path, 'r') as f:
            raw = f.read()
    except FileNotFoundError:
        existing = []
    else:
        existing = json.loads(raw) if raw.strip() else []

    if not isinstance(existing, list):
        raise TypeError(
            "Expected a JSON array in " + repr(path) + ", found " + type(existing).__name__
        )

    # Serialize before opening the file for writing: opening with 'w'
    # truncates immediately, so a serialization error after that point would
    # destroy every record logged so far.
    payload = json.dumps(existing + [data])
    with open(path, 'w') as f:
        f.write(payload)

In [7]:
def fit_and_evaluate_ML_model(method, method_name, dataset, subset = "full", log_name = None, *args, **kwargs):
    """Fit ``method(*args, **kwargs)`` on one feature subset and record its validation score.

    Args:
        method: callable returning an unfitted model that provides
            ``fit(X, y)`` and ``score(X, y)``.
        method_name: label stored under "method" in the record.
        dataset: mapping with "train" and "val" entries (each holding the
            ``subset`` key and "y") plus a "name" entry.
        subset: key of the feature table to train and validate on.
        log_name: ``None`` prints the record; ``-1`` logs to a file named
            after ``method_name``; any other value is used as the file stem.
            Log files are written to ``<project_path>/Logs/<stem>.json``.
        *args, **kwargs: forwarded to ``method``. Because ``subset`` and
            ``log_name`` come before ``*args`` in the signature, positional
            extras can only be given when those two are passed positionally
            as well.

    Returns:
        The fitted model.
    """
    model = method(*args, **kwargs)
    model.fit(dataset["train"][subset], dataset["train"]["y"])
    val_score = model.score(dataset["val"][subset], dataset["val"]["y"])

    if log_name == -1:
        log_name = method_name

    def _loggable(value):
        # Hyperparameters such as an estimator class cannot be written to
        # JSON; record their name (or repr) instead of failing the log write.
        try:
            json.dumps(value)
        except (TypeError, ValueError):
            label = getattr(value, "__name__", None)
            return label if isinstance(label, str) else repr(value)
        return value

    output = {key: _loggable(value) for key, value in kwargs.items()}
    output["method"] = method_name
    output["dataset"] = dataset["name"]
    output["subset"] = subset
    output["val_score"] = val_score

    if log_name is not None:
        append_to_json(output, project_path + "/Logs/" +log_name + ".json")
    else:
        print(output)

    return model

In [8]:
def _evaluate_and_log_errors(method, method_name, log_name, dataset, subset, args, kwargs):
    """Run one fit/evaluate call; on failure append a line to Logs/errors.txt."""
    try:
        if args:
            # fit_and_evaluate_ML_model declares ``subset`` before ``*args``,
            # so forwarding positionals directly would collide with the
            # ``subset=`` keyword. Bind them to the factory instead.
            def factory(**params):
                return method(*args, **params)
        else:
            factory = method
        fit_and_evaluate_ML_model(factory, method_name, dataset, subset = subset, log_name = log_name, **kwargs)
    except Exception as e:
        # Best effort by design: one failing configuration must not stop the sweep.
        with open(project_path + "/Logs/errors.txt", 'a') as log:
            # Trailing newline keeps one error per line.
            log.write(",".join((method_name, dataset["name"], subset, str(e))) + "\n")


def check_on_all_data(method, method_name, log_name, *args, **kwargs):
    """Evaluate ``method`` on every feature subset of every dataset in ``datasets``.

    Args:
        method: callable returning an unfitted model.
        method_name: label recorded with each result.
        log_name: forwarded to ``fit_and_evaluate_ML_model``.
        *args, **kwargs: forwarded to ``method``.

    Failures are written to ``<project_path>/Logs/errors.txt`` and do not
    interrupt the remaining runs.
    """
    for dataset in datasets:
        for subset in subsets:
            _evaluate_and_log_errors(method, method_name, log_name, dataset, subset, args, kwargs)


def check_on_dataset(method, method_name, log_name, dataset = None, *args, **kwargs):
    """Evaluate ``method`` on every feature subset of a single dataset.

    Args:
        method: callable returning an unfitted model.
        method_name: label recorded with each result.
        log_name: forwarded to ``fit_and_evaluate_ML_model``.
        dataset: split bundle to use. ``None`` (the default) means the current
            ``dataset_in``, looked up at call time so that regenerating the
            datasets is picked up without redefining this function.
        *args, **kwargs: forwarded to ``method``.

    Failures are written to ``<project_path>/Logs/errors.txt`` and do not
    interrupt the remaining runs.
    """
    if dataset is None:
        dataset = dataset_in
    for subset in subsets:
        _evaluate_and_log_errors(method, method_name, log_name, dataset, subset, args, kwargs)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [11]:

# Baselines with default hyperparameters; -1 logs each run under its own label.
for estimator_cls, log_label in ((LinearRegression, "linreg"), (RandomForestRegressor, "rfr")):
    check_on_dataset(estimator_cls, log_label, -1)

In [12]:


# Try each SVR kernel with the solver capped at 10000 iterations.
kernels = ("linear", "poly", "rbf", "sigmoid")
for k in kernels:
    svr_options = dict(kernel = k, max_iter = 10000)
    check_on_dataset(SVR, "svr", -1, **svr_options)



In [13]:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

def rfe_model(final_model = LinearRegression, n_features=5, *args, **kwargs):
    """Build a pipeline that selects features with RFE and then fits ``final_model``.

    Args:
        final_model: callable returning an unfitted estimator; it is used both
            to rank features inside RFE and as the final predictor.
        n_features: number of features RFE keeps.
        *args, **kwargs: forwarded to ``final_model``.

    Returns:
        An unfitted ``Pipeline`` with steps "rfe" and "ml_model".
    """
    # Two separate instances: the original shared one object between the RFE
    # step and the final step, so changing parameters on one (e.g. through
    # Pipeline.set_params) silently changed the other.
    ranking_estimator = final_model(*args, **kwargs)
    predictor = final_model(*args, **kwargs)
    rfe = RFE(estimator = ranking_estimator, n_features_to_select = n_features)
    return Pipeline(steps = [('rfe', rfe), ("ml_model", predictor)])

# Sweep the number of features RFE keeps (3..14) with a random-forest model.
for n in range(3, 15):
    rfe_settings = {"final_model": RandomForestRegressor, "n_features": n}
    check_on_dataset(rfe_model, "rfe_random_forest", -1, **rfe_settings)
    

KeyboardInterrupt: 

In [14]:
attrs = []
# For each target feature count, run RFE with a random forest on the
# intermediate-zone data and log which columns were kept and the score.
for n in range(3, 15):
    print(f"fitting {n} features")
    rfe = RFE(estimator = RandomForestRegressor(), n_features_to_select = n)
    rfe.fit(dataset_in["train"]["full"], dataset_in["train"]["y"])
    val_score = rfe.score(dataset_in["val"]["full"], dataset_in["val"]["y"])
    # Keep the column names whose support flag is set.
    features = [
        column
        for column, kept in zip(dataset_in["train"]["full"].columns, rfe.get_support())
        if kept
    ]
    output = dict(n=n, features=features, val_score=val_score)
    append_to_json(output, project_path + "/Logs/rfe_rfr.json")

fitting 3 features
fitting 4 features
fitting 5 features
fitting 6 features
fitting 7 features
fitting 8 features
fitting 9 features
fitting 10 features
fitting 11 features
fitting 12 features
fitting 13 features
fitting 14 features


In [15]:
from sklearn.preprocessing import StandardScaler

def scaled_RFR(*args, **kwargs):
    """Return an unfitted pipeline: standard scaling followed by a random forest.

    ``*args`` and ``**kwargs`` are forwarded to ``RandomForestRegressor``.
    """
    scaler = StandardScaler()
    forest = RandomForestRegressor(*args, **kwargs)
    steps = [("scale", scaler), ("rfr", forest)]
    return Pipeline(steps)

# Evaluate the scaled random forest on every feature subset; -1 logs under "rfr_scaled".
check_on_dataset(scaled_RFR, "rfr_scaled", -1)
    

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with default settings on the full training features of the
# intermediate-zone data.
pca_features = dataset_in["train"]["full"]
pca = PCA()
pca.fit(pca_features)

In [None]:
# Running total of the explained-variance ratios, in component order.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Display the first row of the fitted PCA's components_ array.
pca.components_[0]