In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [16]:
import os

cwd = os.getcwd()

# Walk upwards from the working directory until a path component named
# "Code" is found; the project root is the directory that contains it.
path_head, path_tail = os.path.split(cwd)
while path_tail != "Code":
    parent_head, path_tail = os.path.split(path_head)
    if parent_head == path_head:
        # os.path.split() on the filesystem root returns the root unchanged,
        # so without this guard the loop would spin forever whenever the
        # notebook is launched from outside a "Code" directory.
        raise FileNotFoundError(
            f"No directory named 'Code' found in the working directory path: {cwd}"
        )
    path_head = parent_head
project_path = path_head

In [17]:
# Read the three CSV files found under <project_path>/Datasets, in the
# order: data_dz, data_in, data_ca.
data_dz, data_in, data_ca = (
    pd.read_csv(project_path + relative_csv)
    for relative_csv in (
        "/Datasets/data_dz.csv",
        "/Datasets/data_in.csv",
        "/Datasets/data_ca.csv",
    )
)

In [18]:
def generate_train_val_test_set(df, name, random_state=None):
    """Split ``df`` into train/validation/test sets with three feature views.

    Rows containing any missing value are dropped first. 70% of the remaining
    rows are sampled as the training set; 70% of the left-over rows become
    the validation set and the rest become the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "FARE" (target) and "Distance". The columns
        "ORIGIN_CODE" and "DESTINATION_CODE" are never used as features.
    name : str
        Label stored under the "name" key of the result.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or generator driving the random sampling. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global random
        state; pass an integer to obtain a reproducible split.

    Returns
    -------
    dict
        ``{"train": {...}, "val": {...}, "test": {...}, "name": name}`` where
        each split dictionary holds:

        * "y"    - 1-D array of the "FARE" values,
        * "dist" - DataFrame with only the "Distance" column,
        * "simd" - DataFrame with every feature column except "Distance",
        * "full" - DataFrame with every feature column including "Distance".
    """
    df = df.dropna()

    # Build the feature lists once, in the frame's own column order. Iterating
    # a set of strings yields an order that can change between interpreter
    # sessions, which made the feature matrices' column order unstable.
    # dict.fromkeys removes repeated labels while keeping first-seen order.
    non_feature_columns = {"ORIGIN_CODE", "DESTINATION_CODE", "FARE"}
    full_columns = list(
        dict.fromkeys(col for col in df.columns if col not in non_feature_columns)
    )
    simd_columns = [col for col in full_columns if col != "Distance"]

    # One generator shared by both draws, so a single integer seed fixes the
    # whole three-way split.
    if isinstance(random_state, (int, np.integer)):
        sampler_state = np.random.RandomState(random_state)
    else:
        sampler_state = random_state

    train = df.sample(frac=0.7, random_state=sampler_state)
    val_test = df.drop(train.index)

    val = val_test.sample(frac=0.7, random_state=sampler_state)
    test = val_test.drop(val.index)

    output = {"name": name}
    for split_name, split_frame in (("train", train), ("val", val), ("test", test)):
        output[split_name] = {
            "y": np.ravel(split_frame[["FARE"]]),
            "dist": split_frame[["Distance"]],
            "simd": split_frame[simd_columns],
            "full": split_frame[full_columns],
        }

    return output

In [19]:
# Build one train/val/test bundle per source frame, in the order:
# data_dz, data_in, data_ca.
dataset_dz, dataset_in, dataset_ca = (
    generate_train_val_test_set(source_frame, label)
    for source_frame, label in (
        (data_dz, "datazone"),
        (data_in, "intermediate zone"),
        (data_ca, "council area"),
    )
)

In [20]:
from sklearn.linear_model import LinearRegression

def fit_and_evaluate_LinReg(dataset, subset = "full", *args, **kwargs):
    """Fit a LinearRegression on the training split and report its validation score.

    Parameters
    ----------
    dataset : dict
        Must provide ``dataset["train"]`` and ``dataset["val"]``, each holding
        the feature view named by ``subset`` plus a "y" target, and a
        ``dataset["name"]`` label used in the printed message.
    subset : str
        Key of the feature view to train and score on (default "full").
    *args, **kwargs
        Forwarded unchanged to the ``LinearRegression`` constructor.

    Returns
    -------
    The fitted model. The value returned by ``model.score`` on the validation
    split is printed, not returned.
    """
    train_split = dataset["train"]
    val_split = dataset["val"]

    model = LinearRegression(*args, **kwargs)
    model.fit(train_split[subset], train_split["y"])

    # Scored on the "val" split; the "test" split is not touched here.
    validation_score = model.score(val_split[subset], val_split["y"])
    print(f"{subset} using {dataset['name']} dataset, score: {validation_score}")
    return model

In [21]:
# Evaluate every (feature view, dataset) pair with default LinearRegression
# settings; the feature view varies slowest, matching the printed order.
for subset, dataset in [
    (feature_view, bundle)
    for feature_view in ("dist", "simd", "full")
    for bundle in (dataset_dz, dataset_in, dataset_ca)
]:
    fit_and_evaluate_LinReg(dataset, subset)

dist using datazone dataset, score: 0.7454216374954086
dist using intermediate zone dataset, score: 0.7730990377099377
dist using council area dataset, score: 0.7628373942830609
simd using datazone dataset, score: 0.21109865976194198
simd using intermediate zone dataset, score: 0.2097357731780457
simd using council area dataset, score: 0.4005856327364465
full using datazone dataset, score: 0.7729277775008446
full using intermediate zone dataset, score: 0.7931849201916199
full using council area dataset, score: 0.8024052384288308


In [23]:
# Same sweep as before, but with the intercept term disabled
# (fit_intercept=False is forwarded to LinearRegression).
for subset, dataset in [
    (feature_view, bundle)
    for feature_view in ("dist", "simd", "full")
    for bundle in (dataset_dz, dataset_in, dataset_ca)
]:
    fit_and_evaluate_LinReg(dataset, subset, fit_intercept=False)

dist using datazone dataset, score: 0.730812771915907
dist using intermediate zone dataset, score: 0.7609415648881732
dist using council area dataset, score: 0.7506820407192061
simd using datazone dataset, score: 0.16021950520581218
simd using intermediate zone dataset, score: 0.17318786550880327
simd using council area dataset, score: 0.38081286168497264
full using datazone dataset, score: 0.7725656928714593
full using intermediate zone dataset, score: 0.7924411150163289
full using council area dataset, score: 0.7976622319663561
