In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [9]:
import os


def _find_project_root(start_dir, marker="Code"):
    """Return the parent of the nearest path component named ``marker``.

    Walks upwards from ``start_dir`` one path component at a time. As soon as
    a component equal to ``marker`` is found, the directory *containing* that
    component is returned.

    Raises
    ------
    FileNotFoundError
        If the top of the path is reached without meeting ``marker``.
        Without this guard the walk would never terminate, because
        ``os.path.split`` keeps returning the root with an empty tail.
    """
    remaining = start_dir
    while True:
        parent, component = os.path.split(remaining)
        if component == marker:
            return parent
        if parent == remaining:
            # Splitting no longer shortens the path: we are at the root.
            raise FileNotFoundError(
                f"No directory named {marker!r} found in the path of {start_dir!r}"
            )
        remaining = parent


cwd = os.getcwd()
# Directory that holds the "Code" folder; used as the base for data paths.
project_path = _find_project_root(cwd)

In [10]:
# Read the three fare tables stored in the project's Datasets folder.
# The dz / in / ca suffixes match the "datazone", "intermediate zone" and
# "council area" labels these frames are given further down.
data_dz = pd.read_csv(f"{project_path}/Datasets/data_dz.csv")
data_in = pd.read_csv(f"{project_path}/Datasets/data_in.csv")
data_ca = pd.read_csv(f"{project_path}/Datasets/data_ca.csv")

In [11]:
def generate_train_val_test_set(df, name, random_state=None, train_frac=0.7, val_frac=0.7):
    """Split ``df`` into train/validation/test parts and slice out feature sets.

    Rows containing any missing value are dropped first. ``train_frac`` of the
    remaining rows are sampled for training; ``val_frac`` of what is left
    becomes the validation set and the rest is the test set. With the default
    fractions this is roughly 70% / 21% / 9% of the cleaned rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "FARE" and "Distance". Sampled rows are
        removed from the pool by index label, so the index is assumed to be
        unique.
    name : str
        Label stored under the "name" key of the result.
    random_state : None, int, array-like, BitGenerator or numpy.random.RandomState
        Seed for the two sampling steps. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state.
    train_frac : float
        Fraction of the cleaned rows sampled for the training set.
    val_frac : float
        Fraction of the non-training rows sampled for the validation set.

    Returns
    -------
    dict
        ``{"train": {...}, "val": {...}, "test": {...}, "name": name}`` where
        each split maps

        * "y"    -> 1-D array of the "FARE" column,
        * "dist" -> single-column frame holding "Distance",
        * "simd" -> every column except "ORIGIN_CODE", "DESTINATION_CODE",
          "Distance" and "FARE",
        * "full" -> the "simd" columns plus "Distance".

        Feature columns keep the order they have in ``df``.
    """
    clean = df.dropna()

    # One generator shared by both draws: the split is reproducible for a
    # given seed without reusing the same random stream twice.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train = clean.sample(frac=train_frac, random_state=rng)
    remainder = clean.drop(train.index)
    val = remainder.sample(frac=val_frac, random_state=rng)
    test = remainder.drop(val.index)

    # Select columns by walking the frame's own column order. Going through a
    # set would make the feature order depend on string hashing, which can
    # change between interpreter sessions.
    non_features = {"ORIGIN_CODE", "DESTINATION_CODE", "FARE"}
    full_columns = [col for col in clean.columns if col not in non_features]
    simd_columns = [col for col in full_columns if col != "Distance"]

    output = {}
    for split_name, part in (("train", train), ("val", val), ("test", test)):
        output[split_name] = {
            "y": np.ravel(part[["FARE"]]),
            "dist": part[["Distance"]],
            "simd": part[simd_columns],
            "full": part[full_columns],
        }
    output["name"] = name

    return output

In [12]:
# Build one train/val/test bundle per input table, each tagged with a
# human-readable label that is echoed when scores are printed.
dataset_dz = generate_train_val_test_set(df=data_dz, name="datazone")
dataset_in = generate_train_val_test_set(df=data_in, name="intermediate zone")
dataset_ca = generate_train_val_test_set(df=data_ca, name="council area")

In [13]:
from sklearn.linear_model import LinearRegression

def fit_and_evaluate_LinReg(dataset, subset = "full"):
    """Fit a linear regression on one feature subset and report its validation score.

    Parameters
    ----------
    dataset : dict
        Bundle with "train" and "val" entries (each holding the feature
        frames and a "y" target) plus a "name" label.
    subset : str
        Key of the feature frame to use inside each split; defaults to "full".

    Returns
    -------
    LinearRegression
        The estimator fitted on the training split.

    The score printed is ``LinearRegression.score`` (R^2) evaluated on the
    validation split, not on the test split.
    """
    train_split = dataset["train"]
    regressor = LinearRegression().fit(train_split[subset], train_split["y"])

    val_split = dataset["val"]
    val_score = regressor.score(val_split[subset], val_split["y"])

    print(f"{subset} using {dataset['name']} dataset, score: {val_score}")
    return regressor

In [14]:
# Score every feature subset against every dataset; the outer loop runs over
# subsets so the printed lines are grouped by subset.
for subset in ["dist", "simd", "full"]:
    for dataset in [dataset_dz, dataset_in, dataset_ca]:
        fit_and_evaluate_LinReg(dataset=dataset, subset=subset)

dist using datazone dataset, score: 0.7893463154765963
dist using intermediate zone dataset, score: 0.759110096378808
dist using council area dataset, score: 0.7671929993905116
simd using datazone dataset, score: 0.22871732240725895
simd using intermediate zone dataset, score: 0.21833619194192466
simd using council area dataset, score: 0.42242416975360647
full using datazone dataset, score: 0.8120896717263887
full using intermediate zone dataset, score: 0.7806813625272585
full using council area dataset, score: 0.8055409211768857
