**Packages and initialisations**

In [None]:
# %pip install pandas
# %pip install scikit-learn
# %pip install numpy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

### Loss Functions and M-estimators ###

In [None]:
# # MAE |y-h(x)|
def MAE(y, y_hat):
    """Return the mean absolute error between ``y`` and ``y_hat``.

    Both arguments are passed to ``np.subtract``, so they may be scalars or
    array-likes that broadcast against each other.
    """
    residuals = np.subtract(y_hat, y)
    return np.mean(np.abs(residuals))
    
# # MSE (y-h(x))^2
def MSE(y, y_pred):
    """Return the mean squared error between ``y`` and ``y_pred``.

    Both arguments are passed to ``np.subtract``, so they may be scalars or
    array-likes that broadcast against each other.
    """
    diff = np.subtract(y, y_pred)
    return np.mean(diff * diff)

''' Implement Scaling Hyperparams'''

# # Huber
def Hub(y,y_hat, n, b):     # returns ERV averaged over all candidate betas
    """Return the mean Huber loss of ``y - y_hat`` over every beta in ``b``.

    For each beta the loss of a residual ``r`` is::

        r**2 / 2                        if |r| <= |beta|
        |beta| * |r| - beta**2 / 2      otherwise

    and the result is the mean over all (beta, sample) pairs, i.e. the same
    pooling that ``Cau`` uses for the Cauchy loss.

    Args:
        y: observed values (array-like).
        y_hat: prediction(s); a scalar or anything that broadcasts against ``y``.
        n: number of leading residuals to include.
        b: candidate beta value(s); a scalar or an iterable of scalars.

    Returns:
        The mean Huber loss as a float, or NaN when there are no residuals
        or no betas to average over.
    """
    # The Huber branches are defined on |r|; comparing the signed residual
    # would send every negative residual down the quadratic branch.
    res = np.atleast_1d(np.abs(np.subtract(y, y_hat)))[:n].astype(float)
    # The scale must be positive; a negative beta would otherwise make the
    # linear branch negative.
    deltas = np.abs(np.atleast_1d(np.asarray(b, dtype=float)))
    if res.size == 0 or deltas.size == 0:
        return float("nan")

    r = res[np.newaxis, :]          # shape (1, n)
    d = deltas[:, np.newaxis]       # shape (len(b), 1)
    losses = np.where(r <= d, 0.5 * r ** 2, d * r - 0.5 * d ** 2)
    # Average both branches together so no part of the loss is discarded.
    return losses.mean()
# # Cauchy
def Cau(y,y_hat, n, b):      # returns ERV pooled over all betas
    """Return the mean Cauchy-type loss of ``y - y_hat`` over every beta in ``b``.

    Each (beta, sample) pair contributes ``beta**2 * log(1 + r**2 / beta**2)``
    where ``r`` is the residual of that sample; the first ``n`` residuals are
    used. The contributions for all betas are pooled and averaged together.
    """
    residual = y - y_hat
    pooled = [
        (beta**2) * np.log(1+((residual[idx]**2)/(beta**2)))
        for beta in b
        for idx in range(n)
    ]
    return np.mean(pooled)



### Gradient descent is not implemented: the improvement it gave was too small to justify its computational cost.

# def Grad_Desc_MSE(y_hat,y,theta,alpha,iters,tau_ruc):  # alpha = 0.05, iter = 5000
#     ls = []
#     for j in tau_ruc:
#         y_hat = j
#         for i in range(iters):
#             theta = theta - (2*alpha/len(y)) * (y_hat - y[i])
#         ls.append(theta)
#     print(ls)

### Data Processing ###

In [None]:
# Load the ground truth: one incident count per local timestamp.
ground_truth = pd.read_csv("ground_truth.csv", usecols=['time_local', 'Total_Number_Incidents'])
ground_truth['time_local'] = pd.to_datetime(ground_truth.time_local)

# Load train_P.csv and attach the incident count for each timestamp.
df_3step = pd.read_csv("train_P.csv", usecols=['time', 'RUC', 'kappa', 'sliding_frame'], index_col=False)
df_3step['time'] = pd.to_datetime(df_3step.time)
# Left merge keeps every training row; rows without a matching ground-truth
# timestamp get NaN, which is then treated as "no incident" (0).
df_3step = df_3step.merge(ground_truth, how="left", left_on="time", right_on='time_local')
df_3step['Total_Number_Incidents'] = df_3step['Total_Number_Incidents'].fillna(0)

# Keep only the (sliding_frame=3, kappa=0.25) configuration and drop rows with RUC == 0.
df_3step = df_3step[(df_3step["sliding_frame"] == 3) & (df_3step["kappa"] == 0.25)]
df_3step = df_3step[df_3step["RUC"] != 0]

# Positive (3, 0.25): rows with RUC > 0 and their count.
# NOTE(review): p3s_ruc / n_p are not used by the code visible in this file.
p3s_ruc = df_3step[df_3step["RUC"] > 0]
n_p = p3s_ruc.shape[0]

# Negative (3, 0.25): rows with RUC < 0 and their count.
n3s_ruc = df_3step[df_3step["RUC"] < 0]
n_n = n3s_ruc.shape[0]

# Export the negative-RUC rows for data visualisation.
n3s_ruc.to_csv("merged.csv")

# Setting and splitting data for training into 60-40.
# Naming note: x1 holds the incident counts (used later as the true labels in
# confusion_matrix) and y1 holds the RUC values (the quantity that is thresholded).
# No random_state is passed, so the split differs from run to run.
x1 = np.array(n3s_ruc["Total_Number_Incidents"])
y1 = np.array(n3s_ruc["RUC"])
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.4)

### Training Model {Setting Parameters for beta & tau_ruc} ###

In [None]:
# The loss functions are used both to select the threshold (tau) and to check
# how well the resulting model performs.
# This is a threshold-classification problem.
# The residual is the difference between the observed RUC and the candidate tau.

# Beta grid for the Huber / Cauchy losses.
# Note the naming: b_min holds max(y1) and b_max holds min(y1). y1 contains only
# RUC < 0 rows, so the grid starts at the RUC closest to zero and steps by the
# median of y1 towards the most negative RUC.
# NOTE(review): np.arange yields values only if the sign of the step matches
# the direction from start to stop - confirm against the data.
b_min = max(y1)
b_max = min(y1)
alpha = np.median(y1)
b_range = np.arange(b_min,b_max,alpha)

## max/min tau derived from highest/lowest RUC where accident happened
## Each "Tau Param" section below reassigns tau_range, so when this cell runs
## top to bottom only the last one (Tau Param 4) is in effect.

# # Tau Param 1: full RUC span of the training set, step = median RUC
tau_max = min(y_train)
tau_min = max(y_train)
alph0 = np.median(y_train)
tau_range = np.arange(tau_min,tau_max,alph0)

# Tau Param 2           ## Selecting range ends based on accidents happening
# NOTE(review): the loop re-assigns the same min/max already set in Tau Param 1,
# so the `i == 1` condition does not change the values.
for i in x_train:
    if i == 1:
        tau_max = min(y_train)
        tau_min = max(y_train)
alph0 = tau_min         # step equals the starting value
tau_range = np.arange(tau_min,tau_max,alph0)


# Tau Param 3           ## Selecting tau_min based on accidents, tau_max has no preference
# NOTE(review): tau_min already equals max(y_train), so the loop has no effect.
for i in x_train:
    if i == 1:
        tau_min = max(y_train)
tau_max = min(y_train)
alph0 = np.median(y_train)/2
tau_range = np.arange(tau_min,tau_max,alph0)

# Tau Param 4           ## Selecting tau_max based on accidents, tau_min has no preference
# NOTE(review): tau_max already equals min(y_train), so this produces the same
# tau_range as Tau Param 3.
for i in x_train:
    if i == 1:
        tau_max = min(y_train)
tau_min = max(y_train)
alph0 = np.median(y_train)/2
tau_range = np.arange(tau_min,tau_max,alph0)

### MSE MODEL ###

In [None]:
def run_MSE():
    """Pick the tau in ``tau_range`` with the lowest MSE on ``y_train`` and score it.

    Reads the module-level ``tau_range``, ``y_train``, ``y_test`` and ``x_test``.
    Every test RUC value above the chosen threshold is predicted as 1, the rest
    as 0, and the predictions are compared with ``x_test``. The confusion
    matrix cells are printed.

    Returns:
        tuple: ``(best_min, threshold, score)`` - the lowest MSE found, the tau
        that produced it, and the fraction of test rows counted in the
        true-negative and true-positive cells.

    Raises:
        ValueError: if no tau yields a comparable loss (empty ``tau_range`` or
            only NaN losses).
    """
    # Start from +inf so the first real loss always wins; a fixed cap would
    # silently keep an arbitrary threshold whenever every loss exceeds it.
    best_min = float("inf")
    threshold = None

    for tau_ruc in tau_range:           ## Keep the tau with the smallest loss seen so far
        if (my_mse := MSE(y_train, tau_ruc)) < best_min:
            best_min = my_mse
            threshold = tau_ruc

    if threshold is None:
        raise ValueError("tau_range produced no usable threshold (empty range or NaN losses)")

    y_pred = [1 if x > threshold else 0 for x in y_test]        ## Predict based on y_test and threshold
    # Pin the labels so the matrix is always 2x2, even when a class is absent
    # from the test labels or from the predictions.
    conf = confusion_matrix(x_test, y_pred, labels=[0, 1])

    print(f"True Negative: {conf[0][0]}, False Positive {conf[0][1]}, threshold: {threshold}")      ## Confusion matrix for true-false
    print(f"False Negative: {conf[1][0]}, True Positive {conf[1][1]}, MSE: {best_min}\n")

    score = (conf[1][1] + conf[0][0]) / len(x_test)

    return best_min,threshold,score

# Run the MSE search; `a` is the (best_min, threshold, score) tuple it returns.
a = run_MSE()
print (a)

### MAE MODEL ###

In [None]:
def run_MAE():
    """Pick the tau in ``tau_range`` with the lowest MAE on ``y_train`` and score it.

    Reads the module-level ``tau_range``, ``y_train``, ``y_test`` and ``x_test``.
    Every test RUC value above the chosen threshold is predicted as 1, the rest
    as 0, and the predictions are compared with ``x_test``. The confusion
    matrix cells are printed.

    Returns:
        tuple: ``(best_min, threshold, score)`` - the lowest MAE found, the tau
        that produced it, and the fraction of test rows counted in the
        true-negative and true-positive cells.

    Raises:
        ValueError: if no tau yields a comparable loss (empty ``tau_range`` or
            only NaN losses).
    """
    # Start from +inf so the first real loss always wins; a fixed cap would
    # silently keep an arbitrary threshold whenever every loss exceeds it.
    best_min = float("inf")
    threshold = None

    for tau_ruc in tau_range:
        if (my_mae := MAE(y_train, tau_ruc)) < best_min:
            best_min = my_mae
            threshold = tau_ruc

    if threshold is None:
        raise ValueError("tau_range produced no usable threshold (empty range or NaN losses)")

    # Classify in test set
    y_pred = [1 if x > threshold else 0 for x in y_test]
    # Pin the labels so the matrix is always 2x2, even when a class is absent
    # from the test labels or from the predictions.
    conf = confusion_matrix(x_test, y_pred, labels=[0, 1])

    print(f"True Negative: {conf[0][0]}, False Positive {conf[0][1]}, threshold: {threshold}")
    print(f"False Negative: {conf[1][0]}, True Positive {conf[1][1]}, MAE: {best_min}\n")

    score = (conf[1][1] + conf[0][0]) / len(x_test)

    return best_min,threshold,score

# Run the MAE search; `b` is the (best_min, threshold, score) tuple it returns.
b = run_MAE()
print (b)

### HUBER MODEL ###

In [None]:
def run_Hub():
    """Pick the tau in ``tau_range`` with the lowest Huber loss on ``y_train`` and score it.

    Reads the module-level ``tau_range``, ``b_range``, ``y_train``, ``y_test``
    and ``x_test``. Every test RUC value above the chosen threshold is
    predicted as 1, the rest as 0, and the predictions are compared with
    ``x_test``. The confusion matrix cells are printed.

    Returns:
        tuple: ``(best_min, threshold, score)`` - the lowest Huber loss found,
        the tau that produced it, and the fraction of test rows counted in the
        true-negative and true-positive cells.

    Raises:
        ValueError: if no tau yields a comparable loss (empty ``tau_range`` or
            only NaN losses).
    """
    # Start from +inf so the first real loss always wins; a fixed cap would
    # silently keep an arbitrary threshold whenever every loss exceeds it.
    best_min = float("inf")
    threshold = None

    for tau_ruc in tau_range:                                 # current tau is tau_ruc, against all RUC in current y_train
        if (my_hub := Hub(y_train, tau_ruc, y_train.shape[0], b_range)) < best_min:
            best_min = my_hub
            threshold = tau_ruc

    if threshold is None:
        raise ValueError("tau_range produced no usable threshold (empty range or NaN losses)")

    y_pred = [1 if x > threshold else 0 for x in y_test]
    # Pin the labels so the matrix is always 2x2, even when a class is absent
    # from the test labels or from the predictions.
    conf = confusion_matrix(x_test, y_pred, labels=[0, 1])

    print(f"True Negative: {conf[0][0]}, False Positive {conf[0][1]}, threshold: {threshold}")
    print(f"False Negative: {conf[1][0]}, True Positive {conf[1][1]}, Huber: {best_min}\n")

    score = (conf[1][1] + conf[0][0]) / len(x_test)

    return best_min,threshold,score
    
# Run the Huber search; `c` is the (best_min, threshold, score) tuple it returns.
c = run_Hub()
print (c)

### CAUCHY MODEL ###

In [None]:
def run_Cauc():
    """Pick the tau in ``tau_range`` with the lowest Cauchy loss on ``y_train`` and score it.

    Reads the module-level ``tau_range``, ``b_range``, ``y_train``, ``y_test``
    and ``x_test``. Every test RUC value above the chosen threshold is
    predicted as 1, the rest as 0, and the predictions are compared with
    ``x_test``. The confusion matrix cells are printed.

    Returns:
        tuple: ``(best_min, threshold, score)`` - the lowest Cauchy loss found,
        the tau that produced it, and the fraction of test rows counted in the
        true-negative and true-positive cells.

    Raises:
        ValueError: if no tau yields a comparable loss (empty ``tau_range`` or
            only NaN losses).
    """
    # Start from +inf so the first real loss always wins; a fixed cap would
    # silently keep an arbitrary threshold whenever every loss exceeds it.
    best_min = float("inf")
    threshold = None

    for tau_ruc in tau_range:                                 # current tau is tau_ruc, against all RUC in current y_train
        if (my_cauc := Cau(y_train, tau_ruc, y_train.shape[0], b_range)) < best_min:
            best_min = my_cauc
            threshold = tau_ruc

    if threshold is None:
        raise ValueError("tau_range produced no usable threshold (empty range or NaN losses)")

    y_pred = [1 if x > threshold else 0 for x in y_test]
    # Pin the labels so the matrix is always 2x2, even when a class is absent
    # from the test labels or from the predictions.
    conf = confusion_matrix(x_test, y_pred, labels=[0, 1])

    print(f"True Negative: {conf[0][0]}, False Positive {conf[0][1]}, threshold: {threshold}")
    print(f"False Negative: {conf[1][0]}, True Positive {conf[1][1]}, Cauchy: {best_min}\n")

    score = (conf[1][1] + conf[0][0]) / len(x_test)

    return best_min,threshold,score

# Run the Cauchy search; `d` is the (best_min, threshold, score) tuple it returns.
d = run_Cauc()
print (d)

### ERV : Empirical Risk Value ###

In [None]:

# a, b, c, d are the (best loss, threshold tau, accuracy score) tuples returned
# by run_MSE, run_MAE, run_Hub and run_Cauc.
# erv maps each model's best loss to the tau that produced it.
erv = {a[0] : a[1], b[0] : b[1], c[0] : c[1], d[0] : d[1]}
# The losses are the dict KEYS, so the best ERV is the smallest key.
# min(erv, key=erv.get) would instead select the entry with the smallest tau.
best_erv = min(erv)
print("Best Loss Function ERV: {:.6f}, and corresponding tau: {:.6f}".format(best_erv, erv[best_erv]))

# scores maps each model name to its accuracy on the test split.
scores = {"MSE" : a[2], "MAE" : b[2], "Huber" : c[2], "Cauchy" : d[2]}
best_model = max(scores, key=scores.get)
print("Best Model is {} with a prediction score of : {:.5f}".format(best_model, scores[best_model]))