## Packages and initialisations ##

In [12]:
# %pip install pandas
# %pip install scikit-learn
# %pip install numpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.metrics import mean_squared_error as mse

kf = RepeatedKFold(n_splits=10, n_repeats=5)

**Loss Functions and M-estimators**

In [44]:
# # MA |y-h(x)|
def MAE(y, y_hat):
    return np.absolute(np.subtract(y_hat,y)).mean()
    
# # MS (y-h(x))^2
def MSE(y, y_hat):
    return np.square(np.subtract(y,y_hat)).mean()

''' Implement Scaling Hyperparams'''

# # Huber
class Hub():
    def one(y,y_hat, n, b):     # finding optimal tau
        res=y-y_hat
        huber_lf_1 = []
        huber_lf_2 = []
        for j in b:
            for i in range (n):
                if res[i] < j:
                    huber_lf_1.append(((res[i])**2)/2)
                else:
                    huber_lf_2.append((j*abs(res[i])) - ((j**2)/2))
        if huber_lf_1 == False:
            return min(huber_lf_2)
        elif huber_lf_2 == False:
            return min(huber_lf_1)

        else:
            if min(huber_lf_2) > min(huber_lf_1):
                return min(huber_lf_1)
            else:
                return min(huber_lf_2)

    def two(y,y_hat, n, b):     # testing tau value
        res=y-y_hat
        huber_lf_1 = []
        huber_lf_2 = []
        for i in range (n):
            if res[i] < b:
                huber_lf_1.append(((res[i])**2)/2)
            else:
                huber_lf_2.append((b*abs(res[i])) - ((b**2)/2))
        if min(huber_lf_2) > min(huber_lf_1):
            return min(huber_lf_1)
        else:
            return min(huber_lf_2)
# # Cauchy
class Cau():
    def one(y,y_hat, n, b):      # finding optimal tau
        res=y-y_hat
        cauchy_lf = []
        for j in b:
            for i in range (n):
                cauchy_lf.append((j**2) * np.log(1+(((res[i])**2)/(j**2))))
        return min(cauchy_lf)
        
    def two(y,y_hat, n, b):     # testing tau value
        res=y-y_hat
        cauchy_lf = []
        for i in range (n):
            cauchy_lf.append(((b**2) * np.log(1+(((res[i])**2)/(b**2)))))
        return min(cauchy_lf)

## Data Processing ##

In [40]:
# Adding the truth values
ground_truth = pd.read_csv("ground_truth.csv", usecols=['time_local', 'Total_Number_Incidents'])
ground_truth['time_local'] = pd.to_datetime(ground_truth.time_local)

# Processing train.csv
df_3step = pd.read_csv("train_P.csv", usecols=['time', 'RUC', 'kappa', 'sliding_frame'], index_col=False)
df_3step['time'] = pd.to_datetime(df_3step.time)
df_3step = df_3step.merge(ground_truth, how="left", left_on="time", right_on='time_local')
df_3step['Total_Number_Incidents'] = df_3step['Total_Number_Incidents'].fillna(0)

# Filtered data and count (.shape)
df_3step = df_3step[(df_3step["sliding_frame"] == 3) & (df_3step["kappa"] == 0.25)]

# Positive (3, 0.25)
p3s_ruc = df_3step[df_3step["RUC"] > 0]
n_p = p3s_ruc.shape[0]

# Negative (3, 0.25)
n3s_ruc = df_3step[df_3step["RUC"] < 0]
n_n = n3s_ruc.shape[0]

### Training Model ###

In [46]:

# / We derive our machine learning models from, and check, its performance using the loss functions /
# We are doing a threshold classification-type problem.
# Residual derived from difference in predicted and true

y1 = np.array(n3s_ruc["RUC"])
# y_train, y_test = train_test_split(y1)

b_min = max(y1)
b_max = min(y1)
alpha = np.median(y1)
b_range = np.arange(b_min,b_max,alpha)


scores_hub = []
scores_cauc = []
scores_mae = []
scores_mse = []


for train_ix, test_ix in kf.split(y1):  # train-test split   
    y_train = y1[train_ix]
    tau_max = max(y_train)
    alph0 = max(y_train)/2
    tau_range = np.arange(0,tau_max,alph0)
    for y_hat1 in tau_range:                                 # range of tau (y-intercept at horizontal)
        scores_mae.append(MAE(y_train,y_hat1))               # current tau is y_hat, against all RUC in current y_train
        scores_mse.append(MSE(y_train,y_hat1))
        scores_hub.append(Hub.one(y_train,y_hat1, train_ix.shape[0], b_range))
        scores_cauc.append(Cau.one(y_train,y_hat1, train_ix.shape[0], b_range))

# ERV : Empirical Risk Value
print("\nHuber ERV: ", np.mean(scores_hub), "\nCauchy ERV: ", np.mean(scores_cauc), "\nMAE:" , np.mean(scores_mae),"\nMSE:" , np.mean(scores_mse))


# TODO : Make it predict with ground truths.








# y_pred = np.array(p3s_ruc["RUC"])
# y_true = np.array(p3s_ruc["Total_Number_Incidents"])
# best_min = 1000000
# for i in [-x / 1000 for x in range(0, 1000)]:
#     if mse(y_true, y_pred, squared=False) < best_min:
#         best_min = i
# print(i)
# print(y_pred)
# print(y_true)
# plt.plot(y_pred, n3s_ruc['time'])




Huber ERV:  -0.23201388751585086 
Cauchy ERV:  1.7543718818980845e-12 
MAE: 0.055748005547666686 
MSE: 0.006076495779903645


In [15]:
df = df_3step[df_3step["Total_Number_Incidents"] > 0]
NAME_OF_PREDICTION_VARIABLE = "RUC"
PERCENT_OF_ACCIDENTS_DETECTED = 0.95
ruc = round(df.quantile(PERCENT_OF_ACCIDENTS_DETECTED)[NAME_OF_PREDICTION_VARIABLE], 3)
print(f"We expect to be able to detect {PERCENT_OF_ACCIDENTS_DETECTED * 100}% of accidents if we treat all RUCs higher than {ruc} as accidents.")

We expect to be able to detect 95.0% of accidents if we treat all RUCs higher than 0.15 as accidents.
