In [1]:
# %pip install pandas
# %pip install scikit-learn
# %pip install numpy

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold, train_test_split

LR = LogisticRegression()
kf = RepeatedKFold(n_splits=10, n_repeats=5)

### 3-step sliding frame dataset ###

In [5]:
# Adding the truth values
ground_truth = pd.read_csv("ground_truth.csv", usecols=['time_local', 'Total_Number_Incidents'])
ground_truth['time_local'] = pd.to_datetime(ground_truth.time_local)

# Processing train.csv
df_3step = pd.read_csv("train_P.csv", usecols=['time', 'RUC', 'kappa', 'sliding_frame'], index_col=False)
df_3step['time'] = pd.to_datetime(df_3step.time)
df_3step = df_3step.merge(ground_truth, how="left", left_on="time", right_on='time_local')
df_3step['Total_Number_Incidents'] = df_3step['Total_Number_Incidents'].fillna(0)



# Filtered data and count (.shape)
df_3step = df_3step[(df_3step["sliding_frame"] == 3) & (df_3step["kappa"] == 0.25)]

# Positive (3, 0.25)
p3s_ruc = df_3step[df_3step["RUC"] > 0]
n_p = p3s_ruc.shape[0]

# Negative (3, 0.25)
n3s_ruc = df_3step[df_3step["RUC"] < 0]
n_n = n3s_ruc.shape[0]

x1 = np.array(range(0,n_p)).reshape(-1,1)
y1 = np.array(p3s_ruc["RUC"])



# Residual derived from difference in predicted and true
# Logistic Regression with 70/30 train-test-split

x_train, x_test, y_train, y_test = train_test_split(x1,y1, test_size=0.3)

LR.fit(x_train, y_train)
y_hat1 = LR.predict(x_test)

# res = y1-y_hat1


# / We perform our machine learning models, and check its performance using the loss functions /
# We are doing a binary classification-type problem.
# We can study certain RUC values.? (confused)

ValueError: Unknown label type: 'continuous'

In [8]:
# /Testing window /


# df_3step.to_csv('merged.csv', index = False)
# p3s_ruc.to_csv('merged.csv', index = False)

# p3s_ruc.head()
x1

array([[    0],
       [    1],
       [    2],
       ...,
       [21017],
       [21018],
       [21019]])

In [None]:
df = df_3step[df_3step["Total_Number_Incidents"] > 0]
NAME_OF_PREDICTION_VARIABLE = "RUC"
PERCENT_OF_ACCIDENTS_DETECTED = 0.95
ruc = round(df.quantile(PERCENT_OF_ACCIDENTS_DETECTED)[NAME_OF_PREDICTION_VARIABLE], 3)
print(f"We expect to be able to detect {PERCENT_OF_ACCIDENTS_DETECTED * 100}% of accidents if we treat all RUCs higher than {ruc} as accidents.")

We expect to be able to detect 95.0% of accidents if we treat all RUCs higher than 0.122 as accidents.


**Loss Functions and M-estimators**

In [None]:
# # MA |y-h(x)|
def MAE(y, y_hat, n):
    # sum = 0
    # for i in range(n):
    #     sum += abs(y_hat[i] - y[i])
    # return sum/n
    return np.absolute(np.subtract(y_hat,y)).mean()
# # MS (y-h(x))^2
def MSE(y, y_hat):
    return np.square(np.subtract(y,y_hat)).mean()
''' Implement Scaling Hyperparams'''
# # Huber
class Hub():
    def one(res, n, b):     # finding optimal tau
        huber_lf_1 = []
        huber_lf_2 = []
        tau_list = []
        count = 0
        for j in b:
            tau_list.append(j)
            for i in range (n):
                if res[i] < j:
                    huber_lf_1.append(count + ((res[i])**2)/2)
                else:
                    huber_lf_2.append(count + (j*abs(res[i])) - ((j**2)/2))
            count += 1
        if huber_lf_1 == False:
            ix = int(str(int(min(huber_lf_2)))[:1])
            # print("Optimal Huber distance:", min(huber_lf_2))
            return (tau_list[ix], min(huber_lf_2))
        elif huber_lf_2 == False:
            ix = int(str(int(min(huber_lf_1)))[:1])
            return (tau_list[ix], min(huber_lf_1))

        else:
            if min(huber_lf_2) > min(huber_lf_1):
                ix = int(str(int(min(huber_lf_1)))[:1])
                return (tau_list[ix], min(huber_lf_1))
            else:
                ix = int(str(int(min(huber_lf_2)))[:1])
                return (tau_list[ix], min(huber_lf_1))

    def two(res, n, b):     # testing tau value, returns hub_loss for input tau
        huber_lf_1 = []
        huber_lf_2 = []
        for i in range (n):
            if res[i] < b:
                huber_lf_1.append(((res[i])**2)/2)
            else:
                huber_lf_2.append((b*abs(res[i])) - ((b**2)/2))
        if min(huber_lf_2) > min(huber_lf_1):
            return huber_lf_1
        else:
            return huber_lf_2
# # Cauchy
class Cau():
    def one(res, n, b):      # finding optimal tau
        cauchy_lf = []
        tau_list = []
        count = 0
        for j in b:
            tau_list.append(j)
            for i in range (n):
                cauchy_lf.append(count + ((j**2) * np.log(1+(((res[i])**2)/(j**2)))))
            count += 1

        ix = int(str(int(min(cauchy_lf)))[:1])
        return (tau_list[ix], min(cauchy_lf))
        
    def two(res, n, b):     # testing tau value, returns cauchy for input tau
        cauchy_lf = []
        for i in range (n):
            cauchy_lf.append(((b**2) * np.log(1+(((res[i])**2)/(b**2)))))
        return cauchy_lf


############# VALUES ################

# MAE/ MSE
abs_ruc_p = MAE(y1,y_hat1,n_p)
sq_ruc_p = MSE(y1,y_hat1)

# Tau hyperparameter range and step (alpha) 
b_min = min(y1)
b_max = max(y1)
alpha = np.median(y1)
b_p = np.arange(b_min,b_max,alpha)


# Objective: see if M-estimators perform better than MAE/MSE
# K-fold x-val 10 splits, repeated 5 times
scores_hub = []
scores_cauc = []
for train_ix, test_ix in kf.split(res):
    y_train = res[train_ix]
    scores_hub.append(Hub.one(y_train, train_ix.shape[0], b_p))
    scores_cauc.append(Cau.one(y_train, train_ix.shape[0], b_p))

# UD variable for tau_min
tmin_h = np.mean(scores_hub)
tmin_c= np.mean(scores_cauc)


print(tmin_c,tmin_h)
# print(sq_ruc_p,Hub.two(res,n_p,tmin_h))
# print("\nHuber ERV: ", hub_ruc_p, "\nCauchy ERV: ",cau_ruc_p, "\nMAE:" , abs_ruc_p,"\nMSE:" , sq_ruc_p)

1.3900039474232169e-06 1.3900319495150136e-06
