In [1]:
# %pip install pandas
# %pip install scikit-learn
# %pip install numpy

In [86]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.metrics import mean_squared_error as mse
from matplotlib import pyplot as plt 
from sklearn.metrics import confusion_matrix

LR = LogisticRegression()
kf = RepeatedKFold(n_splits=10, n_repeats=5)

In [7]:
# Adding the truth values
ground_truth = pd.read_csv("ground_truth.csv", usecols=['time_local', 'Total_Number_Incidents'])
ground_truth['time_local'] = pd.to_datetime(ground_truth.time_local)

# Processing train.csv
df_3step = pd.read_csv("train_P.csv", usecols=['time', 'RUC', 'kappa', 'sliding_frame'], index_col=False)
df_3step['time'] = pd.to_datetime(df_3step.time)
df_3step = df_3step.merge(ground_truth, how="left", left_on="time", right_on='time_local')
df_3step['Total_Number_Incidents'] = df_3step['Total_Number_Incidents'].fillna(0)
n3s_ruc = df_3step[df_3step["RUC"] < 0]

In [95]:
#print(y_pred)
print(f"True Accidents: {np.count_nonzero(y_true)} True No Accidents: {len(y_true) - np.count_nonzero(y_true)}")
y_true = np.array(n3s_ruc["Total_Number_Incidents"])
best_min = 1000000
lowest_mse = 1000000
step_size = 0.0001
for i in range(0, 1000):
    y_pred = [1 if x > -i * step_size else 0 for x in np.array(n3s_ruc["RUC"])]
    if (saved_mse := mse(y_true, y_pred)) < lowest_mse:
        lowest_mse = saved_mse
        best_min = -i * step_size
    print(f"MSE:{saved_mse}\n")
    conf = confusion_matrix(y_true, y_pred)
    print(f"True Negetive: {conf[0][0]}, False Positive: {conf[0][1]}")
    print(f"False Negetive: {conf[1][0]}, True Positive: {conf[1][1]}")
print("Best Min", best_min)

True Accidents: 468 True No Accidents: 270232
MSE:0.0017288511267085334

True Negetive: 270232, False Positive: 0
False Negetive: 468, True Positive: 0
MSE:0.0037458441078684892

True Negetive: 269686, False Positive: 546
False Negetive: 468, True Positive: 0
MSE:0.005958625785001847

True Negetive: 269087, False Positive: 1145
False Negetive: 468, True Positive: 0
MSE:0.008130772072404877

True Negetive: 268499, False Positive: 1733
False Negetive: 468, True Positive: 0
MSE:0.010613224972294052

True Negetive: 267827, False Positive: 2405
False Negetive: 468, True Positive: 0
MSE:0.012523088289619505

True Negetive: 267309, False Positive: 2923
False Negetive: 467, True Positive: 1
MSE:0.014787587735500554

True Negetive: 266696, False Positive: 3536
False Negetive: 467, True Positive: 1
MSE:0.017838936091614334

True Negetive: 265870, False Positive: 4362
False Negetive: 467, True Positive: 1
MSE:0.019778352419652753

True Negetive: 265345, False Positive: 4887
False Negetive: 467, T

KeyboardInterrupt: 

### 3-step sliding frame dataset ###

In [3]:
# Filtered data and count (.shape)
df_3step = df_3step[(df_3step["sliding_frame"] == 3) & (df_3step["kappa"] == 0.25)]

# Positive (3, 0.25)
p3s_ruc = df_3step[df_3step["RUC"] > 0]
n_p = p3s_ruc.shape[0]

# Negative (3, 0.25)
n3s_ruc = df_3step[df_3step["RUC"] < 0]
n_n = n3s_ruc.shape[0]

x1 = np.array(range(0,n_p)).reshape(-1,1)
y1 = np.array(p3s_ruc["Total_Number_Incidents"])



# Residual derived from difference in predicted and true
# Logistic Regression with 70/30 train-test-split

x_train, x_test, y_train, y_test = train_test_split(x1,y1, test_size=0.3)

LR.fit(x_train, y_train)
y_hat1 = LR.predict(x_test)

# res = y1-y_hat1


# / We perform our machine learning models, and check its performance using the loss functions /
# We are doing a binary classification-type problem.
# We can study certain RUC values.? (confused)

In [4]:
# /Testing window /


# df_3step.to_csv('merged.csv', index = False)
# p3s_ruc.to_csv('merged.csv', index = False)

# p3s_ruc.head()
x1

array([[    0],
       [    1],
       [    2],
       ...,
       [21017],
       [21018],
       [21019]])

In [5]:
df = df_3step[df_3step["Total_Number_Incidents"] > 0]
NAME_OF_PREDICTION_VARIABLE = "RUC"
PERCENT_OF_ACCIDENTS_DETECTED = 0.95
ruc = round(df.quantile(PERCENT_OF_ACCIDENTS_DETECTED)[NAME_OF_PREDICTION_VARIABLE], 3)
print(f"We expect to be able to detect {PERCENT_OF_ACCIDENTS_DETECTED * 100}% of accidents if we treat all RUCs higher than {ruc} as accidents.")

We expect to be able to detect 95.0% of accidents if we treat all RUCs higher than 0.15 as accidents.


**Loss Functions and M-estimators**