In [1]:
# %pip install pandas
# %pip install scikit-learn
# %pip install numpy

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.metrics import mean_squared_error as mse
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix


# Estimator that is fit further down in the notebook.
LR = LogisticRegression()
# Repeated 10-fold splitter (5 repeats). A fixed random_state makes the folds
# identical on every Restart & Run All instead of changing from run to run.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

In [3]:
# Load the `time_local` / `Total_Number_Incidents` columns of ground_truth.csv
# and parse the timestamp column so it can be used as a merge key.
ground_truth = pd.read_csv(
    "ground_truth.csv",
    usecols=['time_local', 'Total_Number_Incidents'],
)
ground_truth['time_local'] = pd.to_datetime(ground_truth['time_local'])

# Load train_P.csv, parse its timestamp, and attach the incident count to each row.
df_3step = pd.read_csv(
    "train_P.csv",
    usecols=['time', 'RUC', 'kappa', 'sliding_frame'],
    index_col=False,
)
df_3step['time'] = pd.to_datetime(df_3step['time'])
df_3step = pd.merge(
    df_3step,
    ground_truth,
    how="left",
    left_on="time",
    right_on='time_local',
)
# Rows with no matching ground-truth timestamp get an incident count of 0.
df_3step['Total_Number_Incidents'] = df_3step['Total_Number_Incidents'].fillna(0)
# Subset of rows whose RUC is strictly negative.
n3s_ruc = df_3step.loc[df_3step["RUC"] < 0]

In [15]:
# Grid-search a cut-off on RUC over the negative-RUC rows: for each step i a row
# is flagged (1) when its RUC is greater than -i * STEP_SIZE, and the step with
# the lowest MSE against the incident column is remembered.
ruc_values = np.array(n3s_ruc["RUC"])
y_true = np.array(n3s_ruc["Total_Number_Incidents"])
STEP_SIZE = 0.0001
best_min = 1000000  # lowest MSE seen so far (sentinel start value)
threshold = 100000  # step index i that produced best_min (sentinel start value)
for i in range(0, 100):
    # Vectorised comparison; equivalent to the per-row `1 if x > cut else 0`.
    y_pred = (ruc_values > -i * STEP_SIZE).astype(int)
    if (my_mse := mse(y_true, y_pred)) < best_min:
        best_min = my_mse
        threshold = i
    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
    # arrays, so the indexing below cannot raise IndexError.
    # NOTE(review): assumes the incident column only holds 0/1 here — confirm.
    conf = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(f"True Negative: {conf[0][0]}, False Positive {conf[0][1]}")
    print(f"False Negative: {conf[1][0]}, True Positive {conf[1][1]}, MSE: {my_mse}\n")
# Report the best step that was found rather than the last loop index.
print(f"Best step: {threshold}, RUC cut-off: {-threshold * STEP_SIZE}, MSE: {best_min}")
# y_pred holds the predictions of the final step; as an ndarray it prints abbreviated.
print(y_pred)
print(y_true)


True Negative: 270232, False Positive 0
False Negative: 468, True Positive 0, MSE: 0.0017288511267085334

True Negative: 269686, False Positive 546
False Negative: 468, True Positive 0, MSE: 0.0037458441078684892

True Negative: 269087, False Positive 1145
False Negative: 468, True Positive 0, MSE: 0.005958625785001847

True Negative: 268499, False Positive 1733
False Negative: 468, True Positive 0, MSE: 0.008130772072404877

True Negative: 267827, False Positive 2405
False Negative: 468, True Positive 0, MSE: 0.010613224972294052

True Negative: 267309, False Positive 2923
False Negative: 467, True Positive 1, MSE: 0.012523088289619505

True Negative: 266696, False Positive 3536
False Negative: 467, True Positive 1, MSE: 0.014787587735500554

True Negative: 265870, False Positive 4362
False Negative: 467, True Positive 1, MSE: 0.017838936091614334

True Negative: 265345, False Positive 4887
False Negative: 467, True Positive 1, MSE: 0.019778352419652753

True Negative: 264759, False P

### 3-step sliding frame dataset ###

In [None]:




# Keep only the rows with sliding_frame == 3 and kappa == 0.25.
# NOTE: this rebinds df_3step, so every later cell sees the filtered frame.
df_3step = df_3step[(df_3step["sliding_frame"] == 3) & (df_3step["kappa"] == 0.25)]

# Positive (3, 0.25)
p3s_ruc = df_3step[df_3step["RUC"] > 0]
n_p = p3s_ruc.shape[0]

# Negative (3, 0.25)
n3s_ruc = df_3step[df_3step["RUC"] < 0]
n_n = n3s_ruc.shape[0]

# Single-column feature holding the row position 0..n_p-1; target is the incident count.
# NOTE(review): the RUC value itself is not used as a feature here — confirm intended.
x1 = np.arange(n_p).reshape(-1, 1)
y1 = np.array(p3s_ruc["Total_Number_Incidents"])

# Logistic Regression with 70/30 train-test-split.
# A fixed seed keeps the split (and therefore the fitted model) reproducible.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x1, y1, test_size=0.3, random_state=SPLIT_SEED
)

LR.fit(x_train, y_train)
y_hat1 = LR.predict(x_test)

# Residual derived from difference in predicted and true.
# It must be taken against y_test: y_hat1 only covers the test rows, so
# subtracting it from the full y1 would mix arrays of different length.
# res = y_test - y_hat1


# We fit our machine-learning models and check their performance using loss functions.
# This is a binary-classification problem.
# Open question: can we restrict the study to certain RUC values? (still unclear)

In [None]:
# /Testing window /
# Scratch cell for inspecting intermediate results.

# NOTE: both exports below target the same 'merged.csv'; if both are enabled,
# the second call overwrites the file written by the first.
# df_3step.to_csv('merged.csv', index = False)
# p3s_ruc.to_csv('merged.csv', index = False)

# p3s_ruc.head()
# Show only the first few rows of the feature array instead of the whole thing.
x1[:5]

In [None]:
# Pick an RUC cut-off such that the requested share of incident rows lies above it.
df = df_3step[df_3step["Total_Number_Incidents"] > 0]
NAME_OF_PREDICTION_VARIABLE = "RUC"
PERCENT_OF_ACCIDENTS_DETECTED = 0.95
# Treating every RUC *higher than* the cut-off as an accident catches a fraction p
# of the incident rows only when the cut-off is their (1 - p) quantile; the p
# quantile would leave just (1 - p) of them above it.
# The quantile is taken on the prediction column alone, so the other columns of
# the frame are never aggregated.
ruc = round(df[NAME_OF_PREDICTION_VARIABLE].quantile(1 - PERCENT_OF_ACCIDENTS_DETECTED), 3)
print(f"We expect to be able to detect {PERCENT_OF_ACCIDENTS_DETECTED * 100}% of accidents if we treat all RUCs higher than {ruc} as accidents.")

**Loss Functions and M-estimators**