In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


In [11]:
df = pd.read_csv("data.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,CASENUM,PEDS,PERMVIT,VE_TOTAL,MONTH,DAY_WEEK,YEAR,HOUR,HARM_EV,...,VPROFILE,VSURCOND,AGE,SEX,AIR_BAG,EJECTION,ALCHL,DRUGS,OVER_SPD,IS_FATAL
0,403347,202204414651,0,2,4,10,7,2022,20.0,12.0,...,1.0,1.0,43.0,1.0,8.0,0.0,1.0,,,0
1,292961,202103019502,0,1,1,2,4,2021,16.0,42.0,...,5.0,4.0,35.0,1.0,20.0,0.0,0.0,0.0,,0
2,355890,220480,0,3,3,7,3,2021,5.0,12.0,...,1.0,1.0,65.0,1.0,1.0,0.0,,,8.0,1
3,2909,201700044626,0,3,2,2,7,2017,12.0,12.0,...,2.0,1.0,54.0,1.0,20.0,0.0,0.0,,0.0,0
4,241064,202002608370,0,1,1,7,5,2020,7.0,34.0,...,1.0,1.0,66.0,1.0,20.0,0.0,0.0,,,0


In [None]:
TARGET = "IS_FATAL"

X = df.drop(TARGET, axis=1)
y = df[TARGET]

# num_cols = X.columns.tolist()
cat_columns = [
    'MONTH', 'DAY_WEEK', 'YEAR', 'HOUR', 'HARM_EV', 'MANCOL', 'RELJCT', 
    'TYP_INT', 'REL_ROAD', 'WRK_ZONE', 'LGT_COND', 'WEATHER', 'MDL_YEAR',
    'MAKE', 'MODEL', 'BODY_TYP', 'J_KNIFE', 'ROLLOVER', 'ROLINLOC', 'VALIGN',
    'VPROFILE', 'VSURCOND', 'SEX', 'AIR_BAG', 'EJECTION', 'ALCHL', 'DRUGS'
]
num_columns = [
    'PEDS', 'VE_TOTAL', 'PERMVIT', 'NUMOCCS', 'SPEED', 'SPEEDLIM', 'AGE',
    'OVER_SPD'
]

num_cols, cat_cols


(['Unnamed: 0',
  'CASENUM',
  'PEDS',
  'PERMVIT',
  'VE_TOTAL',
  'MONTH',
  'DAY_WEEK',
  'YEAR',
  'HOUR',
  'HARM_EV',
  'MANCOL',
  'RELJCT',
  'TYP_INT',
  'REL_ROAD',
  'WRK_ZONE',
  'LGT_COND',
  'WEATHER',
  'NUMOCCS',
  'MDL_YEAR',
  'MAKE',
  'MODEL',
  'BODY_TYP',
  'J_KNIFE',
  'SPEED',
  'ROLLOVER',
  'ROLINLOC',
  'SPEEDLIM',
  'VALIGN',
  'VPROFILE',
  'VSURCOND',
  'AGE',
  'SEX',
  'AIR_BAG',
  'EJECTION',
  'ALCHL',
  'DRUGS',
  'OVER_SPD'],
 [])

In [None]:
def bootstrap_sample(X, y):
    n = len(X)
    indices = np.random.choice(n, size=n, replace=True)

    X_boot = X.iloc[indices]
    y_boot = y.iloc[indices]

    oob_indices = np.setdiff1d(np.arange(n), indices)
    X_oob = X.iloc[oob_indices]
    y_oob = y.iloc[oob_indices]

    return X_boot, y_boot, X_oob, y_oob



In [None]:
lr_preprocess = ColumnTransformer(
    transformers = [
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
    ],
    remainder='drop'
)

knn_preprocess = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_columns),
        ('cat_svd', Pipeline([
            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
            ('svd', TruncatedSVD(n_components=50, random_state=42))
        ]), cat_columns)
    ],
    remainder='drop'
)

In [21]:
log_reg_pipeline = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=200))
])


In [22]:
knn_pipeline = Pipeline([
    ('preprocess', preprocess),
    ('model', KNeighborsClassifier())
])


In [23]:
X_boot, y_boot, X_oob, y_oob = bootstrap_sample(X, y)

results = {}

# Logistic Regression
log_reg_pipeline.fit(X_boot, y_boot)
y_pred_lr = log_reg_pipeline.predict(X_oob)

results["Logistic Regression Accuracy"] = accuracy_score(y_oob, y_pred_lr)
print("Logistic Regression:\n")
print(classification_report(y_oob, y_pred_lr))

# KNN
knn_pipeline.fit(X_boot, y_boot)
y_pred_knn = knn_pipeline.predict(X_oob)

results["KNN Accuracy"] = accuracy_score(y_oob, y_pred_knn)
print("\nKNN:\n")
print(classification_report(y_oob, y_pred_knn))

results


Logistic Regression:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     99325
           1       1.00      0.95      0.97     42183

    accuracy                           0.98    141508
   macro avg       0.99      0.97      0.98    141508
weighted avg       0.98      0.98      0.98    141508


KNN:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     99325
           1       0.97      0.94      0.95     42183

    accuracy                           0.97    141508
   macro avg       0.97      0.96      0.97    141508
weighted avg       0.97      0.97      0.97    141508



{'Logistic Regression Accuracy': 0.9844037086242474,
 'KNN Accuracy': 0.9712454419538118}

In [24]:
pd.DataFrame(results, index=["Accuracy"])


Unnamed: 0,Logistic Regression Accuracy,KNN Accuracy
Accuracy,0.984404,0.971245
