In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
# Load dataset
df = pd.read_csv(r"C:\Users\Olym\Downloads\Data_Analysis_Self_Projects\Traffic_Analysis\Regulatory Affairs of Road Accident Data 2020 India_Logistic_Regression.csv")

# See first rows
print(df.head())
print(df.info())

  Million Plus Cities   Cause category Cause Subcategory  \
0                Agra  Traffic Control           Blinker   
1                Agra  Traffic Control           Blinker   
2                Agra  Traffic Control           Blinker   
3                Agra  Traffic Control           Blinker   
4                Agra  Traffic Control           Blinker   

         Outcome of Incident  Count  
0         Greviously Injured    0.0  
1               Minor Injury    0.0  
2             Persons Killed    0.0  
3              Total Injured    0.0  
4  Total number of Accidents    0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9550 entries, 0 to 9549
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Million Plus Cities  9550 non-null   object 
 1   Cause category       9550 non-null   object 
 2   Cause Subcategory    9550 non-null   object 
 3   Outcome of Incident  9550 non-null   object 
 4   

In [51]:
#Adding new column "Outcome_Code"
df['Outcome_Code'] = df['Outcome of Incident'].map({
    'Persons Killed': 1,       # Fatal
    'Greviously Injured': 0,   # Non-Fatal
    'Minor Injury': 0           # Non-Fatal
})

In [52]:
print(df['Outcome of Incident'].unique())
print(df['Outcome_Code'].value_counts())

['Greviously Injured' 'Minor Injury' 'Persons Killed' 'Total Injured'
 'Total number of Accidents']
Outcome_Code
0.0    4000
1.0    2000
Name: count, dtype: int64


In [53]:
# List of rows to remove
unwanted_rows = ['Total Injured', 'Total number of Accidents']

# Keep only rows that are NOT in the unwanted list
df = df[~df['Outcome of Incident'].isin(unwanted_rows)]

In [54]:
df['Outcome of Incident'].unique() #checking if the null values are removed

array(['Greviously Injured', 'Minor Injury', 'Persons Killed'],
      dtype=object)

In [55]:
#Dropping the 'Outcome of Incident' column as it's of no use after encoding
df = df.drop(columns=['Outcome of Incident', 'Count'])

In [56]:
df.head()

Unnamed: 0,Million Plus Cities,Cause category,Cause Subcategory,Outcome_Code
0,Agra,Traffic Control,Blinker,0.0
1,Agra,Traffic Control,Blinker,0.0
2,Agra,Traffic Control,Blinker,1.0
5,Agra,Traffic Control,Other,0.0
6,Agra,Traffic Control,Other,0.0


In [57]:
#Changing the data type of the feature columns into type 'str' so that it can be encoded properly
cols_to_encode = ['Million Plus Cities', 'Cause category', 'Cause Subcategory']

for col in cols_to_encode:
    df[col] = df[col].astype(str)

In [58]:
#One hot encoding the avlues in the feature columns
#df_encoded is the transformed version of df after applying one-hot encoding (or other preprocessing).
df_encoded = pd.get_dummies(df, columns=cols_to_encode)

In [59]:
df_encoded.head() #checking the data

Unnamed: 0,Outcome_Code,Million Plus Cities_Agra,Million Plus Cities_Ahmedabad,Million Plus Cities_Allahabad(Prayagraj),Million Plus Cities_Amritsar,Million Plus Cities_Asansol Durgapur,Million Plus Cities_Aurangabad,Million Plus Cities_Bengaluru,Million Plus Cities_Bhopal,Million Plus Cities_Chandigarh,...,Cause Subcategory_Round about Junction,Cause Subcategory_Staggered Junction,Cause Subcategory_Steep Grade,Cause Subcategory_Stop Sign,Cause Subcategory_Straight Road,Cause Subcategory_Sunny,Cause Subcategory_Traffic Light Signal,Cause Subcategory_Two Wheelers,Cause Subcategory_Uncontrolled,Cause Subcategory_Use of Mobile Phone
0,0.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,0.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,0.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


#### Preparing Features and Target

In [60]:
# X = all feature columns
# y = target column
X = df_encoded.drop('Outcome_Code', axis=1)  # here drop means all the columns except the one used for target column
y = df_encoded['Outcome_Code']

#### Splitting data into training and testing sets

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training the model

#### Baseline Logistic Regression without class weight balance (default settings)

In [76]:
# training the model
model = LogisticRegression (max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#Report of  how the model performs
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Default Threshold (0.5):")
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.6616666666666666
Default Threshold (0.5):
Confusion Matrix:
 [[794   0]
 [406   0]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.66      1.00      0.80       794
         1.0       0.00      0.00      0.00       406

    accuracy                           0.66      1200
   macro avg       0.33      0.50      0.40      1200
weighted avg       0.44      0.66      0.53      1200

ROC-AUC: 0.3594446029953717


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Logistic Regression with Class Weight Balanced (Threshold = 0.5)

In [70]:
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]
#y_pred_new = (y_prob >= 0.3).astype(int)  # try 0.3 instead of 0.5

In [71]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

print("Default Threshold (0.5):")
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))
# ROC AUC
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.4175
Default Threshold (0.5):
Confusion Matrix:
 [[358 436]
 [263 143]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.58      0.45      0.51       794
         1.0       0.25      0.35      0.29       406

    accuracy                           0.42      1200
   macro avg       0.41      0.40      0.40      1200
weighted avg       0.47      0.42      0.43      1200

ROC-AUC: 0.3594446029953717


#### Logistic Regression with Class Weight Balanced (Threshold = 0.3)
##### 1. Lowering the threshold usually improves recall for class 1 (catches more positives)
##### 2. It may reduce precision for class 0 (more false alarms).

In [73]:
# After changing the threshold to 0.3
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]
y_pred_new = (y_prob >= 0.3).astype(int)  # try 0.3 instead of 0.5

In [77]:
#Accuracy checking
print("Accuracy:", accuracy_score(y_test, y_pred_new))
print("Custom Threshold (0.3):")
# Confusion Matrix
print(confusion_matrix(y_test, y_pred_new))
# Classification Report
print(classification_report(y_test, y_pred_new))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_new))

Accuracy: 0.3383333333333333
Custom Threshold (0.3):
[[  0 794]
 [  0 406]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       794
         1.0       0.34      1.00      0.51       406

    accuracy                           0.34      1200
   macro avg       0.17      0.50      0.25      1200
weighted avg       0.11      0.34      0.17      1200

ROC-AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
