In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:

# Load data
df = pd.read_csv('final_cleaned_traffic_data.csv')
df.head()

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition,Accident
0,Rainy,City Road,Morning,1.0,100.0,5.0,0.0,Low,Wet,Car,51.0,48.0,Artificial Light,0.0
1,Clear,Rural Road,Night,1.001253,120.0,3.0,0.0,Moderate,Wet,Truck,49.0,43.0,Artificial Light,0.0
2,Rainy,Highway,Evening,1.0,60.0,4.0,0.0,Low,Icy,Car,54.0,52.0,Artificial Light,0.0
3,Clear,City Road,Afternoon,2.0,60.0,3.0,0.0,Low,Under Construction,Bus,34.0,31.0,Daylight,0.0
4,Rainy,Highway,Morning,1.0,195.0,11.0,0.0,Low,Dry,Car,62.0,55.0,Artificial Light,1.0


## Feature Encoding and Scaling

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 798 entries, 0 to 797
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Weather               798 non-null    object 
 1   Road_Type             798 non-null    object 
 2   Time_of_Day           798 non-null    object 
 3   Traffic_Density       798 non-null    float64
 4   Speed_Limit           798 non-null    float64
 5   Number_of_Vehicles    798 non-null    float64
 6   Driver_Alcohol        798 non-null    float64
 7   Accident_Severity     798 non-null    object 
 8   Road_Condition        798 non-null    object 
 9   Vehicle_Type          798 non-null    object 
 10  Driver_Age            798 non-null    float64
 11  Driver_Experience     798 non-null    float64
 12  Road_Light_Condition  798 non-null    object 
 13  Accident              798 non-null    float64
dtypes: float64(7), object(7)
memory usage: 87.4+ KB


# Categorical Features to Encode

## Nominal Categories (No Inherent Order)
- **Weather**: Rainy, Clear, Foggy, Snowy, Stormy
- **Road_Type**: City Road, Highway, Rural Road, Mountain Road
- **Road_Condition**: Dry, Wet, Icy, Under Construction
- **Vehicle_Type**: Car, Truck, Motorcycle, Bus
- **Road_Light_Condition**: Daylight, Artificial Light, No Light

## Ordinal Categories (Have Meaningful Order)
- **Time_of_Day**: Morning, Afternoon, Evening, Night
- **Accident_Severity**: Low, Moderate, High
- **Age_Group**: Young, Adult, Middle_Age, Senior, Elderly


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


In [5]:

# Define feature types
nominal_features = ['Weather', 'Road_Type', 'Road_Condition', 'Vehicle_Type', 'Road_Light_Condition']
ordinal_features = ['Time_of_Day']  
numerical_features = ['Traffic_Density', 'Speed_Limit', 'Number_of_Vehicles', 
                      'Driver_Alcohol', 'Driver_Age']

# Define ordinal feature orders
time_categories = ['Morning', 'Afternoon', 'Evening', 'Night']


In [6]:
# Create preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('ord', OrdinalEncoder(categories=[time_categories]), ordinal_features),
        ('nom', OneHotEncoder(drop='first'), nominal_features)
    ])


In [7]:

# Create and fit the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Remove Driver_Experience due to high correlation with Driver_Age
df_cleaned = df.drop(['Driver_Experience', 'Accident_Severity'], axis=1)


In [8]:

# Apply transformations
X = df_cleaned.drop('Accident', axis=1)
y = df_cleaned['Accident']


X_transformed = pipeline.fit_transform(X)


In [9]:

# Generate feature names for transformed data
feature_names = []

# Numerical feature names (scaled)
feature_names.extend([f'{col}_scaled' for col in numerical_features])

# Ordinal feature name
feature_names.append('Time_of_Day_encoded')

# Get one-hot encoded feature names
ohe = preprocessor.named_transformers_['nom']
for feature, categories in zip(nominal_features, ohe.categories_):
    # Skip the first category because we used drop='first'
    for category in categories[1:]:
        feature_names.append(f'{feature}_{category}')
        
# Create DataFrame with proper column names
X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

# Add target variable back
final_df = X_transformed_df.copy()
final_df['Accident'] = y.values

# Save processed data to CSV
final_df.to_csv('processed_traffic_data.csv', index=False)
print("Processed data saved to 'processed_traffic_data.csv'")



Processed data saved to 'processed_traffic_data.csv'


In [11]:
pd.DataFrame(X_transformed, columns=feature_names).head()

Unnamed: 0,Traffic_Density_scaled,Speed_Limit_scaled,Number_of_Vehicles_scaled,Driver_Alcohol_scaled,Driver_Age_scaled,Time_of_Day_encoded,Weather_Foggy,Weather_Rainy,Weather_Snowy,Weather_Stormy,...,Road_Type_Mountain Road,Road_Type_Rural Road,Road_Condition_Icy,Road_Condition_Under Construction,Road_Condition_Wet,Vehicle_Type_Car,Vehicle_Type_Motorcycle,Vehicle_Type_Truck,Road_Light_Condition_Daylight,Road_Light_Condition_No Light
0,-0.014732,0.943602,0.874829,-0.420703,0.509335,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,-0.013104,1.583087,-0.142088,-0.420703,0.373888,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.014732,-0.335369,0.36637,-0.420703,0.712504,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.284294,-0.335369,-0.142088,-0.420703,-0.64196,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.014732,3.981158,3.925579,-0.420703,1.25429,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.25, random_state=42, stratify=y
)

# # If you want to save train and test sets separately
# train_df = pd.DataFrame(X_train, columns=feature_names)
# train_df['Accident'] = y_train.values
# train_df.to_csv('train_data.csv', index=False)

# test_df = pd.DataFrame(X_test, columns=feature_names)
# test_df['Accident'] = y_test.values
# test_df.to_csv('test_data.csv', index=False)

print("Train data shape:", X_train.shape)
print("Test data shape:", X_test.shape)
# print("Train-test sets saved to 'train_data.csv' and 'test_data.csv'")

# Summary of processed data
print("\nSummary of processed features:")
print(f"- {len(numerical_features)} numerical features (standardized)")
print(f"- {len(ordinal_features)} ordinal features (encoded)")
print(f"- {len(feature_names) - len(numerical_features) - len(ordinal_features)} one-hot encoded columns from {len(nominal_features)} nominal features")
print(f"Total feature count: {len(feature_names)}")

Train data shape: (598, 21)
Test data shape: (200, 21)

Summary of processed features:
- 5 numerical features (standardized)
- 1 ordinal features (encoded)
- 15 one-hot encoded columns from 5 nominal features
Total feature count: 21


In [13]:
X_train

array([[ 1.28429411,  0.9436017 , -1.15900482, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01473227,  0.30411622,  0.87482882, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01473227, -1.29459749, -0.142088  , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.31375865, -0.33536927,  0.87482882, ...,  0.        ,
         0.        ,  0.        ],
       [-1.31375865, -0.33536927, -1.15900482, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.28429411,  0.9436017 , -0.142088  , ...,  0.        ,
         0.        ,  0.        ]])

## Model Training And Model Selection

In [15]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [16]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor":AdaBoostRegressor(),
    "Graident BoostRegressor":GradientBoostingRegressor(),
    "Xgboost Regressor":XGBRegressor()
   
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.4492
- Mean Absolute Error: 0.4036
- R2 Score: 0.0378
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4615
- Mean Absolute Error: 0.4125
- R2 Score: -0.0143


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.4580
- Mean Absolute Error: 0.4195
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4583
- Mean Absolute Error: 0.4197
- R2 Score: -0.0000


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.4492
- Mean Absolute Error: 0.4041
- R2 Score: 0.0377
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4598
- Mean Absolute Error: 0.4115
- R2 Score: -0.0069


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.4115
- Mean Absolute Error: 0.3448
- R2 Score: 0.1928
--------------------

In [17]:
#Initialize few parameter for Hyperparamter tuning

rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}