In [64]:
import pandas as pd
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [65]:
# Read the raw traffic-accident dataset from the CSV file next to the notebook.
csv_path = 'dataset_traffic_accident_prediction1.csv'
df = pd.read_csv(csv_path)

In [66]:
# Summarise column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Weather               798 non-null    object 
 1   Road_Type             798 non-null    object 
 2   Time_of_Day           798 non-null    object 
 3   Traffic_Density       798 non-null    float64
 4   Speed_Limit           798 non-null    float64
 5   Number_of_Vehicles    798 non-null    float64
 6   Driver_Alcohol        798 non-null    float64
 7   Accident_Severity     798 non-null    object 
 8   Road_Condition        798 non-null    object 
 9   Vehicle_Type          798 non-null    object 
 10  Driver_Age            798 non-null    float64
 11  Driver_Experience     798 non-null    float64
 12  Road_Light_Condition  798 non-null    object 
 13  Accident              798 non-null    float64
dtypes: float64(7), object(7)
memory usage: 92.0+ KB


In [67]:
# Count the missing values in every column of the raw frame.
df.isna().sum()

Weather                 42
Road_Type               42
Time_of_Day             42
Traffic_Density         42
Speed_Limit             42
Number_of_Vehicles      42
Driver_Alcohol          42
Accident_Severity       42
Road_Condition          42
Vehicle_Type            42
Driver_Age              42
Driver_Experience       42
Road_Light_Condition    42
Accident                42
dtype: int64

In [68]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition,Accident
0,Rainy,City Road,Morning,1.0,100.0,5.0,0.0,,Wet,Car,51.0,48.0,Artificial Light,0.0
1,Clear,Rural Road,Night,,120.0,3.0,0.0,Moderate,Wet,Truck,49.0,43.0,Artificial Light,0.0
2,Rainy,Highway,Evening,1.0,60.0,4.0,0.0,Low,Icy,Car,54.0,52.0,Artificial Light,0.0
3,Clear,City Road,Afternoon,2.0,60.0,3.0,0.0,Low,Under Construction,Bus,34.0,31.0,Daylight,0.0
4,Rainy,Highway,Morning,1.0,195.0,11.0,0.0,Low,Dry,Car,62.0,55.0,Artificial Light,1.0


In [69]:
# "Accident" is the target column: a row without a label cannot be used for
# supervised training or evaluation, so keep only the rows where it is present.
df = df.loc[df["Accident"].notna()]

In [70]:
# Preview the first rows again now that rows with a missing target are dropped.
df.head()

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition,Accident
0,Rainy,City Road,Morning,1.0,100.0,5.0,0.0,,Wet,Car,51.0,48.0,Artificial Light,0.0
1,Clear,Rural Road,Night,,120.0,3.0,0.0,Moderate,Wet,Truck,49.0,43.0,Artificial Light,0.0
2,Rainy,Highway,Evening,1.0,60.0,4.0,0.0,Low,Icy,Car,54.0,52.0,Artificial Light,0.0
3,Clear,City Road,Afternoon,2.0,60.0,3.0,0.0,Low,Under Construction,Bus,34.0,31.0,Daylight,0.0
4,Rainy,Highway,Morning,1.0,195.0,11.0,0.0,Low,Dry,Car,62.0,55.0,Artificial Light,1.0


In [71]:
# Separate the label from the predictors: y is the "Accident" column and
# X is every other column.
y = df['Accident']
X = df.drop(columns='Accident')

In [72]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [73]:
# Categorical (object-dtype) columns of the training split; their missing
# values are to be filled with each column's most frequent category.
obj_cols = xtrain.select_dtypes(include=["object"])

In [74]:
# Fill missing categorical values with each column's most frequent category.
# The imputer is fitted on the training columns only, so the fill values it
# learns do not depend on the test split.
impute = SimpleImputer(strategy='most_frequent')

# train_test_split returns a selection taken from X. Work on an explicit copy
# so the column assignment below writes to an independent DataFrame instead of
# triggering pandas' SettingWithCopyWarning.
xtrain = xtrain.copy()
xtrain[obj_cols.columns] = impute.fit_transform(obj_cols)

# Alternative for a column that is more important and has many missing values:
# fill with an explicit placeholder instead of the mode, e.g.
# SimpleImputer(strategy='constant', fill_value="N/A").

In [82]:
# Count the missing values left in each column of the training split.
xtrain.isna().sum()

Weather                  0
Road_Type                0
Time_of_Day              0
Traffic_Density         36
Speed_Limit             33
Number_of_Vehicles      31
Driver_Alcohol          34
Accident_Severity        0
Road_Condition           0
Vehicle_Type             0
Driver_Age              31
Driver_Experience       34
Road_Light_Condition     0
dtype: int64

In [None]:
# impute.get_feature_names_out()

In [76]:
# objdf

In [78]:
# Show the fill value the fitted imputer learned for each column
# (one entry per fitted column, in fit order).
impute.statistics_

array(['Clear', 'Highway', 'Afternoon', 'Low', 'Dry', 'Car',
       'Artificial Light'], dtype=object)

In [80]:
# Apply the already-fitted imputer to the test split. Only transform() is
# called here, so the fill values come from the earlier fit and nothing is
# re-learned from the test rows.
# Copy first: xtest is a selection returned by train_test_split, and assigning
# columns into it directly can trigger pandas' SettingWithCopyWarning.
xtest = xtest.copy()
xtest[obj_cols.columns] = impute.transform(xtest[obj_cols.columns])

In [81]:
# Inspect the test split after categorical imputation. Only the object-dtype
# columns were filled above, so numeric columns can still contain NaN.
xtest

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition
726,Rainy,City Road,Morning,2.0,50.0,4.0,0.0,Low,Dry,Car,58.0,58.0,Daylight
696,Clear,Highway,Morning,2.0,60.0,5.0,0.0,Low,Wet,Motorcycle,42.0,39.0,Daylight
65,Clear,Highway,Afternoon,1.0,80.0,3.0,1.0,Low,Dry,Car,56.0,48.0,No Light
558,Snowy,Highway,Night,0.0,100.0,2.0,0.0,Moderate,Dry,Car,40.0,40.0,Daylight
68,Clear,Highway,Afternoon,2.0,80.0,3.0,0.0,Low,Dry,Car,52.0,43.0,Artificial Light
...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,Clear,City Road,Morning,1.0,100.0,4.0,,Low,Icy,Car,53.0,48.0,Artificial Light
838,Foggy,Highway,Afternoon,2.0,60.0,3.0,0.0,Low,Dry,Car,25.0,19.0,Artificial Light
782,Clear,Rural Road,Afternoon,0.0,80.0,5.0,0.0,Moderate,Icy,Car,,,No Light
538,Clear,City Road,Morning,1.0,60.0,3.0,0.0,Moderate,Dry,Truck,23.0,14.0,Daylight


# Next step: one-hot encode the categorical columns

# After that: column transformer, pipeline, SVM