In [1]:
import pandas as pd

In [2]:
# Load the accidents table from the project-relative dataset folder.
df = pd.read_csv(filepath_or_buffer='./dataset/cleaned_accidents.csv')

In [3]:
# Print a schema summary of the freshly loaded frame (index range, column
# dtypes and memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7494396 entries, 0 to 7494395
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Severity            float64
 1   Start_Lat           float64
 2   Start_Lng           float64
 3   Distance(mi)        float64
 4   Wind_Direction      object 
 5   Weather_Condition   object 
 6   Amenity             float64
 7   Bump                float64
 8   Crossing            float64
 9   Give_Way            float64
 10  Junction            float64
 11  No_Exit             float64
 12  Railway             float64
 13  Roundabout          float64
 14  Station             float64
 15  Stop                float64
 16  Traffic_Calming     float64
 17  Traffic_Signal      float64
 18  Sunrise_Sunset      object 
 19  Hour                float64
 20  Weekday             float64
 21  Time_Duration(min)  float64
dtypes: float64(19), object(3)
memory usage: 1.2+ GB


In [4]:
# Report the number of missing values per column, one "name count" line each.
# The recorded output shows zero missing values in every column.
for col in df.columns:
    n_missing = df[col].isna().sum()
    print(col, n_missing)

Severity 0
Start_Lat 0
Start_Lng 0
Distance(mi) 0
Wind_Direction 0
Weather_Condition 0
Amenity 0
Bump 0
Crossing 0
Give_Way 0
Junction 0
No_Exit 0
Railway 0
Roundabout 0
Station 0
Stop 0
Traffic_Calming 0
Traffic_Signal 0
Sunrise_Sunset 0
Hour 0
Weekday 0
Time_Duration(min) 0


In [5]:
# Collect the names of all object-dtype columns; these are the features that
# still need to be encoded numerically.
cat_cols = [col for col in df.columns if df[col].dtype == 'object']

In [6]:
# Print the number of distinct values in each categorical column
# (a missing value, if present, counts as one distinct value).
for col in cat_cols:
    n_levels = df[col].nunique(dropna=False)
    print(col, n_levels)

Wind_Direction 24
Weather_Condition 143
Sunrise_Sunset 2


In [7]:
# Frequency encoding for categorical columns
def frequency_encode(df, column):
    """Add a relative-frequency encoding of ``column`` to ``df``.

    Each value in ``column`` is replaced by the fraction of rows that share
    that value, and the result is stored in a new ``"<column>_freq"`` column.

    Note: the new column is written onto the frame that was passed in (the
    input is modified in place); the same frame object is then returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the extra ``"<column>_freq"``
        column.
    """
    source_values = df[column]
    # normalize=True yields proportions rather than raw counts.
    relative_freq = source_values.value_counts(normalize=True)
    encoded_name = f"{column}_freq"
    df[encoded_name] = source_values.map(relative_freq)
    return df

# Frequency-encode the two high-cardinality categorical features, in this
# order, so that Wind_Direction_freq is added before Weather_Condition_freq.
for freq_col in ('Wind_Direction', 'Weather_Condition'):
    df = frequency_encode(df, freq_col)


In [8]:
# One-hot encode Sunrise_Sunset. drop_first=True drops the first level, so the
# two-level feature becomes a single indicator column.
df = pd.get_dummies(
    data=df,
    columns=['Sunrise_Sunset'],
    drop_first=True,
)

In [9]:
# Inspect the schema after encoding; the raw Wind_Direction and
# Weather_Condition object columns are still present at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7494396 entries, 0 to 7494395
Data columns (total 24 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Severity                float64
 1   Start_Lat               float64
 2   Start_Lng               float64
 3   Distance(mi)            float64
 4   Wind_Direction          object 
 5   Weather_Condition       object 
 6   Amenity                 float64
 7   Bump                    float64
 8   Crossing                float64
 9   Give_Way                float64
 10  Junction                float64
 11  No_Exit                 float64
 12  Railway                 float64
 13  Roundabout              float64
 14  Station                 float64
 15  Stop                    float64
 16  Traffic_Calming         float64
 17  Traffic_Signal          float64
 18  Hour                    float64
 19  Weekday                 float64
 20  Time_Duration(min)      float64
 21  Wind_Direction_freq     float64

In [10]:
# Remove the raw text columns now that their *_freq encodings exist.
df = df.drop(columns=["Wind_Direction", "Weather_Condition"])

In [11]:
# Preview the first 10 rows of the encoded frame.
df.head(10)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,...,Station,Stop,Traffic_Calming,Traffic_Signal,Hour,Weekday,Time_Duration(min),Wind_Direction_freq,Weather_Condition_freq,Sunrise_Sunset_Night
0,3.0,39.865147,-84.058723,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,18.0,0.0,360.0,0.048872,0.046817,True
1,2.0,39.928059,-82.831184,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,13.0,6.0,101.5,0.048872,0.046817,True
2,2.0,39.063148,-84.032608,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,6.0,3.0,28.9,0.048346,0.051085,True
3,3.0,39.747753,-84.205582,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,17.0,2.0,29.6,0.048346,0.134886,True
4,2.0,39.627781,-84.188354,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,11.0,3.0,180.0,0.048346,0.134886,False
5,3.0,40.10059,-82.925194,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,6.0,79.12,0.051024,0.046817,False
6,2.0,39.758274,-84.230507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16.0,4.0,29.62,0.046949,0.051085,False
7,3.0,39.770382,-84.194901,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,18.0,5.0,135.55,0.046949,0.051085,False
8,2.0,39.778061,-84.172005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,13.0,6.0,150.5,0.048346,0.134886,False
9,3.0,40.10059,-82.925194,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,13.0,4.0,131.5,0.051024,0.046817,False


In [12]:
# Check the schema once more after dropping the raw categorical columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7494396 entries, 0 to 7494395
Data columns (total 22 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Severity                float64
 1   Start_Lat               float64
 2   Start_Lng               float64
 3   Distance(mi)            float64
 4   Amenity                 float64
 5   Bump                    float64
 6   Crossing                float64
 7   Give_Way                float64
 8   Junction                float64
 9   No_Exit                 float64
 10  Railway                 float64
 11  Roundabout              float64
 12  Station                 float64
 13  Stop                    float64
 14  Traffic_Calming         float64
 15  Traffic_Signal          float64
 16  Hour                    float64
 17  Weekday                 float64
 18  Time_Duration(min)      float64
 19  Wind_Direction_freq     float64
 20  Weather_Condition_freq  float64
 21  Sunrise_Sunset_Night    bool   

In [13]:
# Set up for standardization.
# NOTE(review): `scaler` is instantiated here but is never fit or applied in
# the visible cells, so `df_scaled` is an unscaled copy of `df` despite its
# name. TODO confirm whether scaling is meant to happen here or downstream
# (e.g. after a train/test split).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # created but not used in the visible cells
df_scaled = df.copy()  # copy of `df`; no transformation is applied to it

In [14]:
# Write the encoded frame to CSV; index=False keeps the row index out of the
# file.
df_scaled.to_csv(
    path_or_buf='./dataset/encoded_accidents.csv',
    index=False,
)