In [41]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df = pd.read_csv('US_Accidents_March23.csv', nrows = 5000000)

In [42]:
df.drop(['ID','Source','Start_Time','End_Time','Start_Lat','Start_Lng','End_Lat','End_Lng',
        'Description','Street','Country','Timezone','Airport_Code','Weather_Timestamp','Turning_Loop'], axis=1, inplace=True)

In [43]:
df.sample(5)

Unnamed: 0,Severity,Distance(mi),City,County,State,Zipcode,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
4621947,2,1.509,Los Angeles,Los Angeles,CA,90045,43.0,43.0,38.0,30.11,10.0,E,3.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,Night,Day,Day,Day
2514968,2,0.0,Baton Rouge,East Baton Rouge,LA,70802,66.9,,93.0,30.17,4.0,NNE,8.1,,Overcast,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
884366,3,0.0,Atlanta,Fulton,GA,30318,74.0,74.0,97.0,29.2,7.0,CALM,0.0,0.02,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night
4609382,2,0.113,Stanton,Orange,CA,90680-2335,55.0,55.0,31.0,30.17,10.0,CALM,0.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
2901525,2,0.0,Lafayette,Lafayette,LA,70501-2839,59.0,,69.0,29.98,10.0,NW,12.7,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 31 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Severity               int64  
 1   Distance(mi)           float64
 2   City                   object 
 3   County                 object 
 4   State                  object 
 5   Zipcode                object 
 6   Temperature(F)         float64
 7   Wind_Chill(F)          float64
 8   Humidity(%)            float64
 9   Pressure(in)           float64
 10  Visibility(mi)         float64
 11  Wind_Direction         object 
 12  Wind_Speed(mph)        float64
 13  Precipitation(in)      float64
 14  Weather_Condition      object 
 15  Amenity                bool   
 16  Bump                   bool   
 17  Crossing               bool   
 18  Give_Way               bool   
 19  Junction               bool   
 20  No_Exit                bool   
 21  Railway                bool   
 22  Roundabout        

In [45]:
## Separating into cat, num, bool variables

df_cat = [i for i in df.columns if df[i].dtype=='object']
df_num = [i for i in df.columns if df[i].dtype!='object' and df[i].dtype != 'bool' and i!='Severity']
df_bool = [i for i in df.columns if df[i].dtype == 'bool']

In [46]:
df_cat

['City',
 'County',
 'State',
 'Zipcode',
 'Wind_Direction',
 'Weather_Condition',
 'Sunrise_Sunset',
 'Civil_Twilight',
 'Nautical_Twilight',
 'Astronomical_Twilight']

In [47]:
df_num

['Distance(mi)',
 'Temperature(F)',
 'Wind_Chill(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Speed(mph)',
 'Precipitation(in)']

In [48]:
df_bool

['Amenity',
 'Bump',
 'Crossing',
 'Give_Way',
 'Junction',
 'No_Exit',
 'Railway',
 'Roundabout',
 'Station',
 'Stop',
 'Traffic_Calming',
 'Traffic_Signal']

In [49]:
## Missing values in int

missing_percent_int = df[df_num].isnull().mean() * 100

missing_data_int = missing_percent_int[missing_percent_int > 0]
missing_data_int = missing_data_int.sort_values(ascending=False)

In [50]:
missing_data_int

Precipitation(in)    36.80616
Wind_Chill(F)        34.15814
Wind_Speed(mph)       8.84290
Visibility(mi)        2.11220
Humidity(%)           2.00596
Temperature(F)        1.88228
Pressure(in)          1.61346
dtype: float64

In [51]:
## Missing values in categorical

missing_percent_obj = df[df_cat].isnull().mean() * 100

missing_data_obj = missing_percent_obj[missing_percent_obj > 0]
missing_data_obj = missing_percent_obj.sort_values(ascending=False)

In [52]:
missing_data_obj

Weather_Condition        2.05366
Wind_Direction           1.93016
Sunrise_Sunset           0.26266
Civil_Twilight           0.26266
Nautical_Twilight        0.26266
Astronomical_Twilight    0.26266
Zipcode                  0.01688
City                     0.00272
County                   0.00000
State                    0.00000
dtype: float64

In [53]:
## Missing values in categorical

missing_percent_bool = df[df_bool].isnull().mean() * 100

missing_percent_bool = missing_percent_bool[missing_percent_bool > 0]
missing_percent_bool = missing_percent_bool.sort_values(ascending=False)

In [54]:
missing_percent_bool

Series([], dtype: float64)

In [55]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [56]:
df[df_cat].head()

Unnamed: 0,City,County,State,Zipcode,Wind_Direction,Weather_Condition,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,Dayton,Montgomery,OH,45424,Calm,Light Rain,Night,Night,Night,Night
1,Reynoldsburg,Franklin,OH,43068-3402,Calm,Light Rain,Night,Night,Night,Day
2,Williamsburg,Clermont,OH,45176,SW,Overcast,Night,Night,Day,Day
3,Dayton,Montgomery,OH,45417,SW,Mostly Cloudy,Night,Day,Day,Day
4,Dayton,Montgomery,OH,45459,SW,Mostly Cloudy,Day,Day,Day,Day


In [57]:
df[df_num].head()

Unnamed: 0,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
0,0.01,36.9,,91.0,29.68,10.0,,0.02
1,0.01,37.9,,100.0,29.65,10.0,,0.0
2,0.01,36.0,33.3,100.0,29.67,10.0,3.5,
3,0.01,35.1,31.0,96.0,29.64,9.0,4.6,
4,0.01,36.0,33.3,89.0,29.65,6.0,3.5,


In [58]:
df[df_bool].head()

Unnamed: 0,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True


In [59]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """
        Custom transformer to perform frequency encoding on columns.
        """
        self.frequency_maps = {}

    def fit(self, X, y=None):
        """
        Compute frequency maps for each column.
        :param X: Input DataFrame or array.
        :param y: Ignored.
        :return: self
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        for column in X.columns:
            self.frequency_maps[column] = X[column].value_counts().to_dict()
        return self

    def transform(self, X):
        """
        Transform columns by replacing values with their frequency.
        :param X: Input DataFrame or array.
        :return: Transformed DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        X_transformed = X.copy()
        for column in X.columns:
            X_transformed[column] = X_transformed[column].map(self.frequency_maps[column])
        return X_transformed.values  # Return as array for compatibility


In [60]:
# Numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean'))
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('frequency_encode', FrequencyEncoder())
])

# Boolean pipeline
boolean_pipeline = Pipeline(steps=[
    ('binary_encode', OneHotEncoder())  # Binary encoding for boolean values
])

# ColumnTransformer combining all pipelines
preprocessor = ColumnTransformer(transformers=[
    ('numerical_pipeline', numerical_pipeline, df_num),   # Replace df_num with actual numerical column names
    ('categorical_pipeline', categorical_pipeline, df_cat),  # Replace df_cat with actual categorical column names
    ('boolean_pipeline', boolean_pipeline, df_bool)  # Replace df_bool with actual boolean column names
])

In [61]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2)

In [62]:
y_train = df_train[['Severity']]
X_train = df_train.drop(['Severity'],axis=1)

y_test = df_test[['Severity']]
X_test = df_test.drop(['Severity'],axis=1)

In [63]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [64]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

In [65]:
dt.fit(X_train, y_train)

In [66]:
y_pred = dt.predict(X_test)

In [67]:
from sklearn.metrics import recall_score, classification_report

In [68]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.14      0.15      0.14      7857
           2       0.88      0.86      0.87    740435
           3       0.64      0.70      0.67    235719
           4       0.19      0.19      0.19     15989

    accuracy                           0.80   1000000
   macro avg       0.46      0.47      0.47   1000000
weighted avg       0.81      0.80      0.80   1000000

