In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


In [2]:
# Read the raw accident records from the CSV file into a DataFrame.
source_csv = 'columbus_accidents_regions 2.csv'
data_df = pd.read_csv(source_csv)

In [3]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
numeric_summary = data_df.describe()
display(numeric_summary)

Unnamed: 0.1,Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Centroid_Lat,Centroid_Lng
count,22216.0,22216.0,22216.0,22216.0,8142.0,8142.0,22216.0,22159.0,16194.0,22058.0,22169.0,22142.0,21018.0,13988.0,22216.0,22216.0
mean,2937866.0,2.450126,39.807078,-83.691153,39.816264,-84.847632,0.570461,51.816142,43.649846,71.960831,29.475774,8.550788,8.730988,0.013672,39.807166,-83.691575
std,2254165.0,0.595986,1.758665,4.395942,2.508214,6.862262,1.414307,20.472849,24.807579,20.003013,0.708494,2.678965,5.088667,0.065392,1.758717,4.396167
min,10.0,1.0,29.663366,-109.424154,29.691592,-109.671261,0.0,-20.0,-41.0,8.0,20.06,0.0,0.0,0.0,29.667046,-109.421361
25%,995392.2,2.0,39.943413,-83.036194,39.94182,-83.09561,0.0,36.0,26.0,57.0,29.1,8.0,5.0,0.0,39.943896,-83.040012
50%,2474712.0,2.0,39.975233,-82.991562,39.976735,-83.000197,0.01,52.0,41.0,75.0,29.44,10.0,8.0,0.0,39.977,-82.990403
75%,4354020.0,3.0,40.036819,-82.934802,40.059882,-82.938752,0.531,69.0,64.0,89.0,30.02,10.0,12.0,0.0,40.038969,-82.934428
max,7727027.0,4.0,48.822446,-74.653181,48.836445,-74.653181,40.476,99.0,99.0,100.0,30.76,19.0,37.0,2.27,48.812377,-74.640387


In [4]:
# Column labels whose dtype is 'object' or 'category', i.e. the non-numeric text-like fields.
text_like_dtypes = ['object', 'category']
categorical_columns = data_df.select_dtypes(include=text_like_dtypes).columns

In [5]:
# Show the labels of the object/category columns selected in the previous cell.
display(categorical_columns)

Index(['ID', 'Source', 'Start_Time', 'End_Time', 'Description', 'Street',
       'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Wind_Direction',
       'Weather_Condition', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight', 'Region', 'Neighbor_1',
       'Neighbor_2', 'Neighbor_3', 'Neighbor_4', 'Neighbor_5', 'Neighbor_6',
       'Neighbor_7'],
      dtype='object')

In [13]:
# Columns removed before imputation. Per the describe() output above,
# 'Precipitation(in)', 'Wind_Chill(F)', 'End_Lat' and 'End_Lng' have far fewer
# non-null values than the 22216 rows in the frame; 'Airport_Code' is dropped as well.
columns_to_remove = ['Precipitation(in)', 'Wind_Chill(F)', 'End_Lat', 'End_Lng', 'Airport_Code']

# Rebind instead of dropping in place, and ignore labels that are already gone,
# so that re-running this cell does not raise KeyError on the second execution.
# Trade-off: errors='ignore' also silences misspelled labels in the list above.
data_df = data_df.drop(columns=columns_to_remove, errors='ignore')

In [14]:
# Numeric weather measurements whose missing entries are imputed below.
columns_to_fill = [
    'Wind_Speed(mph)',
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
]

# Compute every column's mean once, then fill each column's gaps with its own mean.
column_means = {name: data_df[name].mean() for name in columns_to_fill}
data_df[columns_to_fill] = data_df[columns_to_fill].fillna(value=column_means)

In [15]:
# Non-numeric columns whose missing entries are imputed below.
categorical_columns_to_fill = [
    'Weather_Timestamp', 'Wind_Direction', 'Weather_Condition',
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
    'Astronomical_Twilight', 'Street',
]

# Fill each column's gaps with its most frequent value.
for name in categorical_columns_to_fill:
    series = data_df[name]
    # mode() yields a Series of the most common value(s); label 0 is the first of them.
    most_frequent = series.mode()[0]
    data_df[name] = series.fillna(most_frequent)

In [16]:
# Split the columns by dtype: every non-numeric column is treated as categorical,
# every numeric column is treated as a numeric feature.
categorical_features = data_df.select_dtypes(exclude=[np.number]).columns.tolist()
numeric_features = data_df.select_dtypes(include=[np.number]).columns.tolist()

# NOTE(review): the non-numeric set includes identifier / free-text / timestamp
# columns such as 'ID', 'Description' and 'Start_Time'; one-hot encoding them yields
# a very wide sparse matrix. The numeric set likewise includes 'Unnamed: 0' and
# 'Severity'. TODO confirm whether these should be excluded before modelling.

# Standardize numeric features with StandardScaler and one-hot encode categorical ones.
# handle_unknown='ignore' lets the fitted preprocessor transform data containing
# categories not seen during fit (encoded as all zeros) instead of raising; the
# result of fit_transform on the fitting data is unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)



In [22]:
# Fit the ColumnTransformer on the cleaned frame and transform it in one step;
# the captured output below shows the result is a sparse matrix.
data_df_transformed = preprocessor.fit_transform(data_df)
display(data_df_transformed)


<22216x100151 sparse matrix of type '<class 'numpy.float64'>'
	with 1155111 stored elements in Compressed Sparse Row format>

In [23]:

# Persist the cleaned (imputed, not encoded) DataFrame without the index column.
output_csv = 'cleaned_data.csv'
data_df.to_csv(output_csv, index=False)