Group members: Rayyan Khalil, Ayal Mashiack

Importing the dataset:


1.   Create a Kaggle account or Log in if you already have one
2.   Click the following link, this will take you to the full dataset's page on Kaggle: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents/data
3. Scroll down to the "Sampled Data (New!)" section, and follow the link present in that section, the link will take you to a google drive folder containing the sampled dataset we are using for our project
4. Download the sampled dataset named "US_Accidents_March23_sampled_500k.csv"
5. Ensure that the dataset csv file is placed in the same directory that the code is running from

** the dataset's csv file will also be attached with the final submission of the project





In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Assuming the dataset contains a file named 'train.csv'
# dataset = pd.read_csv('US_Accidents_March23.csv')
# print(dataset.head())
df = pd.read_csv('US_Accidents_March23_sampled_500k.csv', nrows=20000)
df.head()


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-2047758,Source2,2,2019-06-12 10:10:56,2019-06-12 10:55:58,30.641211,-91.153481,,,0.0,...,False,False,False,False,True,False,Day,Day,Day,Day
1,A-4694324,Source1,2,2022-12-03 23:37:14.000000000,2022-12-04 01:56:53.000000000,38.990562,-77.39907,38.990037,-77.398282,0.056,...,False,False,False,False,False,False,Night,Night,Night,Night
2,A-5006183,Source1,2,2022-08-20 13:13:00.000000000,2022-08-20 15:22:45.000000000,34.661189,-120.492822,34.661189,-120.492442,0.022,...,False,False,False,False,True,False,Day,Day,Day,Day
3,A-4237356,Source1,2,2022-02-21 17:43:04,2022-02-21 19:43:23,43.680592,-92.993317,43.680574,-92.972223,1.054,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-6690583,Source1,2,2020-12-04 01:46:00,2020-12-04 04:13:09,35.395484,-118.985176,35.395476,-118.985995,0.046,...,False,False,False,False,False,False,Night,Night,Night,Night


In [None]:
# finding all the columns from the dataset
df.columns

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

Since we are looking to rate severity of crashed based on the other factors in the accident we will save the severity column in a separate sample, then remove it from the main sample

In [None]:
# Cleaning out the dataset and removing irrelevant columns

columns = ['Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Wind_Direction',
           'Temperature(F)', 'Visibility(mi)', 'Wind_Speed(mph)', # 'Weather_Condition',
           'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout',
           'Severity']
df_sample = df[columns]
df_sample.info()
df_sample.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Start_Lat        20000 non-null  float64
 1   Start_Lng        20000 non-null  float64
 2   End_Lat          11074 non-null  float64
 3   End_Lng          11074 non-null  float64
 4   Distance(mi)     20000 non-null  float64
 5   Wind_Direction   19573 non-null  object 
 6   Temperature(F)   19585 non-null  float64
 7   Visibility(mi)   19562 non-null  float64
 8   Wind_Speed(mph)  18536 non-null  float64
 9   Bump             20000 non-null  bool   
 10  Crossing         20000 non-null  bool   
 11  Give_Way         20000 non-null  bool   
 12  Junction         20000 non-null  bool   
 13  No_Exit          20000 non-null  bool   
 14  Railway          20000 non-null  bool   
 15  Roundabout       20000 non-null  bool   
 16  Severity         20000 non-null  int64  
dtypes: bool(7), 

Unnamed: 0,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Wind_Direction,Temperature(F),Visibility(mi),Wind_Speed(mph),Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Severity
0,30.641211,-91.153481,,,0.0,NW,77.0,10.0,5.0,False,False,False,False,False,False,False,2
1,38.990562,-77.39907,38.990037,-77.398282,0.056,W,45.0,10.0,5.0,False,False,False,False,False,False,False,2
2,34.661189,-120.492822,34.661189,-120.492442,0.022,W,68.0,10.0,13.0,False,False,False,False,False,False,False,2
3,43.680592,-92.993317,43.680574,-92.972223,1.054,ENE,27.0,10.0,15.0,False,False,False,False,False,False,False,2
4,35.395484,-118.985176,35.395476,-118.985995,0.046,CALM,42.0,10.0,0.0,False,False,False,False,False,False,False,2


In [None]:
# look into the number of nulls we have (sort the values to have the emptiest column on top)
nulls = df_sample.isnull().sum().sort_values(ascending=False)
nulls

End_Lat            8926
End_Lng            8926
Wind_Speed(mph)    1464
Visibility(mi)      438
Wind_Direction      427
Temperature(F)      415
Give_Way              0
Roundabout            0
Railway               0
No_Exit               0
Junction              0
Start_Lat             0
Crossing              0
Bump                  0
Start_Lng             0
Distance(mi)          0
Severity              0
dtype: int64

In [None]:
  # replace null values in End_Lat and End_Lng
# with values from the Start_Lat and Start_Lng

df_sample['End_Lat'] = df_sample['End_Lat'].fillna(df_sample['Start_Lat'])

df_sample['End_Lng'] = df_sample['End_Lng'].fillna(df_sample['Start_Lng'])

ws_val_sum = df_sample['Wind_Speed(mph)'].sum()
df_sample['Wind_Speed(mph)'].fillna(ws_val_sum.mean(), inplace=True)

vs_val_sum = df_sample['Visibility(mi)'].sum()
df_sample['Visibility(mi)'].fillna(vs_val_sum.mean(), inplace=True)
# df_sample['Visibility(mi)'] = df_sample['Visibility(mi)'].fillna((df_sample['Visibility(mi)'].isnull().sum()).mean())

tp_val_sum = df_sample['Temperature(F)'].sum()
df_sample['Temperature(F)'].fillna(tp_val_sum.mean(), inplace=True)
# df_sample['Temperature(F)'] = df_sample['Temperature(F)'].fillna((df_sample['Temperature(F)'].isnull().sum()).mean())

df_sample.fillna(method='ffill')

# print new count of nulls to make sure
# that values are assigned correctly

nulls2 = df_sample.isnull().sum().sort_values(ascending=False)
nulls2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['End_Lat'] = df_sample['End_Lat'].fillna(df_sample['Start_Lat'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['End_Lng'] = df_sample['End_Lng'].fillna(df_sample['Start_Lng'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['Wind_Speed(mph)'].fillna(ws_val_sum.mean(), inplace=True)
A value is trying to be set on a copy of a 

Wind_Direction     427
Start_Lat            0
Bump                 0
Roundabout           0
Railway              0
No_Exit              0
Junction             0
Give_Way             0
Crossing             0
Wind_Speed(mph)      0
Start_Lng            0
Visibility(mi)       0
Temperature(F)       0
Distance(mi)         0
End_Lng              0
End_Lat              0
Severity             0
dtype: int64

In [None]:
# Combine the text of the specified columns into a single string for each row
combined_text = df_sample['Wind_Direction'].fillna('')

df_sample = df_sample.drop('Wind_Direction', axis=1)

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the data
bow_matrix = vectorizer.fit_transform(combined_text)
# bow_matrix
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

df_sample = pd.concat([df_sample, bow_df.reset_index(drop=True)], axis=1)

In [None]:
nulls3 = df_sample.isnull().sum().sort_values(ascending=False)
nulls3

Start_Lat          0
south              0
ne                 0
nne                0
nnw                0
north              0
nw                 0
se                 0
sse                0
ene                0
ssw                0
sw                 0
var                0
variable           0
west               0
wnw                0
ese                0
east               0
Start_Lng          0
Bump               0
End_Lat            0
End_Lng            0
Distance(mi)       0
Temperature(F)     0
Visibility(mi)     0
Wind_Speed(mph)    0
Crossing           0
calm               0
Give_Way           0
Junction           0
No_Exit            0
Railway            0
Roundabout         0
Severity           0
wsw                0
dtype: int64

In [None]:
# testing to make sure this is correct
df_sample
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Start_Lat        20000 non-null  float64
 1   Start_Lng        20000 non-null  float64
 2   End_Lat          20000 non-null  float64
 3   End_Lng          20000 non-null  float64
 4   Distance(mi)     20000 non-null  float64
 5   Temperature(F)   20000 non-null  float64
 6   Visibility(mi)   20000 non-null  float64
 7   Wind_Speed(mph)  20000 non-null  float64
 8   Bump             20000 non-null  bool   
 9   Crossing         20000 non-null  bool   
 10  Give_Way         20000 non-null  bool   
 11  Junction         20000 non-null  bool   
 12  No_Exit          20000 non-null  bool   
 13  Railway          20000 non-null  bool   
 14  Roundabout       20000 non-null  bool   
 15  Severity         20000 non-null  int64  
 16  calm             20000 non-null  int64  
 17  east        

In [None]:
# separating the training sample (X) and the result to be compared to (y)
X = df_sample.drop('Severity', axis=1)
y = df_sample['Severity']

# we split the training and testing using the train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

the 3 models chosen

In [None]:
# Initialize
linear_model = LinearRegression()
random_forest = RandomForestRegressor(random_state=42)
svr = SVR()

In [None]:
# scaling the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fitting
linear_model.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
svr.fit(X_train, y_train)

# Make predictions without
y_pred_linear_model = linear_model.predict(X_test)
y_pred_random_forest = random_forest.predict(X_test)
y_pred_svr = svr.predict(X_test)

In [None]:
mae_linear_model = mean_absolute_error(y_test, y_pred_linear_model)
mae_random_forest = mean_absolute_error(y_test, y_pred_random_forest)
mae_svr = mean_absolute_error(y_test, y_pred_svr)

# Evaluate the model
print(f"Mean absolute Error linear model: {mae_linear_model}")
print(f"Mean absolute Error random forest: {mae_random_forest}")
print(f"Mean absolute Error svr: {mae_svr}")

Mean absolute Error linear model: 0.3434798732789988
Mean absolute Error random forest: 0.2965932777777778
Mean absolute Error svr: 0.28795059211708574


In [None]:
r2s_linear_model = r2_score(y_test, y_pred_linear_model)
r2s_random_forest = r2_score(y_test, y_pred_random_forest)
r2s_svr = r2_score(y_test, y_pred_svr)

# Evaluate the model
print(f"r2 score linear model: {r2s_linear_model}")
print(f"r2 score random forest: {r2s_random_forest}")
print(f"r2 score svr: {r2s_svr}")

r2 score linear model: 0.034483870693422625
r2 score random forest: 0.11417164984027461
r2 score svr: -0.04419876239717979


In [None]:
mse_linear_model = mean_squared_error(y_test, y_pred_linear_model)
mse_random_forest = mean_squared_error(y_test, y_pred_random_forest)
mse_svr = mean_squared_error(y_test, y_pred_svr)

# Evaluate the model
print(f"Mean Squared Error linear model: {mse_linear_model}")
print(f"Mean Squared Error random forest: {mse_random_forest}")
print(f"Mean Squared Error Error svr: {mse_svr}")

Mean Squared Error linear model: 0.22217360234000497
Mean Squared Error random forest: 0.20383675594444448
Mean Squared Error Error svr: 0.2402791559446773
