In [1]:
# generic data science libraries
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import scikit-learn
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [4]:
# Load the Dallas accidents extract from the sibling data directory.
# Forward slashes keep the relative path portable: the previous
# "..\data\..." literal only resolved on Windows and depended on "\d" / "\D"
# being unrecognised escape sequences, which newer Python versions warn about.
df = pd.read_csv("../data/DallasAccidents.csv")
# Lift the column display cap so wide frames are shown without elision.
pd.set_option('display.max_columns', None)
# Preview the first rows as the cell's output.
df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,Number,Street,Side,City,County,State,Zipcode,Country,Timezone,Airport_Code,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-261014,MapQuest,201.0,2,2016-11-30 16:10:04,2016-11-30 17:25:00,32.662193,-96.943153,,,0.01,Accident on Camp Wisdom Rd at Clark Rd.,,Clark Rd,R,Dallas,Dallas,TX,75249,US,US/Central,KRBD,2016-11-30 15:53:00,60.1,,24.0,30.0,10.0,Variable,5.8,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
1,A-261015,MapQuest,201.0,3,2016-11-30 16:05:32,2016-11-30 17:24:00,32.77879,-96.782021,,,0.01,#2 / #3 lane blocked due to accident on I-30 E...,,US-75 S,R,Dallas,Dallas,TX,75226,US,US/Central,KDAL,2016-11-30 15:53:00,61.0,,22.0,30.01,10.0,NNW,4.6,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
2,A-261016,MapQuest,201.0,2,2016-11-30 16:10:46,2016-11-30 17:27:00,32.724277,-96.762245,,,0.0,Accident on Julius Schepps Fwy Northbound at O...,,I-45 S,R,Dallas,Dallas,TX,75215,US,US/Central,KDAL,2016-11-30 15:53:00,61.0,,22.0,30.01,10.0,NNW,4.6,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
3,A-261017,MapQuest,201.0,2,2016-11-30 15:45:59,2016-11-30 17:18:00,32.708355,-96.700043,,,0.0,Accident on Jim Miller Rd between Carter Rd an...,500.0,S Jim Miller Rd,L,Dallas,Dallas,TX,75217,US,US/Central,KHQZ,2016-11-30 15:50:00,59.0,,23.0,30.02,10.0,WNW,10.4,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
4,A-261018,MapQuest,201.0,3,2016-11-30 16:06:04,2016-11-30 17:20:42,32.864021,-96.66114,,,0.01,HOV lane blocked due to accident on I-635 East...,,Northwest Hwy,R,Dallas,Dallas,TX,75228,US,US/Central,KDAL,2016-11-30 15:53:00,61.0,,22.0,30.01,10.0,NNW,4.6,,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


In [6]:
# Print a structural summary of the frame: row count, and for each column its
# non-null count and dtype (a quick way to see which columns have missing data).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90436 entries, 0 to 90435
Data columns (total 49 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     90436 non-null  object 
 1   Source                 90436 non-null  object 
 2   TMC                    66866 non-null  float64
 3   Severity               90436 non-null  int64  
 4   Start_Time             90436 non-null  object 
 5   End_Time               90436 non-null  object 
 6   Start_Lat              90436 non-null  float64
 7   Start_Lng              90436 non-null  float64
 8   End_Lat                23570 non-null  float64
 9   End_Lng                23570 non-null  float64
 10  Distance(mi)           90436 non-null  float64
 11  Description            90436 non-null  object 
 12  Number                 23271 non-null  float64
 13  Street                 90436 non-null  object 
 14  Side                   90436 non-null  object 
 15  Ci

In [10]:
# Display the column labels of the loaded frame for reference.
df.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [11]:
# Count distinct values per column; columns with a single distinct value
# carry no information, while very high counts indicate identifier-like fields.
df.nunique()

ID                       90436
Source                       3
TMC                         18
Severity                     4
Start_Time               86380
End_Time                 87018
Start_Lat                26178
Start_Lng                23612
End_Lat                   8328
End_Lng                   8326
Distance(mi)              2303
Description              48894
Number                    5199
Street                    3957
Side                         2
City                        26
County                       1
State                        1
Zipcode                   5999
Country                      1
Timezone                     1
Airport_Code                 8
Weather_Timestamp        43304
Temperature(F)             478
Wind_Chill(F)              409
Humidity(%)                 94
Pressure(in)               209
Visibility(mi)              34
Wind_Direction              24
Wind_Speed(mph)             64
Precipitation(in)           98
Weather_Condition           58
Amenity 

In [None]:
# Low-cardinality columns collected for later encoding.
# NOTE(review): the name suggests these feed a dummy/one-hot encoding step
# (e.g. pd.get_dummies) — confirm against the cells that consume this list.
obj_cols_to_dum = [
    # road-feature flags near the accident location
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction',
    'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
    'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
    # side of the street
    'Side',
    # day/night indicators
    'Sunrise_Sunset', 'Civil_Twilight',
    'Nautical_Twilight', 'Astronomical_Twilight',
]