In [308]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [258]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [259]:
# df = pd.read_csv('/kaggle/input/is-the-traffic-collision-fatal/Train.csv')
df = pd.read_csv('drive/MyDrive/Datasets/Train.csv')

In [260]:
# displaying maximum columns
pd.set_option('display.max_columns', 200)

# Cleaning for model building

### Converting column names to lower case

In [261]:
df.columns = map(str.lower, df.columns)

In [262]:
df

Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid
0,3387730,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4,43.656345,-79.452490,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,,,South,"Automobile, Station Wagon",Turning Left,Failed to Yield Right of Way,Unknown,,,,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,1
1,3387731,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4,43.656345,-79.452490,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,,North,Other,,,,Vehicle turns left while ped crosses with ROW ...,Crossing with right of way,Unknown,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,2
2,3388101,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25,43.801943,-79.199786,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,,East,Motorcycle,Turning Right,Disobeyed Traffic Control,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,3
3,3389067,893184.0,2006,2006/01/01 05:00:00+00,236,WOODBINE AVE,O CONNOR DR,,Major Arterial,Toronto and East York,19,43.699595,-79.318797,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,,,,,,,,,,,,,,,Yes,,,,,Yes,Yes,Yes,,Yes,,60,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,4
4,3388102,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25,43.801943,-79.199786,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,,,South,"Automobile, Station Wagon",Going Ahead,Driving Properly,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,81474608,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,25 to 29,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14996
14996,81474609,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,0 to 4,Minor,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14997
14997,81474610,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,5 to 9,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14998
14998,81474611,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,5 to 9,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14999


### Displaying number of unique and missing values of all columns

In [263]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

              Feature  Unique Values  Missing Values
0              index_          15000               0
1              accnum           3822            3698
2                year             13               0
3                date           3082               0
4                time           1276               0
5             street1           1547               0
6             street2           2344            1343
7              offset            339           13072
8          road_class              9             357
9            district              4              14
10            wardnum             71               0
11           latitude           3475               0
12          longitude           3901               0
13           loccoord              7              90
14             accloc              9            5450
15           traffctl             10              29
16         visibility              8              14
17              light              9          

In [264]:
df.head(5)

Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid
0,3387730,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4,43.656345,-79.45249,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,,,South,"Automobile, Station Wagon",Turning Left,Failed to Yield Right of Way,Unknown,,,,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,1
1,3387731,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4,43.656345,-79.45249,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,,North,Other,,,,Vehicle turns left while ped crosses with ROW ...,Crossing with right of way,Unknown,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,2
2,3388101,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25,43.801943,-79.199786,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,,East,Motorcycle,Turning Right,Disobeyed Traffic Control,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,3
3,3389067,893184.0,2006,2006/01/01 05:00:00+00,236,WOODBINE AVE,O CONNOR DR,,Major Arterial,Toronto and East York,19,43.699595,-79.318797,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,,,,,,,,,,,,,,,Yes,,,,,Yes,Yes,Yes,,Yes,,60,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,4
4,3388102,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25,43.801943,-79.199786,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,,,South,"Automobile, Station Wagon",Going Ahead,Driving Properly,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,5


In [265]:
# Dropping those null records from column district and loccord where
# acclass is non-fatal, because for model training there are enough values
# for non-fatal injury but not enough for fatal

In [266]:
df['acclass'].value_counts()

acclass
Non-Fatal Injury    12978
Fatal                2022
Name: count, dtype: int64

In [267]:
df[(df['accnum'].isna())&(df['acclass'] == "Non-Fatal Injury")]


Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid
10440,81153095,,2013,2013/12/03 05:00:00+00,700,SCARLETT RD,LAWRENCE AV W,,Major Arterial,Etobicoke York,2,43.698177,-79.523266,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Pedestrian Collisions,Driver,50 to 54,,,North,Pick Up Truck,Going Ahead,Driving Properly,Normal,,,,,,,Yes,,Yes,,,,,,,,,,,8,Humber Heights-Westmount,8,Humber Heights-Westmount (8),D23,10441
10449,81153096,,2013,2013/12/03 05:00:00+00,700,SCARLETT RD,LAWRENCE AV W,,Major Arterial,Etobicoke York,2,43.698177,-79.523266,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Pedestrian Collisions,Pedestrian,10 to 14,Major,,,,,,,,,Inattentive,,,,Yes,,Yes,,,,,,,,,,,8,Humber Heights-Westmount,8,Humber Heights-Westmount (8),D23,10450
11000,80236768,,2015,2015/04/22 04:00:00+00,100,KINGSTON RD,COLUMBINE AVE,9 m West of,Major Arterial,Toronto and East York,19,43.671492,-79.310367,Intersection,Non Intersection,No Control,Rain,Dark,Wet,Non-Fatal Injury,SMV Unattended Vehicle,Driver,45 to 49,Major,,West,"Automobile, Station Wagon",Going Ahead,Lost control,Unknown,,,,,,,,,Yes,,,,,,,Yes,,,,63,The Beaches,63,The Beaches (63),D55,11001
11001,80247514,,2015,2015/08/12 04:00:00+00,1524,WILSON AVE,KELVIN AVE,,Major Arterial,Etobicoke York,7,43.715689,-79.531894,Intersection,At Intersection,Stop Sign,Clear,Daylight,Dry,Non-Fatal Injury,Cyclist Collisions,Driver,50 to 54,,,East,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,,,,,,,,Yes,Yes,,,,,,,,,,,23,Pelmo Park-Humberlea,23,Pelmo Park-Humberlea (23),D31,11002
11002,80565532,,2015,2015/04/22 04:00:00+00,1402,3 RAINIER SQ,,4 m South of,Local,Scarborough,22,43.794242,-79.303012,Intersection,Private Driveway,Stop Sign,Clear,Daylight,Dry,Non-Fatal Injury,SMV Other,Driver,20 to 24,Minimal,,South,"Automobile, Station Wagon",Turning Left,Lost control,Normal,,,,,,,,,Yes,,,,,Yes,Yes,Yes,,,,148,East L'Amoreaux,117,L'Amoreaux (117),D42,11003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,81474608,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,25 to 29,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14996
14996,81474609,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,0 to 4,Minor,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14997
14997,81474610,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,5 to 9,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14998
14998,81474611,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,5 to 9,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14999


### Creating a new accnum column where if Time,Street1,Date,Year is same found same in rows then it wiil be considered as one accident with multiple parties involved, andd if the condition does not matches with the condition it means there is only one person involve in the accident, and those accidents will be given accnum with other method.

In [268]:
one_accident = df[df.duplicated(subset=['year','date','time','street1'], keep=False)]

one_accident['acc_num'] = one_accident.groupby(['year','date','time','street1']).ngroup()

df = pd.merge(df, one_accident[['acc_num']], left_index=True, right_index=True, how='left')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_accident['acc_num'] = one_accident.groupby(['year','date','time','street1']).ngroup()


In [269]:
# There are null values in acc_num where acclass is fatal, because these are the records where condition is not matching
df[(df["acc_num"].isna()) & (df["acclass"] == "Fatal")]

Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid,acc_num
659,3422670,905761.0,2006,2006/05/30 04:00:00+00,2328,ROSEDALE VALLEY RD,BAYVIEW AVE,,Minor Arterial,Toronto and East York,1113,43.671645,-79.36609,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,25 to 29,Fatal,,East,Motorcycle,Going Ahead,Lost control,Normal,,,,,,,,,,Yes,,,,,,Yes,,,,71,Cabbagetown-South St.James Town,71,Cabbagetown-South St.James Town (71),D51,660,
880,3450204,916024.0,2006,2006/07/30 04:00:00+00,2155,F G GARDINER XY Ramp W,GARDINER W S KINGSWAY RAMP,,,Toronto and East York,4,43.634645,-79.47159,Exit Ramp Westbound,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,20 to 24,Fatal,,West,Motorcycle,Going Ahead,Lost control,Unknown,,,,,,,,,,Yes,,,,,,,,,,85,South Parkdale,85,South Parkdale (85),D11,881,
1141,3490523,930890.0,2006,2006/10/18 04:00:00+00,2136,OLD FINCH AVE,REESOR RD,,Collector,Scarborough,25,43.824745,-79.19059,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,20 to 24,Fatal,,West,Motorcycle,Going Ahead,Lost control,Inattentive,,,,,,,,,,Yes,,,,,,,,,,144,Morningside Heights,131,Rouge (131),D42,1142,
2558,4211231,987376.0,2007,2007/08/30 04:00:00+00,2252,F G GARDINER XY W,JAMESON AVE,,,Toronto and East York,4,43.632745,-79.43379,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,25 to 29,Fatal,,West,Motorcycle,Going Ahead,Lost control,Normal,,,,,,,,,,Yes,,,,,,,,,,85,South Parkdale,85,South Parkdale (85),D14,2559,
2916,4273923,987500.0,2007,2007/09/02 04:00:00+00,48,SHEPPARD AVE W,SENTINEL RD,,Major Arterial,North York,6,43.743645,-79.49179,Intersection,,Traffic Signal,Clear,Dark,Dry,Fatal,SMV Other,Driver,40 to 44,Fatal,,East,"Automobile, Station Wagon",Going Ahead,Lost control,Unknown,,,,,,,,,Yes,,,,,,,,,,,155,Downsview,26,Downsview-Roding-CFB (26),D31,2917,
3903,5363175,1039861.0,2008,2008/05/24 04:00:00+00,30,BAYVIEW Aven,BLOOR Ramp,,,Toronto and East York,11,43.677146,-79.367893,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,25 to 29,Fatal,21.0,North,Motorcycle,Going Ahead,Exceeding Speed Limit,Unknown,,,,,,,,,,Yes,,,,,Yes,Yes,,,,98,Rosedale-Moore Park,98,Rosedale-Moore Park (98),D53,3904,
3990,5412194,1061341.0,2008,2008/09/28 04:00:00+00,545,WILSON AVE,HIGHWAY 400 S,,Major Arterial,Etobicoke York,7,43.718145,-79.52109,Intersection,At Intersection,No Control,Clear,Dark,Wet,Fatal,SMV Other,Driver,55 to 59,Fatal,42.0,East,"Automobile, Station Wagon",Going Ahead,Exceeding Speed Limit,Had Been Drinking,,,,,,,,,Yes,,,,,,Yes,Yes,,,,154,Oakdale-Beverley Heights,26,Downsview-Roding-CFB (26),D31,3991,
7524,7378164,1276186.0,2012,2012/01/20 05:00:00+00,2214,SENECA HILL DR,DON MILLS RD,,Collector,North York,17,43.790045,-79.35519,Mid-Block,,No Control,Snow,Dark,Packed Snow,Fatal,SMV Other,Driver,85 to 89,Fatal,4.0,West,"Automobile, Station Wagon",Going Ahead,Lost control,Unknown,,,,,,,,,Yes,,,,,,,,,,,47,Don Valley Village,47,Don Valley Village (47),D33,7525,
8694,7514335,1290105.0,2012,2012/04/20 04:00:00+00,1447,DON VALLEY PARKWAY N,GERRARD ST E,,,Toronto and East York,14,43.665145,-79.35589,Mid-Block,,No Control,Clear,Daylight,Dry,Fatal,SMV Other,Motorcycle Driver,35 to 39,Fatal,11.0,North,Motorcycle,Going Ahead,Exceeding Speed Limit,Unknown,,,,,,,,,,Yes,,,,,Yes,Yes,,,,68,North Riverdale,68,North Riverdale (68),D55,8695,
9470,7829849,1361106.0,2013,2013/06/16 04:00:00+00,616,JOE SHUSTER WAY,KING ST W,,Local,Toronto and East York,10,43.640547,-79.423999,Mid-Block,,No Control,Rain,Daylight,Wet,Fatal,SMV Other,Driver,40 to 44,Fatal,17.0,East,"Automobile, Station Wagon",Going Ahead,Other,Unknown,,,,,,,,,Yes,,,,,,,,,,,85,South Parkdale,85,South Parkdale (85),D14,9471,


In [270]:
# We can drop those records having null values in acc_num where acclass is non-fatal

In [271]:
to_drop2 = df[(df["acc_num"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index

df = df.drop(to_drop2, axis=0)

In [272]:
df['acc_num'].max()

4992.0

In [273]:
df[df['acc_num'].isna()]

Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid,acc_num
659,3422670,905761.0,2006,2006/05/30 04:00:00+00,2328,ROSEDALE VALLEY RD,BAYVIEW AVE,,Minor Arterial,Toronto and East York,1113,43.671645,-79.36609,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,25 to 29,Fatal,,East,Motorcycle,Going Ahead,Lost control,Normal,,,,,,,,,,Yes,,,,,,Yes,,,,71,Cabbagetown-South St.James Town,71,Cabbagetown-South St.James Town (71),D51,660,
880,3450204,916024.0,2006,2006/07/30 04:00:00+00,2155,F G GARDINER XY Ramp W,GARDINER W S KINGSWAY RAMP,,,Toronto and East York,4,43.634645,-79.47159,Exit Ramp Westbound,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,20 to 24,Fatal,,West,Motorcycle,Going Ahead,Lost control,Unknown,,,,,,,,,,Yes,,,,,,,,,,85,South Parkdale,85,South Parkdale (85),D11,881,
1141,3490523,930890.0,2006,2006/10/18 04:00:00+00,2136,OLD FINCH AVE,REESOR RD,,Collector,Scarborough,25,43.824745,-79.19059,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,20 to 24,Fatal,,West,Motorcycle,Going Ahead,Lost control,Inattentive,,,,,,,,,,Yes,,,,,,,,,,144,Morningside Heights,131,Rouge (131),D42,1142,
2558,4211231,987376.0,2007,2007/08/30 04:00:00+00,2252,F G GARDINER XY W,JAMESON AVE,,,Toronto and East York,4,43.632745,-79.43379,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,25 to 29,Fatal,,West,Motorcycle,Going Ahead,Lost control,Normal,,,,,,,,,,Yes,,,,,,,,,,85,South Parkdale,85,South Parkdale (85),D14,2559,
2916,4273923,987500.0,2007,2007/09/02 04:00:00+00,48,SHEPPARD AVE W,SENTINEL RD,,Major Arterial,North York,6,43.743645,-79.49179,Intersection,,Traffic Signal,Clear,Dark,Dry,Fatal,SMV Other,Driver,40 to 44,Fatal,,East,"Automobile, Station Wagon",Going Ahead,Lost control,Unknown,,,,,,,,,Yes,,,,,,,,,,,155,Downsview,26,Downsview-Roding-CFB (26),D31,2917,
3903,5363175,1039861.0,2008,2008/05/24 04:00:00+00,30,BAYVIEW Aven,BLOOR Ramp,,,Toronto and East York,11,43.677146,-79.367893,Mid-Block,,No Control,Clear,Dark,Dry,Fatal,SMV Other,Motorcycle Driver,25 to 29,Fatal,21.0,North,Motorcycle,Going Ahead,Exceeding Speed Limit,Unknown,,,,,,,,,,Yes,,,,,Yes,Yes,,,,98,Rosedale-Moore Park,98,Rosedale-Moore Park (98),D53,3904,
3990,5412194,1061341.0,2008,2008/09/28 04:00:00+00,545,WILSON AVE,HIGHWAY 400 S,,Major Arterial,Etobicoke York,7,43.718145,-79.52109,Intersection,At Intersection,No Control,Clear,Dark,Wet,Fatal,SMV Other,Driver,55 to 59,Fatal,42.0,East,"Automobile, Station Wagon",Going Ahead,Exceeding Speed Limit,Had Been Drinking,,,,,,,,,Yes,,,,,,Yes,Yes,,,,154,Oakdale-Beverley Heights,26,Downsview-Roding-CFB (26),D31,3991,
7524,7378164,1276186.0,2012,2012/01/20 05:00:00+00,2214,SENECA HILL DR,DON MILLS RD,,Collector,North York,17,43.790045,-79.35519,Mid-Block,,No Control,Snow,Dark,Packed Snow,Fatal,SMV Other,Driver,85 to 89,Fatal,4.0,West,"Automobile, Station Wagon",Going Ahead,Lost control,Unknown,,,,,,,,,Yes,,,,,,,,,,,47,Don Valley Village,47,Don Valley Village (47),D33,7525,
8694,7514335,1290105.0,2012,2012/04/20 04:00:00+00,1447,DON VALLEY PARKWAY N,GERRARD ST E,,,Toronto and East York,14,43.665145,-79.35589,Mid-Block,,No Control,Clear,Daylight,Dry,Fatal,SMV Other,Motorcycle Driver,35 to 39,Fatal,11.0,North,Motorcycle,Going Ahead,Exceeding Speed Limit,Unknown,,,,,,,,,,Yes,,,,,Yes,Yes,,,,68,North Riverdale,68,North Riverdale (68),D55,8695,
9470,7829849,1361106.0,2013,2013/06/16 04:00:00+00,616,JOE SHUSTER WAY,KING ST W,,Local,Toronto and East York,10,43.640547,-79.423999,Mid-Block,,No Control,Rain,Daylight,Wet,Fatal,SMV Other,Driver,40 to 44,Fatal,17.0,East,"Automobile, Station Wagon",Going Ahead,Other,Unknown,,,,,,,,,Yes,,,,,,,,,,,85,South Parkdale,85,South Parkdale (85),D14,9471,


In [274]:
# we will fill these null values with a unique number

In [275]:
null_values_indices = df[df['acc_num'].isnull()].index

sequence = range(4993, 4993 + len(null_values_indices))

df.loc[null_values_indices, 'acc_num'] = sequence

df["acc_num"].isna().sum()

0

In [276]:
df.acc_num.max()

5031.0

### Now dropping those records where acclass is non-fatal ans in other columns the value is null

In [277]:

drop_district = df[(df["district"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index
df = df.drop(drop_district, axis=0)

drop_loccoord = df[(df["loccoord"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index
df = df.drop(drop_loccoord, axis=0)

drop_injury = df[(df["injury"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index
df = df.drop(drop_injury, axis=0)

In [278]:

# Label encoding those columns which have large number of null values but
# it is also important for model training which can not be dropped.

In [279]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

              Feature  Unique Values  Missing Values
0              index_           8520               0
1              accnum           3669            2250
2                year             13               0
3                date           3001               0
4                time           1261               0
5             street1           1460               0
6             street2           2271             723
7              offset            317            7417
8          road_class              9             227
9            district              4               0
10            wardnum             70               0
11           latitude           3332               0
12          longitude           3727               0
13           loccoord              7               0
14             accloc              8            3005
15           traffctl             10               5
16         visibility              8              14
17              light              9          

In [280]:
df.shape

(8520, 56)

In [281]:
df.acclass.value_counts()

acclass
Non-Fatal Injury    6498
Fatal               2022
Name: count, dtype: int64

### Filling vlues with name 'NA'(not applicable) in those columns which may affect in deciding whether the accident wiil be fatal or not, those columns are going to get fille by NA because we can not impute it by most occured values or any other imputation method because this way model will produce false results for test and it will reduce the overall perfomance of the model

### Columns to impute null by NA
* street2
* Injury

In [282]:
df['street2'] = df['street2'].fillna('not applicable')
df['accloc'] = df['accloc'].fillna('not applicable')
df['injury'] = df['injury'].fillna('not applicable')
df['pedact'] = df['pedact'].fillna('not applicable')
df['initdir'] = df['initdir'].fillna('not applicable')
df['vehtype'] = df['vehtype'].fillna('not applicable')
df['manoeuver'] = df['manoeuver'].fillna('not applicable')
df['drivcond'] = df['drivcond'].fillna('not applicable')
df['drivact'] = df['drivact'].fillna('not applicable')
df['pedtype'] = df['pedtype'].fillna('not applicable')
df['pedcond'] = df['pedcond'].fillna('not applicable')
df['hood_158'] = df['hood_158'].fillna('not applicable')
df['hood_140'] = df['hood_140'].fillna('not applicable')

df['pedestrian'] = df['pedestrian'].fillna('NO')
df['automobile'] = df['automobile'].fillna('NO')
df['motorcycle'] = df['motorcycle'].fillna('NO')
df['truck'] = df['truck'].fillna('NO')
df['trsn_city_veh'] = df['trsn_city_veh'].fillna('NO')
df['emerg_veh'] = df['emerg_veh'].fillna('NO')
df['passenger'] = df['passenger'].fillna('NO')
df['speeding'] = df['speeding'].fillna('NO')
df['ag_driv'] = df['motorcycle'].fillna('NO')
df['redlight'] = df['redlight'].fillna('NO')
df['redlight'] = df['motorcycle'].fillna('NO')
df['alcohol'] = df['alcohol'].fillna('NO')




### Then filling most occured values inplace of missing values of those columns where if imputer with most occured it may not make big different and the numbers of missing values is less than 100

### Columns to impute with most occured values from the column
* road_class
* traffctl
* visibility
* rdsfcond
* invtype
* injury

In [283]:
df['road_class'] = df['road_class'].fillna('Major Arterial')
df['traffctl'] = df['traffctl'].fillna('No Control')
df['visibility'] = df['visibility'].fillna('Clear')
df['rdsfcond'] = df['rdsfcond'].fillna('Dry')
df['invtype'] = df['invtype'].fillna('Driver')
df['injury'] = df['injury'].fillna('None')

In [284]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

              Feature  Unique Values  Missing Values
0              index_           8520               0
1              accnum           3669            2250
2                year             13               0
3                date           3001               0
4                time           1261               0
5             street1           1460               0
6             street2           2272               0
7              offset            317            7417
8          road_class              9               0
9            district              4               0
10            wardnum             70               0
11           latitude           3332               0
12          longitude           3727               0
13           loccoord              7               0
14             accloc              9               0
15           traffctl             10               0
16         visibility              8               0
17              light              9          

In [285]:
df.shape

(8520, 56)

### lable encoding the rest of the columns, as it is important to keep those features though they are containing null values and can not be filled with random imputation.

In [286]:
columns_to_encode = ['vehtype', 'manoeuver', 'drivact', 'drivcond', 'pedtype', 'pedcond',
                     'disability', 'alcohol', 'redlight', 'ag_driv', 'speeding', 'passenger',
                     'emerg_veh', 'trsn_city_veh', 'truck', 'motorcycle', 'automobile',
                     'cyclist', 'pedestrian']



label_encoder = LabelEncoder()

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

# Dropping columns
* index_
* year
* date
* time
* offset
* fatal_no
* ccyclisttype
* cycond
* disability
* neighbout_140
* neighbour_158
* Fatal_no

In [287]:
df = df.drop(columns=['index_','latitude','longitude','accnum','year', 'date', 'offset','fatal_no','cyclistype','cyclist','cycact','cyccond','disability','neighbourhood_158','neighbourhood_140'], axis=1)

In [288]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

          Feature  Unique Values  Missing Values
0            time           1261               0
1         street1           1460               0
2         street2           2272               0
3      road_class              9               0
4        district              4               0
5         wardnum             70               0
6        loccoord              7               0
7          accloc              9               0
8        traffctl             10               0
9      visibility              8               0
10          light              9               0
11       rdsfcond              8               0
12        acclass              2               0
13      impactype             10               0
14        invtype             16               0
15         invage             21               0
16         injury              5               0
17        initdir              6               0
18        vehtype             25               0
19      manoeuver   

In [289]:
df

Unnamed: 0,time,street1,street2,road_class,district,wardnum,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,pedestrian,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,hood_158,hood_140,division,objectid,acc_num
0,852,BLOOR ST W,DUNDAS ST W,Major Arterial,Toronto and East York,4,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,not applicable,South,0,13,3,9,16,not applicable,10,1,1,0,0,0,0,0,0,0,0,0,88,88,D11,1,60.0
1,852,BLOOR ST W,DUNDAS ST W,Major Arterial,Toronto and East York,4,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,North,9,16,13,10,12,Crossing with right of way,9,1,1,0,0,0,0,0,0,0,0,0,88,88,D11,2,60.0
2,915,MORNINGSIDE AVE,SHEPPARD AVE E,Major Arterial,Scarborough,25,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,East,7,14,0,9,16,not applicable,10,0,1,1,0,0,0,0,0,1,1,0,146,132,D42,3,61.0
3,236,WOODBINE AVE,O CONNOR DR,Major Arterial,Toronto and East York,19,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,not applicable,24,16,13,10,16,not applicable,10,0,1,0,0,0,0,1,1,0,0,1,60,60,D55,4,0.0
4,915,MORNINGSIDE AVE,SHEPPARD AVE E,Major Arterial,Scarborough,25,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,not applicable,South,0,2,1,9,16,not applicable,10,0,1,1,0,0,0,0,0,1,1,0,146,132,D42,5,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14989,1942,ISLINGTON AVE,ALBION RD,Major Arterial,Etobicoke York,1,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Driver,20 to 24,Major,North,0,2,1,7,16,not applicable,10,0,1,0,0,0,0,1,0,0,0,0,3,3,D23,14990,4809.0
14990,1755,BRIMLEY RD,BRIMWOOD BLVD,Major Arterial,Scarborough,23,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Cyclist Collisions,Cyclist,30 to 34,Major,South,1,13,13,10,16,not applicable,10,0,1,0,0,0,0,0,0,0,0,0,129,129,D42,14991,4992.0
14991,1525,EGLINTON AVE W,KIPLING AVE,Major Arterial,Etobicoke York,2,Mid-Block,Non Intersection,No Control,Clear,Daylight,Dry,Non-Fatal Injury,Approaching,Driver,55 to 59,Major,East,11,2,1,7,16,not applicable,10,0,1,0,0,0,0,0,1,0,0,0,10,7,D22,14992,4772.0
14992,1525,EGLINTON AVE W,KIPLING AVE,Major Arterial,Etobicoke York,2,Mid-Block,Non Intersection,No Control,Clear,Daylight,Dry,Non-Fatal Injury,Approaching,Driver,60 to 64,Minor,East,0,2,1,7,16,not applicable,10,0,1,0,0,0,0,0,1,0,0,0,10,7,D22,14993,4772.0


### From the INJURY column there are labels with fatal and non-fatal injury, so we should try trainig our model with this feature and without this feature. Becuase the label fatal is directly impacting the result of acclass as fatal

### ONE HOT ENCODING WITH INJURY

In [290]:
one_hot_encoded_data = pd.get_dummies(df['injury'], prefix='injury')

df = pd.concat([df, one_hot_encoded_data], axis=1)

In [291]:
df.columns

Index(['time', 'street1', 'street2', 'road_class', 'district', 'wardnum',
       'loccoord', 'accloc', 'traffctl', 'visibility', 'light', 'rdsfcond',
       'acclass', 'impactype', 'invtype', 'invage', 'injury', 'initdir',
       'vehtype', 'manoeuver', 'drivact', 'drivcond', 'pedtype', 'pedact',
       'pedcond', 'pedestrian', 'automobile', 'motorcycle', 'truck',
       'trsn_city_veh', 'emerg_veh', 'passenger', 'speeding', 'ag_driv',
       'redlight', 'alcohol', 'hood_158', 'hood_140', 'division', 'objectid',
       'acc_num', 'injury_Fatal', 'injury_Major', 'injury_Minimal',
       'injury_Minor', 'injury_not applicable'],
      dtype='object')

In [292]:
# dropping the fatal-class of new columns and injury column
df = df.drop(columns=['injury_Fatal','injury'], axis=1)

# Try-2 Time sagment

### For time segment we will try to divide the time into time segments such as Night, Morining, Afternoon, evening, Mid-night

In [293]:
def map_time_to_segment(time):
    if time < 600:
        return 'Midnight'
    elif time < 1200:
        return 'Morning'
    elif time < 1700:
        return 'Afternoon'
    elif time < 2100:
        return 'Evening'
    else:
        return 'Night'

df['time_segment'] = df['time'].apply(map_time_to_segment)

In [294]:
# Now dropping the time column
df = df.drop('time', axis=1)

In [295]:
columns_to_encode = ['street1','traffctl','division','injury_Minor','street2','road_class','district','acclass','loccoord','accloc','accloc','visibility','light','rdsfcond','impactype','invtype','invage','initdir','pedact',"injury_Major",'injury_Minimal','injury_not applicable','hood_158','hood_140','time_segment']

label_encoder = LabelEncoder()

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

In [296]:
# converting float values into int

In [297]:
df['acc_num'] = df['acc_num'].astype(int)
df['division'] = df['division'].astype(int)


# Fitting data into models
* Spliting the data into train and test
* Apply data on model with cross validation
* Check the accuracy and classification report
* Check the confusion matrix


In [298]:
# splitting data into train and test split

In [299]:
x = df.drop('acclass',axis=1)
y = df['acclass']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Decision tree

In [300]:
model_tree = DecisionTreeClassifier()

model_tree.fit(x_train,y_train)

pred_tree = model_tree.predict(x_test)

accuracy_tree = accuracy_score(y_test, pred_tree)
print('Accuracy before CV score: ',accuracy_tree)
print('Classification report before cross-validation')
print(classification_report(y_test, pred_tree))

Accuracy before CV score:  0.952112676056338
Classification report before cross-validation
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       536
           1       0.97      0.96      0.97      1594

    accuracy                           0.95      2130
   macro avg       0.93      0.94      0.94      2130
weighted avg       0.95      0.95      0.95      2130



In [313]:
cv_tree_scores = cross_val_score(model_tree, x, y, cv=5, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_tree_scores)
print("Mean CV Accuracy:", np.mean(cv_tree_scores))

Cross-validation scores: [0.91197183 0.94835681 0.93955399 0.91255869 0.41138498]
Mean CV Accuracy: 0.8247652582159624
