In [220]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier


In [221]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [222]:
# df = pd.read_csv('/kaggle/input/is-the-traffic-collision-fatal/Train.csv')
df = pd.read_csv('drive/MyDrive/Datasets/Train.csv')

In [223]:
# displaying maximum columns
pd.set_option('display.max_columns', 200)

# Cleaning for model building

### Converting column names to lower case

In [224]:
df.columns = map(str.lower, df.columns)

In [225]:
df.sample()

Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid
12976,80542714,,2016,2016/05/31 04:00:00+00,1756,LAWRENCE AVE W,LITTLE AVE,5 m West of,Major Arterial,Etobicoke York,5,43.699214,-79.51984,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Pedestrian Collisions,Pedestrian,55 to 59,Major,,North,,,,,Vehicle turns right while ped crosses with ROW...,Crossing with right of way,Normal,,,,Yes,,Yes,,,,,Yes,,Yes,,,,113,Weston,113,Weston (113),D12,12977


### Displaying number of unique and missing values of all columns

In [226]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

              Feature  Unique Values  Missing Values
0              index_          15000               0
1              accnum           3822            3698
2                year             13               0
3                date           3082               0
4                time           1276               0
5             street1           1547               0
6             street2           2344            1343
7              offset            339           13072
8          road_class              9             357
9            district              4              14
10            wardnum             71               0
11           latitude           3475               0
12          longitude           3901               0
13           loccoord              7              90
14             accloc              9            5450
15           traffctl             10              29
16         visibility              8              14
17              light              9          

In [227]:
df.head(5)

Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid
0,3387730,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4,43.656345,-79.45249,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,,,South,"Automobile, Station Wagon",Turning Left,Failed to Yield Right of Way,Unknown,,,,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,1
1,3387731,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4,43.656345,-79.45249,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,,North,Other,,,,Vehicle turns left while ped crosses with ROW ...,Crossing with right of way,Unknown,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,2
2,3388101,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25,43.801943,-79.199786,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,,East,Motorcycle,Turning Right,Disobeyed Traffic Control,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,3
3,3389067,893184.0,2006,2006/01/01 05:00:00+00,236,WOODBINE AVE,O CONNOR DR,,Major Arterial,Toronto and East York,19,43.699595,-79.318797,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,,,,,,,,,,,,,,,Yes,,,,,Yes,Yes,Yes,,Yes,,60,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,4
4,3388102,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25,43.801943,-79.199786,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,,,South,"Automobile, Station Wagon",Going Ahead,Driving Properly,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,5


In [228]:
# Dropping those null records from column district and loccord where
# acclass is non-fatal, because for model training there are enough values
# for non-fatal injury but not enough for fatal

In [229]:
df['acclass'].value_counts()

acclass
Non-Fatal Injury    12978
Fatal                2022
Name: count, dtype: int64

In [230]:
df[(df['accnum'].isna())&(df['acclass'] == "Non-Fatal Injury")]


Unnamed: 0,index_,accnum,year,date,time,street1,street2,offset,road_class,district,wardnum,latitude,longitude,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,fatal_no,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,cyclistype,cycact,cyccond,pedestrian,cyclist,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,hood_158,neighbourhood_158,hood_140,neighbourhood_140,division,objectid
10440,81153095,,2013,2013/12/03 05:00:00+00,700,SCARLETT RD,LAWRENCE AV W,,Major Arterial,Etobicoke York,2,43.698177,-79.523266,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Pedestrian Collisions,Driver,50 to 54,,,North,Pick Up Truck,Going Ahead,Driving Properly,Normal,,,,,,,Yes,,Yes,,,,,,,,,,,8,Humber Heights-Westmount,8,Humber Heights-Westmount (8),D23,10441
10449,81153096,,2013,2013/12/03 05:00:00+00,700,SCARLETT RD,LAWRENCE AV W,,Major Arterial,Etobicoke York,2,43.698177,-79.523266,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Pedestrian Collisions,Pedestrian,10 to 14,Major,,,,,,,,,Inattentive,,,,Yes,,Yes,,,,,,,,,,,8,Humber Heights-Westmount,8,Humber Heights-Westmount (8),D23,10450
11000,80236768,,2015,2015/04/22 04:00:00+00,100,KINGSTON RD,COLUMBINE AVE,9 m West of,Major Arterial,Toronto and East York,19,43.671492,-79.310367,Intersection,Non Intersection,No Control,Rain,Dark,Wet,Non-Fatal Injury,SMV Unattended Vehicle,Driver,45 to 49,Major,,West,"Automobile, Station Wagon",Going Ahead,Lost control,Unknown,,,,,,,,,Yes,,,,,,,Yes,,,,63,The Beaches,63,The Beaches (63),D55,11001
11001,80247514,,2015,2015/08/12 04:00:00+00,1524,WILSON AVE,KELVIN AVE,,Major Arterial,Etobicoke York,7,43.715689,-79.531894,Intersection,At Intersection,Stop Sign,Clear,Daylight,Dry,Non-Fatal Injury,Cyclist Collisions,Driver,50 to 54,,,East,"Automobile, Station Wagon",Going Ahead,Driving Properly,Normal,,,,,,,,Yes,Yes,,,,,,,,,,,23,Pelmo Park-Humberlea,23,Pelmo Park-Humberlea (23),D31,11002
11002,80565532,,2015,2015/04/22 04:00:00+00,1402,3 RAINIER SQ,,4 m South of,Local,Scarborough,22,43.794242,-79.303012,Intersection,Private Driveway,Stop Sign,Clear,Daylight,Dry,Non-Fatal Injury,SMV Other,Driver,20 to 24,Minimal,,South,"Automobile, Station Wagon",Turning Left,Lost control,Normal,,,,,,,,,Yes,,,,,Yes,Yes,Yes,,,,148,East L'Amoreaux,117,L'Amoreaux (117),D42,11003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,81474608,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,25 to 29,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14996
14996,81474609,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,0 to 4,Minor,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14997
14997,81474610,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,5 to 9,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14998
14998,81474611,,2018,2018/04/26 04:00:00+00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,1,43.737166,-79.565257,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Passenger,5 to 9,,,,,,,,,,,,,,,,Yes,,,,,Yes,,Yes,,,,3,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14999


### Creating a new accnum column where if Time,Street1,Date,Year is same found same in rows then it wiil be considered as one accident with multiple parties involved, andd if the condition does not matches with the condition it means there is only one person involve in the accident, and those accidents will be given accnum with other method.

In [231]:
one_accident = df[df.duplicated(subset=['year','date','time','street1'], keep=False)]

one_accident['acc_num'] = one_accident.groupby(['year','date','time','street1']).ngroup()

df = pd.merge(df, one_accident[['acc_num']], left_index=True, right_index=True, how='left')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_accident['acc_num'] = one_accident.groupby(['year','date','time','street1']).ngroup()


In [232]:
# There are null values in acc_num where acclass is fatal, because these are the records where condition is not matching
df[(df["acc_num"].isna()) & (df["acclass"] == "Fatal")].shape

(39, 56)

In [233]:
# We can drop those records having null values in acc_num where acclass is non-fatal

In [234]:
to_drop2 = df[(df["acc_num"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index

df = df.drop(to_drop2, axis=0)

In [235]:
df['acc_num'].max()

4992.0

In [236]:
df[df['acc_num'].isna()].shape

(39, 56)

In [237]:
# we will fill these null values with a unique number

In [238]:
null_values_indices = df[df['acc_num'].isnull()].index

sequence = range(4993, 4993 + len(null_values_indices))

df.loc[null_values_indices, 'acc_num'] = sequence

df["acc_num"].isna().sum()

0

In [239]:
df.acc_num.max()

5031.0

### Now dropping those records where acclass is non-fatal ans in other columns the value is null

In [240]:

drop_district = df[(df["district"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index
df = df.drop(drop_district, axis=0)

drop_loccoord = df[(df["loccoord"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index
df = df.drop(drop_loccoord, axis=0)

drop_injury = df[(df["injury"].isna()) & (df["acclass"] == "Non-Fatal Injury")].index
df = df.drop(drop_injury, axis=0)

In [241]:

# Label encoding those columns which have large number of null values but
# it is also important for model training which can not be dropped.

In [242]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

              Feature  Unique Values  Missing Values
0              index_           8520               0
1              accnum           3669            2250
2                year             13               0
3                date           3001               0
4                time           1261               0
5             street1           1460               0
6             street2           2271             723
7              offset            317            7417
8          road_class              9             227
9            district              4               0
10            wardnum             70               0
11           latitude           3332               0
12          longitude           3727               0
13           loccoord              7               0
14             accloc              8            3005
15           traffctl             10               5
16         visibility              8              14
17              light              9          

In [243]:
df.shape

(8520, 56)

In [244]:
df.acclass.value_counts()

acclass
Non-Fatal Injury    6498
Fatal               2022
Name: count, dtype: int64

### Filling vlues with name 'NA'(not applicable) in those columns which may affect in deciding whether the accident wiil be fatal or not, those columns are going to get fille by NA because we can not impute it by most occured values or any other imputation method because this way model will produce false results for test and it will reduce the overall perfomance of the model

### Columns to impute null by NA
* street2
* Injury

In [245]:
df['street2'] = df['street2'].fillna('not applicable')
df['accloc'] = df['accloc'].fillna('not applicable')
df['injury'] = df['injury'].fillna('not applicable')
df['pedact'] = df['pedact'].fillna('not applicable')
df['initdir'] = df['initdir'].fillna('not applicable')
df['vehtype'] = df['vehtype'].fillna('not applicable')
df['manoeuver'] = df['manoeuver'].fillna('not applicable')
df['drivcond'] = df['drivcond'].fillna('not applicable')
df['drivact'] = df['drivact'].fillna('not applicable')
df['pedtype'] = df['pedtype'].fillna('not applicable')
df['pedcond'] = df['pedcond'].fillna('not applicable')
df['hood_158'] = df['hood_158'].fillna('not applicable')
df['hood_140'] = df['hood_140'].fillna('not applicable')

df['pedestrian'] = df['pedestrian'].fillna('NO')
df['automobile'] = df['automobile'].fillna('NO')
df['motorcycle'] = df['motorcycle'].fillna('NO')
df['truck'] = df['truck'].fillna('NO')
df['trsn_city_veh'] = df['trsn_city_veh'].fillna('NO')
df['emerg_veh'] = df['emerg_veh'].fillna('NO')
df['passenger'] = df['passenger'].fillna('NO')
df['speeding'] = df['speeding'].fillna('NO')
df['ag_driv'] = df['motorcycle'].fillna('NO')
df['redlight'] = df['redlight'].fillna('NO')
df['redlight'] = df['motorcycle'].fillna('NO')
df['alcohol'] = df['alcohol'].fillna('NO')




### Then filling most occured values inplace of missing values of those columns where if imputer with most occured it may not make big different and the numbers of missing values is less than 100

### Columns to impute with most occured values from the column
* road_class
* traffctl
* visibility
* rdsfcond
* invtype
* injury

In [246]:
df['road_class'] = df['road_class'].fillna('Major Arterial')
df['traffctl'] = df['traffctl'].fillna('No Control')
df['visibility'] = df['visibility'].fillna('Clear')
df['rdsfcond'] = df['rdsfcond'].fillna('Dry')
df['invtype'] = df['invtype'].fillna('Driver')
df['injury'] = df['injury'].fillna('None')

In [247]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

              Feature  Unique Values  Missing Values
0              index_           8520               0
1              accnum           3669            2250
2                year             13               0
3                date           3001               0
4                time           1261               0
5             street1           1460               0
6             street2           2272               0
7              offset            317            7417
8          road_class              9               0
9            district              4               0
10            wardnum             70               0
11           latitude           3332               0
12          longitude           3727               0
13           loccoord              7               0
14             accloc              9               0
15           traffctl             10               0
16         visibility              8               0
17              light              9          

In [248]:
df.shape

(8520, 56)

### lable encoding the rest of the columns, as it is important to keep those features though they are containing null values and can not be filled with random imputation.

In [249]:
columns_to_encode = ['vehtype',
                     'manoeuver',
                     'drivact',
                     'drivcond',
                     'pedtype',
                     'pedcond',
                     'disability',
                     'alcohol',
                     'redlight',
                     'ag_driv',
                     'speeding',
                     'passenger',
                     'emerg_veh',
                     'trsn_city_veh',
                     'truck',
                     'motorcycle',
                     'automobile',
                     'cyclist',
                     'pedestrian']



label_encoder = LabelEncoder()

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

# Dropping columns
* index_
* year
* date
* time
* offset
* fatal_no
* ccyclisttype
* cycond
* disability
* neighbout_140
* neighbour_158
* Fatal_no

In [250]:
df = df.drop(columns=['index_',
                      'latitude',
                      'longitude',
                      'accnum',
                      'year',
                      'date',
                      'offset',
                      'fatal_no',
                      'cyclistype',
                      'cyclist',
                      'cycact',
                      'cyccond',
                      'disability',
                      'neighbourhood_158',
                      'neighbourhood_140'
                      ], axis=1)

In [251]:
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

print(combined_df)

          Feature  Unique Values  Missing Values
0            time           1261               0
1         street1           1460               0
2         street2           2272               0
3      road_class              9               0
4        district              4               0
5         wardnum             70               0
6        loccoord              7               0
7          accloc              9               0
8        traffctl             10               0
9      visibility              8               0
10          light              9               0
11       rdsfcond              8               0
12        acclass              2               0
13      impactype             10               0
14        invtype             16               0
15         invage             21               0
16         injury              5               0
17        initdir              6               0
18        vehtype             25               0
19      manoeuver   

In [252]:
df

Unnamed: 0,time,street1,street2,road_class,district,wardnum,loccoord,accloc,traffctl,visibility,light,rdsfcond,acclass,impactype,invtype,invage,injury,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,pedestrian,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,hood_158,hood_140,division,objectid,acc_num
0,852,BLOOR ST W,DUNDAS ST W,Major Arterial,Toronto and East York,4,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,not applicable,South,0,13,3,9,16,not applicable,10,1,1,0,0,0,0,0,0,0,0,0,88,88,D11,1,60.0
1,852,BLOOR ST W,DUNDAS ST W,Major Arterial,Toronto and East York,4,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,North,9,16,13,10,12,Crossing with right of way,9,1,1,0,0,0,0,0,0,0,0,0,88,88,D11,2,60.0
2,915,MORNINGSIDE AVE,SHEPPARD AVE E,Major Arterial,Scarborough,25,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,East,7,14,0,9,16,not applicable,10,0,1,1,0,0,0,0,0,1,1,0,146,132,D42,3,61.0
3,236,WOODBINE AVE,O CONNOR DR,Major Arterial,Toronto and East York,19,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,not applicable,24,16,13,10,16,not applicable,10,0,1,0,0,0,0,1,1,0,0,1,60,60,D55,4,0.0
4,915,MORNINGSIDE AVE,SHEPPARD AVE E,Major Arterial,Scarborough,25,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,not applicable,South,0,2,1,9,16,not applicable,10,0,1,1,0,0,0,0,0,1,1,0,146,132,D42,5,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14989,1942,ISLINGTON AVE,ALBION RD,Major Arterial,Etobicoke York,1,Intersection,At Intersection,Traffic Signal,Clear,"Dusk, artificial",Dry,Non-Fatal Injury,Turning Movement,Driver,20 to 24,Major,North,0,2,1,7,16,not applicable,10,0,1,0,0,0,0,1,0,0,0,0,3,3,D23,14990,4809.0
14990,1755,BRIMLEY RD,BRIMWOOD BLVD,Major Arterial,Scarborough,23,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Non-Fatal Injury,Cyclist Collisions,Cyclist,30 to 34,Major,South,1,13,13,10,16,not applicable,10,0,1,0,0,0,0,0,0,0,0,0,129,129,D42,14991,4992.0
14991,1525,EGLINTON AVE W,KIPLING AVE,Major Arterial,Etobicoke York,2,Mid-Block,Non Intersection,No Control,Clear,Daylight,Dry,Non-Fatal Injury,Approaching,Driver,55 to 59,Major,East,11,2,1,7,16,not applicable,10,0,1,0,0,0,0,0,1,0,0,0,10,7,D22,14992,4772.0
14992,1525,EGLINTON AVE W,KIPLING AVE,Major Arterial,Etobicoke York,2,Mid-Block,Non Intersection,No Control,Clear,Daylight,Dry,Non-Fatal Injury,Approaching,Driver,60 to 64,Minor,East,0,2,1,7,16,not applicable,10,0,1,0,0,0,0,0,1,0,0,0,10,7,D22,14993,4772.0


### From the INJURY column there are labels with fatal and non-fatal injury, so we should try trainig our model with this feature and without this feature. Becuase the label fatal is directly impacting the result of acclass as fatal

### ONE HOT ENCODING WITH INJURY

In [253]:
# one_hot_encoded_data = pd.get_dummies(df['injury'], prefix='injury')

# df = pd.concat([df, one_hot_encoded_data], axis=1)

In [254]:
df.columns

Index(['time', 'street1', 'street2', 'road_class', 'district', 'wardnum',
       'loccoord', 'accloc', 'traffctl', 'visibility', 'light', 'rdsfcond',
       'acclass', 'impactype', 'invtype', 'invage', 'injury', 'initdir',
       'vehtype', 'manoeuver', 'drivact', 'drivcond', 'pedtype', 'pedact',
       'pedcond', 'pedestrian', 'automobile', 'motorcycle', 'truck',
       'trsn_city_veh', 'emerg_veh', 'passenger', 'speeding', 'ag_driv',
       'redlight', 'alcohol', 'hood_158', 'hood_140', 'division', 'objectid',
       'acc_num'],
      dtype='object')

In [255]:
# dropping the fatal-class of new columns and injury column
# df = df.drop(columns=[
#                       # 'injury_Fatal',
#                       'injury'
#                       ], axis=1)

# ------------------------ Try-2 Time sagment ------------------------

### For time segment we will try to divide the time into time segments such as Night, Morining, Afternoon, evening, Mid-night

In [256]:
def map_time_to_segment(time):
    if time < 600:
        return 'Midnight'
    elif time < 1200:
        return 'Morning'
    elif time < 1700:
        return 'Afternoon'
    elif time < 2100:
        return 'Evening'
    else:
        return 'Night'

df['time_segment'] = df['time'].apply(map_time_to_segment)

In [257]:
# Now dropping the time column
df = df.drop('time', axis=1)

In [258]:
columns_to_encode = ['street1',
                     'traffctl',
                     'division',
                     'street2',
                     'road_class',
                     'district',
                     'acclass',
                     'loccoord',
                     'accloc',
                     'accloc',
                     'injury',
                     'visibility',
                     'light',
                     'rdsfcond',
                     'impactype',
                     'invtype',
                     'invage',
                     'initdir',
                     'pedact',
                    #  'injury_Minor',
                    #  "injury_Major",
                    #  'injury_Minimal',
                    #  'injury_Fatal',
                    #  'injury_not applicable',
                     'hood_158',
                     'hood_140',
                     'time_segment'
                     ]

label_encoder = LabelEncoder()

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

In [259]:
# converting float values into int

In [260]:
df['acc_num'] = df['acc_num'].astype(int)
df['division'] = df['division'].astype(int)


# --------------------------Cleaning Test Data accordingly  --------------------------

In [261]:
dft = pd.read_csv('drive/MyDrive/Datasets/Test.csv')


In [262]:
dft.columns = map(str.lower, dft.columns)


In [263]:
null_counts_dft = df.isnull().sum()
null_dft = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values_dft = dft.nunique().reset_index()
unique_values_dft.columns = ["Feature", "Unique Values"]

combined_dft = pd.merge(unique_values_dft, null_dft, left_on="Feature", right_index=True)

print(combined_dft)

          Feature  Unique Values  Missing Values
4            time            742               0
5         street1            530               0
6         street2            742               0
8      road_class              8               0
9        district              5               0
10        wardnum             54               0
13       loccoord              4               0
14         accloc              6               0
15       traffctl              6               0
16     visibility              8               0
17          light              9               0
18       rdsfcond              8               0
19      impactype             10               0
20        invtype             14               0
21         invage             21               0
22         injury              4               0
24        initdir              5               0
25        vehtype             29               0
26      manoeuver             15               0
27        drivact   

In [264]:
one_accident_dft = dft[dft.duplicated(subset=['year','date','time','street1'], keep=False)]

one_accident_dft['acc_num'] = one_accident_dft.groupby(['year','date','time','street1']).ngroup()

dft = pd.merge(dft, one_accident_dft[['acc_num']], left_index=True, right_index=True, how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_accident_dft['acc_num'] = one_accident_dft.groupby(['year','date','time','street1']).ngroup()


In [265]:
dft['acc_num'].isna().sum()


101

In [266]:
dft.acc_num.max()


1189.0

In [267]:
null_values_indices = dft[dft['acc_num'].isnull()].index

sequence = range(1190, 1190 + len(null_values_indices))

dft.loc[null_values_indices, 'acc_num'] = sequence

dft["acc_num"].isna().sum()

0

In [268]:
dft.acc_num.max()


1290.0

In [269]:
dft['street2'] = dft['street2'].fillna('NA')
dft['accloc'] = dft['accloc'].fillna('NA')
dft['injury'] = dft['injury'].fillna('NA')
dft['pedact'] = dft['pedact'].fillna('NA')
dft['initdir'] = dft['initdir'].fillna('NA')
dft['vehtype'] = dft['vehtype'].fillna('NA')
dft['manoeuver'] = dft['manoeuver'].fillna('NA')
dft['drivcond'] = dft['drivcond'].fillna('NA')
dft['drivact'] = dft['drivact'].fillna('NA')
dft['pedtype'] = dft['pedtype'].fillna('NA')
dft['pedcond'] = dft['pedcond'].fillna('NA')
dft['hood_158'] = dft['hood_158'].fillna('NA')
dft['hood_140'] = dft['hood_140'].fillna('NA')

dft['pedestrian'] = dft['pedestrian'].fillna('NO')
dft['automobile'] = dft['automobile'].fillna('NO')
dft['motorcycle'] = dft['motorcycle'].fillna('NO')
dft['truck'] = dft['truck'].fillna('NO')
dft['trsn_city_veh'] = dft['trsn_city_veh'].fillna('NO')
dft['emerg_veh'] = dft['emerg_veh'].fillna('NO')
dft['passenger'] = dft['passenger'].fillna('NO')
dft['speeding'] = dft['speeding'].fillna('NO')
dft['ag_driv'] = dft['motorcycle'].fillna('NO')
dft['redlight'] = dft['redlight'].fillna('NO')
dft['redlight'] = dft['motorcycle'].fillna('NO')
dft['alcohol'] = dft['alcohol'].fillna('NO')

In [270]:
null_counts_dft = dft.isnull().sum()
null_dft = pd.DataFrame(null_counts, columns=["Missing Values"])

unique_values_dft = dft.nunique().reset_index()
unique_values_dft.columns = ["Feature", "Unique Values"]

combined_dft = pd.merge(unique_values_dft, null_dft, left_on="Feature", right_index=True)

print(combined_dft)

          Feature  Unique Values  Missing Values
4            time            742               0
5         street1            530               0
6         street2            743               0
8      road_class              8               0
9        district              5               0
10        wardnum             54               0
13       loccoord              4               0
14         accloc              6               0
15       traffctl              6               0
16     visibility              8               0
17          light              9               0
18       rdsfcond              8               0
19      impactype             10               0
20        invtype             14               0
21         invage             21               0
22         injury              5               0
24        initdir              6               0
25        vehtype             30               0
26      manoeuver             16               0
27        drivact   

In [271]:
columns_to_encode = ['vehtype',
                     'manoeuver',
                     'drivact',
                     'drivcond',
                     'pedtype',
                     'pedcond',
                     'disability',
                     'alcohol',
                     'injury',
                     'redlight',
                     'ag_driv',
                     'speeding',
                     'passenger',
                     'emerg_veh',
                     'trsn_city_veh',
                     'truck',
                     'motorcycle',
                     'automobile',
                     'cyclist',
                     'pedestrian']



label_encoder = LabelEncoder()

for column in columns_to_encode:
    dft[column] = label_encoder.fit_transform(dft[column])

In [272]:
dft = dft.drop(columns=['index_',
                        'latitude',
                        'longitude',
                        'accnum',
                        'year',
                        'date',
                        'offset',
                        'fatal_no',
                        'cyclistype',
                        'cyclist',
                        'cycact',
                        'cyccond',
                        'disability',
                        'neighbourhood_158',
                        'neighbourhood_140'
                        ], axis=1)


In [273]:
dft

Unnamed: 0,time,street1,street2,road_class,district,wardnum,loccoord,accloc,traffctl,visibility,light,rdsfcond,impactype,invtype,invage,injury,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,pedestrian,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,hood_158,hood_140,division,objectid,acc_num
0,1903,KING ST W,PORTLAND ST,Major Arterial,Toronto and East York,10.0,Intersection,At Intersection,Traffic Signal,Clear,Dark,Dry,Cyclist Collisions,Cyclist,45 to 49,1,West,2,1,12,10,15,not applicable,10,0,1,0,0,0,0,0,0,0,0,0,164,77,D14,15001,333.0
1,1903,KING ST W,PORTLAND ST,Major Arterial,Toronto and East York,10.0,Intersection,At Intersection,Traffic Signal,Clear,Dark,Dry,Cyclist Collisions,Driver,30 to 34,4,West,1,1,1,7,15,not applicable,10,0,1,0,0,0,0,0,0,0,0,0,164,77,D14,15002,333.0
2,1755,GARDINER LAKESHORE E,LOWER JARVIS ST,Expressway Ramp,Toronto and East York,10.0,Mid-Block,Non Intersection,No Control,Clear,Daylight,Dry,Pedestrian Collisions,Driver,25 to 29,4,East,1,1,1,7,15,not applicable,10,1,1,0,0,0,0,0,0,0,0,0,166,77,D51,15003,154.0
3,1910,LANSDOWNE AVE,WALLACE AVE,Minor Arterial,Etobicoke York,9.0,Mid-Block,At/Near Private Drive,No Control,Clear,Daylight,Dry,Pedestrian Collisions,Driver,25 to 29,4,South,1,1,8,5,15,not applicable,10,1,1,0,0,0,0,0,0,0,0,0,171,93,D11,15004,334.0
4,1910,LANSDOWNE AVE,WALLACE AVE,Minor Arterial,Etobicoke York,9.0,Mid-Block,At/Near Private Drive,No Control,Clear,Daylight,Dry,Pedestrian Collisions,Pedestrian,25 to 29,1,North,29,15,12,10,15,On Sidewalk or Shoulder,7,1,1,0,0,0,0,0,0,0,0,0,171,93,D11,15005,334.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,1750,GLOUCESTER GRV,WINNETT AVE,Local,,,Intersection,At Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,5 to 9,2,North,29,15,12,10,11,Crossing with right of way,7,1,1,0,0,0,0,0,0,0,0,0,106,106,D13,18190,1188.0
3190,1750,GLOUCESTER GRV,WINNETT AVE,Local,,,Intersection,At Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,0 to 4,3,North,29,15,12,10,11,Crossing with right of way,7,1,1,0,0,0,0,0,0,0,0,0,106,106,D13,18191,1188.0
3191,1750,GLOUCESTER GRV,WINNETT AVE,Local,,,Intersection,At Intersection,Stop Sign,Clear,Dark,Wet,Pedestrian Collisions,Pedestrian,0 to 4,2,North,29,15,12,10,11,Crossing with right of way,7,1,1,0,0,0,0,0,0,0,0,0,106,106,D13,18192,1188.0
3192,2234,BLOOR ST W,MONTROSE AVE,Major Arterial,Toronto and East York,,Mid-Block,Non Intersection,No Control,Rain,"Dark, artificial",Wet,Pedestrian Collisions,Driver,35 to 39,4,East,1,1,1,7,15,not applicable,10,1,1,0,0,0,0,0,0,0,0,0,80,80,D14,18193,1189.0


In [274]:
# one_hot_encoded_data = pd.get_dummies(dft['injury'], prefix='injury')

# dft = pd.concat([dft, one_hot_encoded_data], axis=1)

In [275]:
dft.columns


Index(['time', 'street1', 'street2', 'road_class', 'district', 'wardnum',
       'loccoord', 'accloc', 'traffctl', 'visibility', 'light', 'rdsfcond',
       'impactype', 'invtype', 'invage', 'injury', 'initdir', 'vehtype',
       'manoeuver', 'drivact', 'drivcond', 'pedtype', 'pedact', 'pedcond',
       'pedestrian', 'automobile', 'motorcycle', 'truck', 'trsn_city_veh',
       'emerg_veh', 'passenger', 'speeding', 'ag_driv', 'redlight', 'alcohol',
       'hood_158', 'hood_140', 'division', 'objectid', 'acc_num'],
      dtype='object')

In [276]:
# dft = dft.drop(columns=[
#     # 'injury_Fatal',
#     'injury'
#     ], axis=1)


In [277]:
def map_time_to_segment(time):
    if time < 600:
        return 'Midnight'
    elif time < 1200:
        return 'Morning'
    elif time < 1700:
        return 'Afternoon'
    elif time < 2100:
        return 'Evening'
    else:
        return 'Night'

dft['time_segment'] = dft['time'].apply(map_time_to_segment)

In [278]:
dft = dft.drop('time', axis=1)

In [279]:
columns_to_encode = ['street1',
                     'wardnum',
                     'traffctl',
                     'division',
                     'street2',
                     'road_class',
                     'district',
                     'loccoord',
                     'accloc',
                     'accloc',
                     'visibility',
                     'light',
                     'rdsfcond',
                     'impactype',
                     'invtype',
                     'invage',
                     'initdir',
                     'pedact',
                    #  'injury_Minor',
                    #  "injury_Major",
                    #  'injury_Minimal',
                    #  'injury_not applicable',
                     'hood_158',
                     'hood_140',
                     'time_segment'
                     ]

label_encoder = LabelEncoder()

for column in columns_to_encode:
    dft[column] = label_encoder.fit_transform(dft[column])

In [280]:
dft


Unnamed: 0,street1,street2,road_class,district,wardnum,loccoord,accloc,traffctl,visibility,light,rdsfcond,impactype,invtype,invage,injury,initdir,vehtype,manoeuver,drivact,drivcond,pedtype,pedact,pedcond,pedestrian,automobile,motorcycle,truck,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,hood_158,hood_140,division,objectid,acc_num,time_segment
0,327,542,5,4,9,0,0,4,0,0,0,2,0,8,1,4,2,1,12,10,15,14,10,0,1,0,0,0,0,0,0,0,0,0,65,112,3,15001,333.0,1
1,327,542,5,4,9,0,0,4,0,0,0,2,2,5,4,4,1,1,1,7,15,14,10,0,1,0,0,0,0,0,0,0,0,0,65,112,3,15002,333.0,1
2,278,437,2,4,9,1,4,0,0,4,0,4,2,4,4,0,1,1,1,7,15,14,10,1,1,0,0,0,0,0,0,0,0,0,67,112,12,15003,154.0,1
3,348,703,7,0,8,1,1,0,0,4,0,4,2,4,4,2,1,1,8,5,15,14,10,1,1,0,0,0,0,0,0,0,0,0,72,130,0,15004,334.0,1
4,348,703,7,0,8,1,1,0,0,4,0,4,10,4,1,1,29,15,12,10,15,6,7,1,1,0,0,0,0,0,0,0,0,0,72,130,0,15005,334.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,286,727,4,5,54,0,0,2,0,0,7,4,10,9,2,1,29,15,12,10,11,2,7,1,1,0,0,0,0,0,0,0,0,0,7,8,2,18190,1188.0,1
3190,286,727,4,5,54,0,0,2,0,0,7,4,10,0,3,1,29,15,12,10,11,2,7,1,1,0,0,0,0,0,0,0,0,0,7,8,2,18191,1188.0,1
3191,286,727,4,5,54,0,0,2,0,0,7,4,10,0,2,1,29,15,12,10,11,2,7,1,1,0,0,0,0,0,0,0,0,0,7,8,2,18192,1188.0,1
3192,183,478,5,4,54,1,4,0,5,1,7,4,2,6,4,0,1,1,1,7,15,14,10,1,1,0,0,0,0,0,0,0,0,0,136,116,3,18193,1189.0,4


# -------------------------------------- With train data --------------------------------------

# Fitting data into models
* Spliting the data into train and test
* Apply data on model with cross validation
* Check the accuracy and classification report
* Check the confusion matrix


In [281]:
# splitting data into train and test split

In [282]:
x = df.drop('acclass',axis=1)
y = df['acclass']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Decision tree

In [291]:
model_tree = DecisionTreeClassifier()

model_tree.fit(x_train,y_train)

pred_tree = model_tree.predict(x_test)

accuracy_tree = accuracy_score(y_test, pred_tree)
print('Accuracy before CV score: ',accuracy_tree)
print('Classification report before cross-validation')
print(classification_report(y_test, pred_tree))

Accuracy before CV score:  0.9483568075117371
Classification report before cross-validation
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       510
           1       0.97      0.97      0.97      1620

    accuracy                           0.95      2130
   macro avg       0.93      0.93      0.93      2130
weighted avg       0.95      0.95      0.95      2130



In [293]:
# Get feature importance scores
feature_importance = model_tree.feature_importances_

# Convert feature importance to DataFrame
feature_importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importance})

# Sort DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print feature importance
print("\nFeature Importance:")
print(feature_importance_df)


Feature Importance:
          Feature  Importance
14         injury    0.799877
38        acc_num    0.016370
37       objectid    0.016140
35       hood_140    0.014205
0         street1    0.014195
1         street2    0.013732
34       hood_158    0.011674
4         wardnum    0.010643
30       speeding    0.009938
11      impactype    0.007544
13         invage    0.007157
6          accloc    0.006656
18        drivact    0.006595
36       division    0.006481
15        initdir    0.005846
2      road_class    0.005188
19       drivcond    0.004458
9           light    0.004259
23     pedestrian    0.004208
16        vehtype    0.004107
26          truck    0.004077
39   time_segment    0.003931
27  trsn_city_veh    0.003901
29      passenger    0.003146
10       rdsfcond    0.003048
8      visibility    0.002535
33        alcohol    0.002168
5        loccoord    0.001696
20        pedtype    0.001343
3        district    0.001184
17      manoeuver    0.001157
12        invtype  

# #XGB

In [284]:
model_xg = xgb.XGBClassifier(objective="binary:logistic")

model_xg.fit(x_train, y_train)

pred_xg = model_xg.predict(x_test)

accuracy_xg = accuracy_score(y_test, pred_xg)

print(accuracy_xg)
print(classification_report(y_test, pred_xg))

0.9769953051643192
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       510
           1       0.97      1.00      0.99      1620

    accuracy                           0.98      2130
   macro avg       0.98      0.95      0.97      2130
weighted avg       0.98      0.98      0.98      2130



In [290]:
# Get feature importance scores
feature_importance = model_xg.get_booster().get_score(importance_type='weight')

# Convert feature importance to DataFrame for easier manipulation and visualization
feature_importance_df = pd.DataFrame(feature_importance.items(), columns=['Feature', 'Importance'])
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print feature importance
print("\nFeature Importance:")
print(feature_importance_df)


Feature Importance:
          Feature  Importance
35        acc_num       190.0
1         street2       188.0
0         street1       173.0
34       objectid       145.0
14         injury       127.0
31       hood_158       115.0
4         wardnum       114.0
32       hood_140       104.0
13         invage        97.0
11      impactype        72.0
33       division        63.0
36   time_segment        60.0
6          accloc        52.0
28      passenger        37.0
29       speeding        36.0
15        initdir        33.0
9           light        32.0
3        district        29.0
10       rdsfcond        27.0
18        drivact        26.0
23     pedestrian        26.0
7        traffctl        26.0
16        vehtype        24.0
19       drivcond        17.0
26          truck        17.0
17      manoeuver        15.0
22        pedcond        15.0
8      visibility        14.0
5        loccoord        14.0
2      road_class        12.0
27  trsn_city_veh        12.0
20        pedtype  

# ------------------------- On Test Data ---------------------------------------

In [285]:
# splitting data for training and testing model on test data
x = df.drop('acclass', axis=1)
y = df['acclass']

In [286]:
df.columns

Index(['street1', 'street2', 'road_class', 'district', 'wardnum', 'loccoord',
       'accloc', 'traffctl', 'visibility', 'light', 'rdsfcond', 'acclass',
       'impactype', 'invtype', 'invage', 'injury', 'initdir', 'vehtype',
       'manoeuver', 'drivact', 'drivcond', 'pedtype', 'pedact', 'pedcond',
       'pedestrian', 'automobile', 'motorcycle', 'truck', 'trsn_city_veh',
       'emerg_veh', 'passenger', 'speeding', 'ag_driv', 'redlight', 'alcohol',
       'hood_158', 'hood_140', 'division', 'objectid', 'acc_num',
       'time_segment'],
      dtype='object')

In [287]:
dft.columns

Index(['street1', 'street2', 'road_class', 'district', 'wardnum', 'loccoord',
       'accloc', 'traffctl', 'visibility', 'light', 'rdsfcond', 'impactype',
       'invtype', 'invage', 'injury', 'initdir', 'vehtype', 'manoeuver',
       'drivact', 'drivcond', 'pedtype', 'pedact', 'pedcond', 'pedestrian',
       'automobile', 'motorcycle', 'truck', 'trsn_city_veh', 'emerg_veh',
       'passenger', 'speeding', 'ag_driv', 'redlight', 'alcohol', 'hood_158',
       'hood_140', 'division', 'objectid', 'acc_num', 'time_segment'],
      dtype='object')

# Decision Tree TEST data

In [288]:
model_dtree  = DecisionTreeClassifier()

model_dtree.fit(x, y)

dtree_pred = model_dtree.predict(dft)

class_mapping_dtree = {0: 'Fatal', 1: 'Non-Fatal Injury'}
pred_labels_dtree = [class_mapping_dtree[dtree_pred] for dtree_pred in dtree_pred]

# Create a DataFrame for predictions and save it to a CSV file
output_df = pd.DataFrame({'ObjectId': range(15001, 15001 + len(pred_labels_dtree)), 'ACCLASS': pred_labels_dtree})
output_df.to_csv('predictions_wo_injurytime_dtree.csv', index=False)

# XGB with TEST data

In [289]:
model_xgbt  = xgb.XGBClassifier(objective="binary:logistic")

model_xgbt.fit(x, y)

xgb_pred = model_xgbt.predict(dft)

class_mapping_xgb = {0: 'Fatal', 1: 'Non-Fatal Injury'}
pred_labels_xgb = [class_mapping_xgb[xgb_pred] for xgb_pred in xgb_pred]

# Create a DataFrame for predictions and save it to a CSV file
output_df = pd.DataFrame({'ObjectId': range(15001, 15001 + len(pred_labels_xgb)), 'ACCLASS': pred_labels_xgb})
output_df.to_csv('predictions_wo_injurytime_xgb.csv', index=False)