### **Mounting** on the Google Drive for the **Dataset**.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Reading** the Dataset and saving it in **accident**.

In [2]:
import pandas as pd
import numpy as np

accident = pd.read_csv('/content/drive/MyDrive/KCP/Copy of AccidentReports.csv')

### Seeing that there are **301761 rows** and **36 columns** in our Dataset. With many columns that might not be useful for us and a lot of NaN values.

In [3]:
accident

Unnamed: 0,DISTRICTNAME,UNITNAME,Crime_No,Year,RI,Noofvehicle_involved,Accident_Classification,Accident_Spot,Accident_Location,Accident_SubLocation,...,RoadJunction,Collision_TypeB,Accident_Road,Landmark_first,landmark_second,Distance_LandMark_First,Distance_LandMark_Second,Accident_Description,Latitude,Longitude
0,Bagalkot,Amengad PS,10470124520160139,2016,1,1,Road Accidents,Bottleneck,Rural Areas,Open area,...,,,AMINAGAD BAGALKOT SGH-20 ROAD NEAR TIPPANNA GO...,8,,8,,AMINAGADA TO BAGALKOT SH-20 ROAD NEAR TIPPANNA...,0.000000,0.000000
1,Bagalkot,Amengad PS,10470124520160143,2016,1,1,Road Accidents,Bridge,Villages settlement,Narrow bridge or culverts,...,,,SHIRUR AMINAGAD SH-20 ROAD NEAR KAMATAGI,14,,14,,SHIRUR AMINAGAD SH-20 ROAD NEAR KAMATAGI,0.000000,0.000000
2,Bagalkot,Amengad PS,10470124520160056,2016,1,2,Road Accidents,Bottleneck,City/Town,Near School or College,...,,,AMINAGAD TO BAGALKOT SH-20 ROAD NEAR BANATHIKOLLA,BANATHIKOLLA,,3,,AMINAGAD BAGALKOT SH-20 NEAR BANATHIKOLL,0.000000,0.000000
3,Bagalkot,Amengad PS,10470124520160134,2016,1,2,Road Accidents,Bottleneck,Rural Areas,Residential area,...,,,AMINAGAD BAGALKOT ROAD NEAR ADILASHA HOTEL,500 MITER,,500M,,AMINAGAD BAGALKOT SH-20 ROAD NEAR ADILSHA HOTELA,0.000000,0.000000
4,Bagalkot,Amengad PS,10470124520160161,2016,1,1,Road Accidents,Cross roads,City/Town,At pedestrian Crossing,...,,,AMD TO BGK SH-20 ROAD NEAR AMINGAD SULEBAVI CROSS,100MM,,100MM,,AMD TO BGK SH-20 ROAD NEAR AMINAGAD SULEBAVI C...,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301756,Yadgir,Yadgiri Traffic PS,10978215920230010,2023,1,2,Road Accidents,Cross roads,City/Town,Near School or College,...,,Run Off Road,NEAR DEGREE COLLEGE CROSS YADAGIRI,01 KM,,01 KM,,ಪೂರ್ವಕ್ಕೆ- ಯಾದಗಿರಿ ನಗರದ ಕಡೆ ಬರುವ ಎನ್ ಹೆಚ್ 150 ...,16.752914,77.127311
301757,Yadgir,Yadgiri Traffic PS,10978215920230030,2023,1,2,Road Accidents,Cross roads,City/Town,Near office complex,...,,Hit and Run,NEAR TAMILUNADU BANK,2 KM,,1.5KM,,ಪೂರ್ವಕ್ಕೆ- ದೋಖಾ ಶಾಲೆ ಕಡೆ ಹೋಗುವ ರಸ್ತೆ ಇರುತ್ತದೆ....,0.000000,0.000000
301758,Yadgir,Yadgiri Traffic PS,10978215920230037,2023,1,1,Road Accidents,Not Applicable,City/Town,Near a factory industrial area,...,,Skidding or Self accident,"Yadgir-Hyderabad Road, Near Saidapur Hotel Yadgir",Near Saidapur Hotel Yadgir,,50 mt,,"Yadgir-Hyderabad Road, Near Saidapur Hotel Yadgir",0.000000,0.000000
301759,Yadgir,Yadgiri Traffic PS,10978215920230044,2023,1,1,Road Accidents,Cross roads,City/Town,Residential area,...,,Run Off Road,NEAR GUNJ 2nd GATE YADAGIRI,03 KM,,03 KM,,ಪೂರ್ವಕ್ಕೆ-ಶ್ರೀ ಬಸವೇಶ್ವರ ಎ.ಪಿ.ಎಮ್.ಸಿ (ಗಂಜ್) ಎರಡ...,16.763682,77.147579


### **Converting** all the **Not Applicable** values in the Dataset to **NaN** making it easier to work on them.

In [4]:
accident.replace('Not Applicable', np.nan, inplace=True)

### Converting the Dataset to Boolean making it easier to read **Nan** as **NaN** will be termed as a *True* value and rest will be seen as *False* and storing it in a new variable named **missing_data**.

In [5]:
missing_data = accident.isnull()
missing_data

Unnamed: 0,DISTRICTNAME,UNITNAME,Crime_No,Year,RI,Noofvehicle_involved,Accident_Classification,Accident_Spot,Accident_Location,Accident_SubLocation,...,RoadJunction,Collision_TypeB,Accident_Road,Landmark_first,landmark_second,Distance_LandMark_First,Distance_LandMark_Second,Accident_Description,Latitude,Longitude
0,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301756,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,False
301757,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,False
301758,False,False,False,False,False,False,False,True,False,False,...,True,False,False,False,True,False,True,False,False,False
301759,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,False


### Printing the number of NaN values in all the Columns amd checking for the **Datatype** of the values stored in all the columns.

In [6]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

DISTRICTNAME
DISTRICTNAME
False    301761
Name: count, dtype: int64

UNITNAME
UNITNAME
False    301761
Name: count, dtype: int64

Crime_No
Crime_No
False    301761
Name: count, dtype: int64

Year
Year
False    301761
Name: count, dtype: int64

RI
RI
False    301761
Name: count, dtype: int64

Noofvehicle_involved
Noofvehicle_involved
False    301761
Name: count, dtype: int64

Accident_Classification
Accident_Classification
False    281669
True      20092
Name: count, dtype: int64

Accident_Spot
Accident_Spot
False    223277
True      78484
Name: count, dtype: int64

Accident_Location
Accident_Location
False    295107
True       6654
Name: count, dtype: int64

Accident_SubLocation
Accident_SubLocation
False    301760
True          1
Name: count, dtype: int64

Accident_SpotB
Accident_SpotB
False    176684
True     125077
Name: count, dtype: int64

Main_Cause
Main_Cause
False    247837
True      53924
Name: count, dtype: int64

Hit_Run
Hit_Run
False    247227
True      54534
Name: count, d

### Dropping of Columns based on :


*   Required input columns making sense for **Prediction**.
*   Required output columns making sense for **Prediction**.
*   Number of **Nan** values as more number of redundant values would hamper the model.






In [7]:
drop_columns = ['Year','Accident_Description','Distance_LandMark_Second','Distance_LandMark_First','landmark_second',
                'Landmark_first','Accident_Road','Collision_TypeB','RoadJunction','Side_Walk','Spot_Conditions',
                'Road_Markings','Lane_Type','Road_Condition','Surface_Condition','Surface_Type','Road_Type','UNITNAME',
                'RI','Accident_Classification','Accident_Spot','Accident_SpotB','Main_Cause','Hit_Run','Collision_Type',
                'Junction_Control','Accident_SubLocation','Road_Character']
accident.drop(columns = drop_columns, axis=1, inplace=True)

Seeing that the Columns left are :

1.   DISTRICTNAME
1.   Crime_No
1.   Noofvehicle_involved
2.   Accident_Location
2.   Severity
2.   Weather
1.   Latitude
2.   Longitude





In [8]:
accident

Unnamed: 0,DISTRICTNAME,Crime_No,Noofvehicle_involved,Accident_Location,Severity,Weather,Latitude,Longitude
0,Bagalkot,10470124520160139,1,Rural Areas,Grievous Injury,Clear,0.000000,0.000000
1,Bagalkot,10470124520160143,1,Villages settlement,Fatal,Light Rain,0.000000,0.000000
2,Bagalkot,10470124520160056,2,City/Town,Damage Only,Clear,0.000000,0.000000
3,Bagalkot,10470124520160134,2,Rural Areas,Damage Only,Clear,0.000000,0.000000
4,Bagalkot,10470124520160161,1,City/Town,Fatal,Clear,0.000000,0.000000
...,...,...,...,...,...,...,...,...
301756,Yadgir,10978215920230010,2,City/Town,Grievous Injury,Clear,16.752914,77.127311
301757,Yadgir,10978215920230030,2,City/Town,Grievous Injury,Fine,0.000000,0.000000
301758,Yadgir,10978215920230037,1,City/Town,Fatal,Fine,0.000000,0.000000
301759,Yadgir,10978215920230044,1,City/Town,Grievous Injury,Clear,16.763682,77.147579


### Just checking in for unique values in a column to see if any unknown values are'nt left.

In [9]:
unique_values = accident['Weather'].value_counts()

print("Unique values and their frequencies:")
print(unique_values)


Unique values and their frequencies:
Weather
Clear                            129320
Fine                              97032
Others                            43773
Cloudy                             7699
Very Hot                           2832
Light Rain                         2226
Very Cold                          1732
Fog / Mist                         1259
Snow                               1140
Wind                                711
Dust Storn                          636
Mist or Fog                         592
Heavy Rain                          496
Flooding of Slipways/Rivulets       160
Strong Wind                         133
Hail or Sleet                       113
Name: count, dtype: int64


In [10]:
unique_values = accident['Severity'].value_counts()

print("Unique values and their frequencies:")
print(unique_values)

Unique values and their frequencies:
Severity
Grievous Injury    128846
Simple Injury       70954
Fatal               68604
Damage Only         24607
BUDDHISTS               2
ACHARI                  1
Roof                    1
Father                  1
MEDARA                  1
Others                  1
Name: count, dtype: int64


### Seeing the values in the Severity column we see that the unique values are classified and out of which some have very few occurunces or have no meaning at all.

In [11]:
accident.drop(accident[accident['Severity'] == 'BUDDHISTS'].index, inplace=True)
accident.reset_index(drop=True, inplace=True)

accident.drop(accident[accident['Severity'] == 'ACHARI'].index, inplace=True)
accident.reset_index(drop=True, inplace=True)

accident.drop(accident[accident['Severity'] == 'Roof'].index, inplace=True)
accident.reset_index(drop=True, inplace=True)

accident.drop(accident[accident['Severity'] == 'Father'].index, inplace=True)
accident.reset_index(drop=True, inplace=True)

accident.drop(accident[accident['Severity'] == 'MEDARA'].index, inplace=True)
accident.reset_index(drop=True, inplace=True)

accident.drop(accident[accident['Severity'] == 'Others'].index, inplace=True)
accident.reset_index(drop=True, inplace=True)

### **Removing** these values for a better model.

In [12]:
unique_values = accident['Severity'].value_counts()

print("Unique values and their frequencies:")
print(unique_values)

Unique values and their frequencies:
Severity
Grievous Injury    128846
Simple Injury       70954
Fatal               68604
Damage Only         24607
Name: count, dtype: int64


In [13]:
unique_values = accident['Accident_Location'].value_counts()

print("Unique values and their frequencies:")
print(unique_values)

Unique values and their frequencies:
Accident_Location
Rural Areas            171882
City/Town              111939
Villages settlement     11279
Name: count, dtype: int64


### Checking on the datatypes of the values in the columns we are working on.

In [14]:
print(accident.dtypes)

DISTRICTNAME             object
Crime_No                  int64
Noofvehicle_involved      int64
Accident_Location        object
Severity                 object
Weather                  object
Latitude                float64
Longitude               float64
dtype: object


### This gives us the overview of our Dataset like the mean, frequency and many more.

In [15]:
accident.describe(include = 'all')

Unnamed: 0,DISTRICTNAME,Crime_No,Noofvehicle_involved,Accident_Location,Severity,Weather,Latitude,Longitude
count,301754,301754.0,301754.0,295100,293011,289847,301754.0,301754.0
unique,38,,,3,4,16,,
top,Bengaluru City,,,Rural Areas,Grievous Injury,Clear,,
freq,34526,,,171882,128846,129318,,
mean,,1.053233e+16,1.578816,,,,4.578621,24.353332
std,,188418800000000.0,0.615193,,,,6.960341,35.572835
min,,1.043801e+16,1.0,,,,0.0,0.0
25%,,1.044415e+16,1.0,,,,0.0,0.0
50%,,1.045522e+16,2.0,,,,0.0,0.0
75%,,1.046416e+16,2.0,,,,12.941544,75.058825


### Working on the NaN values.

In [16]:
missing_data = accident.isnull()
missing_data

Unnamed: 0,DISTRICTNAME,Crime_No,Noofvehicle_involved,Accident_Location,Severity,Weather,Latitude,Longitude
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
301749,False,False,False,False,False,False,False,False
301750,False,False,False,False,False,False,False,False
301751,False,False,False,False,False,False,False,False
301752,False,False,False,False,False,False,False,False


In [17]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

DISTRICTNAME
DISTRICTNAME
False    301754
Name: count, dtype: int64

Crime_No
Crime_No
False    301754
Name: count, dtype: int64

Noofvehicle_involved
Noofvehicle_involved
False    301754
Name: count, dtype: int64

Accident_Location
Accident_Location
False    295100
True       6654
Name: count, dtype: int64

Severity
Severity
False    293011
True       8743
Name: count, dtype: int64

Weather
Weather
False    289847
True      11907
Name: count, dtype: int64

Latitude
Latitude
False    301754
Name: count, dtype: int64

Longitude
Longitude
False    301754
Name: count, dtype: int64



### In the **Latitude** and **Longitude** column we can see the there are no null values but there are values with zero which will be considered as **outliers** of the dataset.

In [18]:
accident = accident[(accident['Latitude'] != 0) & (accident['Longitude'] != 0)]

# Optionally, you can reset the index if you want
accident.reset_index(drop=True, inplace=True)


### Rest removing the other NaN values as the values in these columns are strings so the methods of data filling in the dataset would not work.

In [19]:
accident.dropna(subset=["Accident_Location"], axis=0, inplace=True)

accident.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accident.dropna(subset=["Accident_Location"], axis=0, inplace=True)


In [20]:
accident.dropna(subset=["Severity"], axis=0, inplace=True)

accident.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accident.dropna(subset=["Severity"], axis=0, inplace=True)


In [21]:
accident.dropna(subset=["Weather"], axis=0, inplace=True)

accident.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accident.dropna(subset=["Weather"], axis=0, inplace=True)


### Now there are no NaN values in our dataset.

In [22]:
missing_data = accident.isnull()
missing_data.head(5)

for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

DISTRICTNAME
DISTRICTNAME
False    92705
Name: count, dtype: int64

Crime_No
Crime_No
False    92705
Name: count, dtype: int64

Noofvehicle_involved
Noofvehicle_involved
False    92705
Name: count, dtype: int64

Accident_Location
Accident_Location
False    92705
Name: count, dtype: int64

Severity
Severity
False    92705
Name: count, dtype: int64

Weather
Weather
False    92705
Name: count, dtype: int64

Latitude
Latitude
False    92705
Name: count, dtype: int64

Longitude
Longitude
False    92705
Name: count, dtype: int64



### Getting the information on our cleaned dataset.

In [23]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92705 entries, 0 to 92704
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   DISTRICTNAME          92705 non-null  object 
 1   Crime_No              92705 non-null  int64  
 2   Noofvehicle_involved  92705 non-null  int64  
 3   Accident_Location     92705 non-null  object 
 4   Severity              92705 non-null  object 
 5   Weather               92705 non-null  object 
 6   Latitude              92705 non-null  float64
 7   Longitude             92705 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 5.7+ MB


In [24]:
DISTRICTNAME_dummies = pd.get_dummies(accident['DISTRICTNAME']);
DISTRICTNAME_dummies

Unnamed: 0,Bagalkot,Ballari,Belagavi City,Belagavi Dist,Bengaluru City,Bengaluru Dist,Bidar,Chamarajanagar,Chickballapura,Chikkamagaluru,...,Mysuru Dist,Raichur,Ramanagara,Shivamogga,Tumakuru,Udupi,Uttara Kannada,Vijayanagara,Vijayapur,Yadgir
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92700,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
92701,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
92702,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
92703,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### Since the dataset we have contains columns which have their datatype as string and the values in these columns have no intra relation so we chose to **one hot encode** them and replace them with the original columns.

In [25]:
accident = pd.concat([accident,DISTRICTNAME_dummies],axis = 1)

In [26]:
accident.drop('DISTRICTNAME',axis = 1,inplace = True)

In [27]:
DISTRICTNAME_dummies = pd.get_dummies(accident['Accident_Location']);
accident = pd.concat([accident,DISTRICTNAME_dummies],axis = 1)
accident.drop('Accident_Location',axis = 1,inplace = True)

In [28]:
DISTRICTNAME_dummies = pd.get_dummies(accident['Weather']);
accident = pd.concat([accident,DISTRICTNAME_dummies],axis = 1)
accident.drop('Weather',axis = 1,inplace = True)

In [29]:
DISTRICTNAME_dummies = pd.get_dummies(accident['Severity']);
accident = pd.concat([accident,DISTRICTNAME_dummies],axis = 1)
accident.drop('Severity',axis = 1,inplace = True)

### This is the **one hot encoded** dataset.

In [30]:
accident

Unnamed: 0,Crime_No,Noofvehicle_involved,Latitude,Longitude,Bagalkot,Ballari,Belagavi City,Belagavi Dist,Bengaluru City,Bengaluru Dist,...,Others,Snow,Strong Wind,Very Cold,Very Hot,Wind,Damage Only,Fatal,Grievous Injury,Simple Injury
0,10470124520190064,1,16.063833,75.924106,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,10470124520190069,2,15.899076,75.932457,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,10470124520190075,3,16.098510,75.857045,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
3,10470124520190046,1,15.985381,75.870481,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,10470124520190055,1,16.097321,75.948619,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92700,10978215920230057,1,16.741806,77.124326,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
92701,10978215920230059,2,16.765804,77.132050,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
92702,10978215920230070,2,16.477004,77.108417,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
92703,10978215920230010,2,16.752914,77.127311,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


### As now these column values are in boolean (i.e in 1's and 0's), so working on them is easier.

In [31]:
accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92705 entries, 0 to 92704
Data columns (total 65 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Crime_No                       92705 non-null  int64  
 1   Noofvehicle_involved           92705 non-null  int64  
 2   Latitude                       92705 non-null  float64
 3   Longitude                      92705 non-null  float64
 4   Bagalkot                       92705 non-null  bool   
 5   Ballari                        92705 non-null  bool   
 6   Belagavi City                  92705 non-null  bool   
 7   Belagavi Dist                  92705 non-null  bool   
 8   Bengaluru City                 92705 non-null  bool   
 9   Bengaluru Dist                 92705 non-null  bool   
 10  Bidar                          92705 non-null  bool   
 11  Chamarajanagar                 92705 non-null  bool   
 12  Chickballapura                 92705 non-null 

In [32]:
column_names = ['Noofvehicle_involved','Bagalkot','Ballari','Belagavi City','Belagavi Dist','Bengaluru City','Bengaluru Dist','Bidar','Chamarajanagar','Chickballapura','Chikkamagaluru',
                 'Chitradurga','Dakshina Kannada','Davanagere','Dharwad','Gadag','Hassan','Haveri','Hubballi Dharwad City','K.G.F','Kalaburagi','Kalaburagi City','Karnataka Railways',
                 'Kodagu','Kolar','Koppal','Mandya','Mangaluru City','Mysuru City','Mysuru Dist','Raichur','Ramanagara','Shivamogga','Tumakuru','Udupi','Uttara Kannada','Vijayanagara',
                 'Vijayapur','Yadgir','City/Town','Rural Areas','Villages settlement','Clear','Cloudy','Dust Storn','Fine','Flooding of Slipways/Rivulets','Fog / Mist','Hail or Sleet',
                'Heavy Rain','Light Rain','Mist or Fog','Others','Snow','Strong Wind','Very Cold','Very Hot','Wind']

for i in column_names:
    accident[i] = accident[i].astype(int)

In [33]:
accident

Unnamed: 0,Crime_No,Noofvehicle_involved,Latitude,Longitude,Bagalkot,Ballari,Belagavi City,Belagavi Dist,Bengaluru City,Bengaluru Dist,...,Others,Snow,Strong Wind,Very Cold,Very Hot,Wind,Damage Only,Fatal,Grievous Injury,Simple Injury
0,10470124520190064,1,16.063833,75.924106,1,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,10470124520190069,2,15.899076,75.932457,1,0,0,0,0,0,...,1,0,0,0,0,0,False,False,True,False
2,10470124520190075,3,16.098510,75.857045,1,0,0,0,0,0,...,1,0,0,0,0,0,False,False,True,False
3,10470124520190046,1,15.985381,75.870481,1,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
4,10470124520190055,1,16.097321,75.948619,1,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92700,10978215920230057,1,16.741806,77.124326,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
92701,10978215920230059,2,16.765804,77.132050,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
92702,10978215920230070,2,16.477004,77.108417,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
92703,10978215920230010,2,16.752914,77.127311,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False


### Dropping off the **Outliers** from the Dataset.

In [34]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(accident)
mask = yhat != -1
accident = accident[mask]
accident.reset_index(drop=True, inplace=True)
accident



Unnamed: 0,Crime_No,Noofvehicle_involved,Latitude,Longitude,Bagalkot,Ballari,Belagavi City,Belagavi Dist,Bengaluru City,Bengaluru Dist,...,Others,Snow,Strong Wind,Very Cold,Very Hot,Wind,Damage Only,Fatal,Grievous Injury,Simple Injury
0,10470124520190064,1,16.063833,75.924106,1,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,10470124520190069,2,15.899076,75.932457,1,0,0,0,0,0,...,1,0,0,0,0,0,False,False,True,False
2,10470124520190075,3,16.098510,75.857045,1,0,0,0,0,0,...,1,0,0,0,0,0,False,False,True,False
3,10470124520190046,1,15.985381,75.870481,1,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
4,10470124520190055,1,16.097321,75.948619,1,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83429,10978215920230013,2,16.775935,77.193939,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
83430,10978215920230055,2,16.753550,77.142326,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
83431,10978215920230042,2,16.758402,77.123453,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False
83432,10978215920230010,2,16.752914,77.127311,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,True,False


### Here we are seeing the quality of data by checking in for **duplicates**, here's why we have have kept the column for **Crime_No** it wont be used for the model but for the uniqueness of the data.

In [35]:
# Remove duplicate rows
accident.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accident.drop_duplicates(inplace=True)


In [36]:
accident.sort_values(by=accident.columns.tolist(), inplace=True)
diff = accident.diff()
duplicates = diff.all(axis=1)
print(duplicates.value_counts())

False    83433
True         1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accident.sort_values(by=accident.columns.tolist(), inplace=True)


### Splitting the Date into **training** and **testing**.

In [37]:
X = accident.iloc[:,1:-4]
y = accident.iloc[:,-4:]

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1,random_state = 73)

### **Feature Scaling**

In [40]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train.iloc[:, 1:-4] = sc.fit_transform(X_train.iloc[:, 1:-4])

In [41]:
X_test.iloc[:, 1:-4] = sc.transform(X_test.iloc[:, 1:-4])

### Using different models for preicting **Severity**.

In [42]:
from sklearn.ensemble import RandomForestRegressor                                 #using 1

# Create and train the multi-output random forest regressor
multi_output_rf = RandomForestRegressor(n_estimators=100, random_state=42)
multi_output_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = multi_output_rf.predict(X_test)

# Evaluate the model
# You can use any evaluation metric suitable for your problem (e.g., RMSE, R^2 score, etc.)
# For example, you can use mean squared error (MSE) for each output variable
mse = ((y_test - y_pred) ** 2).mean(axis=0)
print("Mean Squared Error (MSE) for each output variable:")
print(mse)


Mean Squared Error (MSE) for each output variable:
Damage Only        0.074763
Fatal              0.193376
Grievous Injury    0.259327
Simple Injury      0.150554
dtype: float64


#### Accuracy for each output variable :
#### Damage Only        0.938039
#### Fatal              0.735499
#### Grievous Injury    0.608941
#### Simple Injury      0.792306
**Aggeregate Accuracy = 76.80**

In [43]:
# Round the predicted values to the nearest integer
y_pred_rounded = np.round(y_pred)

# Convert the regression problem into a classification problem
y_test_class = np.round(y_test)

# Calculate the accuracy
accuracy = (y_pred_rounded == y_test_class).mean()
print("Accuracy:", accuracy)


Accuracy: Damage Only        0.909636
Fatal              0.727229
Grievous Injury    0.605465
Simple Injury      0.794104
dtype: float64


In [44]:
import pandas as pd

# Assuming 'new_data' is your new data for prediction
new_data = pd.DataFrame({
    'Noofvehicle_involved': ['1'],
    'Latitude': ['16.063833'],
    'Longitude': ['75.924106'],
    'DISTRICTNAME': ['Bagalkot'],
    'Accident_Location': ['Rural Areas'],
    'Weather': ['Clear']
})

# One-hot encode categorical variables
new_data_encoded = pd.get_dummies(new_data, columns=['Noofvehicle_involved','Latitude','Longitude','DISTRICTNAME', 'Accident_Location', 'Weather'])

# Ensure consistent column order with training data
new_data_aligned = new_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Make predictions
y_pred = multi_output_rf.predict(new_data_aligned)

# Display the predictions
print("Predictions:")
for i, col in enumerate(y.columns):
    print(f"{col}: {y_pred[0][i]}")


Predictions:
Damage Only: 0.03
Fatal: 0.49
Grievous Injury: 0.14
Simple Injury: 0.34


In [45]:
from sklearn.preprocessing import StandardScaler                                                          #using 2
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Standardize the input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the number of output variables
num_outputs = len(y_train.columns)

# Build the neural network model
inputs = Input(shape=(X_train_scaled.shape[1],))
hidden = Dense(64, activation='relu')(inputs)
hidden = Dense(32, activation='relu')(hidden)
outputs = [Dense(1, activation='linear', name=f'output_{i}')(hidden) for i in range(num_outputs)]

# Create the model
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, [y_train[col] for col in y_train.columns], epochs=100, batch_size=32, verbose=1)

# Evaluate the model on the test set
loss = model.evaluate(X_test_scaled, [y_test[col] for col in y_test.columns], verbose=0)
print("Test Loss:", loss)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [46]:
import pandas as pd

# Assuming 'new_data' is your new data for prediction
new_data = pd.DataFrame({
    'Noofvehicle_involved': ['1'],
    'Latitude': ['16.063833'],
    'Longitude': ['75.924106'],
    'DISTRICTNAME': ['Yadgir'],
    'Accident_Location': ['Rural Areas'],
    'Weather': ['Clear']
})

# One-hot encode categorical variables
new_data_encoded = pd.get_dummies(new_data, columns=['Noofvehicle_involved','Latitude','Longitude','DISTRICTNAME', 'Accident_Location', 'Weather'])

# Ensure consistent column order with training data
new_data_aligned = new_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Standardize the input features
new_data_scaled = scaler.transform(new_data_aligned)

# Make predictions
predictions = model.predict(new_data_scaled)

# Display the predictions
print("Predictions:")
for i, col in enumerate(y.columns):
    print(f"{col}: {predictions[i]}")


Predictions:
Damage Only: [[0.04405765]]
Fatal: [[0.528998]]
Grievous Injury: [[0.3255911]]
Simple Injury: [[0.10691943]]
