In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'us-accidents:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F199387%2F5793796%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240401%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240401T142025Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D40c1055ac0e2a7f7a897cda8dfbf87589bdb4ae872e91aa79b2edf0d4b6e2df11d8fcf45a271ca0b19346afa0f933b9ad114333aa628e410cbfbe51cc536e0a2ca559781031f6f4de837c4d1d7c29ddefc94e140507abd8da50dd148bb3de486900576430792ae357967148dd923cc62c56555fb901be17c2e153314fd6159a70c69f6fdac162f7b9da8b6edcbcaaecd14913d1635c7e1691b09c7b1c13d316589190b9e29d0833633d499ec17956c1b0f8b4977972f4722c3da176bfe103647d90e1134ad79c7179c687c975a96dc9890e2c260f47767c7e7ae7ca36b48934f8a42d977deb2bd4459753b77722c2b5d8323bebed58df9c706800d316e7558eb'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading us-accidents, 684855912 bytes compressed
Downloaded and uncompressed: us-accidents
Data source import complete.


In [2]:
#importing the library
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [3]:
#loading the dataset
df=pd.read_csv('/kaggle/input/us-accidents/US_Accidents_March23.csv',nrows=10000)

In [4]:
df

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,A-9996,Source2,2,2017-01-06 16:01:06,2017-01-06 16:30:34,38.701267,-121.077751,,,0.00,...,False,False,False,False,False,False,Day,Day,Day,Day
9996,A-9997,Source2,2,2017-01-06 16:14:00,2017-01-06 16:43:38,36.981407,-122.011192,,,0.01,...,False,True,False,False,False,False,Day,Day,Day,Day
9997,A-9998,Source2,3,2017-01-06 16:08:58,2017-01-06 16:38:48,37.326691,-121.940720,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
9998,A-9999,Source2,3,2017-01-06 16:25:01,2017-01-06 16:54:51,37.930088,-122.324036,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day


In [5]:
#getting the preliminary information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     10000 non-null  object 
 1   Source                 10000 non-null  object 
 2   Severity               10000 non-null  int64  
 3   Start_Time             10000 non-null  object 
 4   End_Time               10000 non-null  object 
 5   Start_Lat              10000 non-null  float64
 6   Start_Lng              10000 non-null  float64
 7   End_Lat                0 non-null      float64
 8   End_Lng                0 non-null      float64
 9   Distance(mi)           10000 non-null  float64
 10  Description            10000 non-null  object 
 11  Street                 10000 non-null  object 
 12  City                   10000 non-null  object 
 13  County                 10000 non-null  object 
 14  State                  10000 non-null  object 
 15  Zip

In [6]:
#checking for missing values
df.isna().sum()

ID                           0
Source                       0
Severity                     0
Start_Time                   0
End_Time                     0
Start_Lat                    0
Start_Lng                    0
End_Lat                  10000
End_Lng                  10000
Distance(mi)                 0
Description                  0
Street                       0
City                         0
County                       0
State                        0
Zipcode                      1
Country                      0
Timezone                     1
Airport_Code                 1
Weather_Timestamp           31
Temperature(F)              85
Wind_Chill(F)             8574
Humidity(%)                124
Pressure(in)                35
Visibility(mi)              96
Wind_Direction              31
Wind_Speed(mph)           1774
Precipitation(in)         8877
Weather_Condition           77
Amenity                      0
Bump                         0
Crossing                     0
Give_Way

In [7]:
df.isna().mean()

ID                       0.0000
Source                   0.0000
Severity                 0.0000
Start_Time               0.0000
End_Time                 0.0000
Start_Lat                0.0000
Start_Lng                0.0000
End_Lat                  1.0000
End_Lng                  1.0000
Distance(mi)             0.0000
Description              0.0000
Street                   0.0000
City                     0.0000
County                   0.0000
State                    0.0000
Zipcode                  0.0001
Country                  0.0000
Timezone                 0.0001
Airport_Code             0.0001
Weather_Timestamp        0.0031
Temperature(F)           0.0085
Wind_Chill(F)            0.8574
Humidity(%)              0.0124
Pressure(in)             0.0035
Visibility(mi)           0.0096
Wind_Direction           0.0031
Wind_Speed(mph)          0.1774
Precipitation(in)        0.8877
Weather_Condition        0.0077
Amenity                  0.0000
Bump                     0.0000
Crossing

In [8]:
null_columns=['End_Lat','End_Lng','Precipitation(in)','Wind_Chill(F)']
df=df.drop(null_columns,axis=1)

In [9]:
df

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,A-9996,Source2,2,2017-01-06 16:01:06,2017-01-06 16:30:34,38.701267,-121.077751,0.00,Accident on Brackenwood Pl at Melrose Way.,Brackenwood Pl,...,False,False,False,False,False,False,Day,Day,Day,Day
9996,A-9997,Source2,2,2017-01-06 16:14:00,2017-01-06 16:43:38,36.981407,-122.011192,0.01,Accident on Water St at Benito Ave.,Water St,...,False,True,False,False,False,False,Day,Day,Day,Day
9997,A-9998,Source2,3,2017-01-06 16:08:58,2017-01-06 16:38:48,37.326691,-121.940720,0.01,Accident on I-880 Northbound at Exits 1A 1B 1C...,I-880 N,...,False,False,False,False,False,False,Day,Day,Day,Day
9998,A-9999,Source2,3,2017-01-06 16:25:01,2017-01-06 16:54:51,37.930088,-122.324036,0.01,Accident on I-80 Eastbound at Exit 15 Cutting ...,I-80 E,...,False,False,False,False,False,False,Day,Day,Day,Day


In [10]:
df=df.dropna(axis=0).reset_index(drop=True)

In [11]:
df

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,...,False,False,False,False,True,False,Night,Night,Day,Day
1,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,...,False,False,False,False,False,False,Night,Day,Day,Day
2,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,...,False,False,False,False,True,False,Day,Day,Day,Day
3,A-6,Source2,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.01,Accident on I-270 Outerbelt Northbound near Ex...,Westerville Rd,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-7,Source2,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.00,Accident on Oakridge Dr at Woodward Ave. Expec...,N Woodward Ave,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,A-9995,Source3,2,2017-01-06 16:00:55,2017-01-06 16:30:43,37.066490,-121.219147,0.01,Accident on CA-152 Pacheco Pass Hwy Westbound ...,Pacheco Pass Hwy,...,False,False,True,False,False,False,Day,Day,Day,Day
8126,A-9997,Source2,2,2017-01-06 16:14:00,2017-01-06 16:43:38,36.981407,-122.011192,0.01,Accident on Water St at Benito Ave.,Water St,...,False,True,False,False,False,False,Day,Day,Day,Day
8127,A-9998,Source2,3,2017-01-06 16:08:58,2017-01-06 16:38:48,37.326691,-121.940720,0.01,Accident on I-880 Northbound at Exits 1A 1B 1C...,I-880 N,...,False,False,False,False,False,False,Day,Day,Day,Day
8128,A-9999,Source2,3,2017-01-06 16:25:01,2017-01-06 16:54:51,37.930088,-122.324036,0.01,Accident on I-80 Eastbound at Exit 15 Cutting ...,I-80 E,...,False,False,False,False,False,False,Day,Day,Day,Day


In [None]:
df.isna().sum()

In [None]:
#checking for unique values in each columns
{column:len(df[column].unique()) for column in df.columns if df.dtypes[column]=='object'}

In [12]:
#removing unnecessary column

unneeded_columns=['ID','Description','Street','City','Zipcode','Country']


df=df.drop(unneeded_columns,axis=1)

In [13]:
df

Unnamed: 0,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),County,State,Timezone,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Clermont,OH,US/Eastern,...,False,False,False,False,True,False,Night,Night,Day,Day
1,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Montgomery,OH,US/Eastern,...,False,False,False,False,False,False,Night,Day,Day,Day
2,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Montgomery,OH,US/Eastern,...,False,False,False,False,True,False,Day,Day,Day,Day
3,Source2,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.01,Franklin,OH,US/Eastern,...,False,False,False,False,False,False,Day,Day,Day,Day
4,Source2,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.00,Montgomery,OH,US/Eastern,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,Source3,2,2017-01-06 16:00:55,2017-01-06 16:30:43,37.066490,-121.219147,0.01,Santa Clara,CA,US/Pacific,...,False,False,True,False,False,False,Day,Day,Day,Day
8126,Source2,2,2017-01-06 16:14:00,2017-01-06 16:43:38,36.981407,-122.011192,0.01,Santa Cruz,CA,US/Pacific,...,False,True,False,False,False,False,Day,Day,Day,Day
8127,Source2,3,2017-01-06 16:08:58,2017-01-06 16:38:48,37.326691,-121.940720,0.01,Santa Clara,CA,US/Pacific,...,False,False,False,False,False,False,Day,Day,Day,Day
8128,Source2,3,2017-01-06 16:25:01,2017-01-06 16:54:51,37.930088,-122.324036,0.01,Contra Costa,CA,US/Pacific,...,False,False,False,False,False,False,Day,Day,Day,Day


In [14]:
def get_years(df,column):
    return df[column].apply(lambda date:date[0:4])
def get_months(df,column):
    return df[column].apply(lambda date:date[5:7])

In [15]:
df['Start_time_month']=get_months(df,'Start_Time')
df['Start_time_year']=get_years(df,'Start_Time')
df['End_time_month']=get_months(df,'End_Time')
df['End_time_year']=get_years(df,'End_Time')
df['Weather_timestamp_month']=get_months(df,'Weather_Timestamp')
df['Weather_timestamp_year']=get_years(df,'Weather_Timestamp')



df=df.drop(['Start_Time','End_Time','Weather_Timestamp'],axis=1)

In [16]:
df

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),County,State,Timezone,Airport_Code,Temperature(F),...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Start_time_month,Start_time_year,End_time_month,End_time_year,Weather_timestamp_month,Weather_timestamp_year
0,Source2,2,39.063148,-84.032608,0.01,Clermont,OH,US/Eastern,KI69,36.0,...,Night,Night,Day,Day,02,2016,02,2016,02,2016
1,Source2,3,39.747753,-84.205582,0.01,Montgomery,OH,US/Eastern,KDAY,35.1,...,Night,Day,Day,Day,02,2016,02,2016,02,2016
2,Source2,2,39.627781,-84.188354,0.01,Montgomery,OH,US/Eastern,KMGY,36.0,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
3,Source2,3,40.100590,-82.925194,0.01,Franklin,OH,US/Eastern,KCMH,37.9,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
4,Source2,2,39.758274,-84.230507,0.00,Montgomery,OH,US/Eastern,KDAY,34.0,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,Source3,2,37.066490,-121.219147,0.01,Santa Clara,CA,US/Pacific,KCVH,48.2,...,Day,Day,Day,Day,01,2017,01,2017,01,2017
8126,Source2,2,36.981407,-122.011192,0.01,Santa Cruz,CA,US/Pacific,KWVI,52.0,...,Day,Day,Day,Day,01,2017,01,2017,01,2017
8127,Source2,3,37.326691,-121.940720,0.01,Santa Clara,CA,US/Pacific,KSJC,51.1,...,Day,Day,Day,Day,01,2017,01,2017,01,2017
8128,Source2,3,37.930088,-122.324036,0.01,Contra Costa,CA,US/Pacific,KCCR,44.1,...,Day,Day,Day,Day,01,2017,01,2017,01,2017


In [17]:
def onehot_encode(df,columns):
    df=df.copy()
    for column in columns:
        dummies=pd.get_dummies(df[column],prefix=column)
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [18]:
df=onehot_encode(df,columns=['County','State','Timezone','Airport_Code','Wind_Direction','Weather_Condition'])

In [19]:
df

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),...,Weather_Condition_Mist,Weather_Condition_Mostly Cloudy,Weather_Condition_Overcast,Weather_Condition_Partly Cloudy,Weather_Condition_Patches of Fog,Weather_Condition_Rain,Weather_Condition_Scattered Clouds,Weather_Condition_Shallow Fog,Weather_Condition_Smoke,Weather_Condition_Snow
0,Source2,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,...,0,0,1,0,0,0,0,0,0,0
1,Source2,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,...,0,1,0,0,0,0,0,0,0,0
2,Source2,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,...,0,1,0,0,0,0,0,0,0,0
3,Source2,3,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,3.5,...,0,0,0,0,0,0,0,0,0,0
4,Source2,2,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,3.5,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,Source3,2,37.066490,-121.219147,0.01,48.2,62.0,30.05,10.0,11.5,...,0,0,0,0,0,0,0,0,0,0
8126,Source2,2,36.981407,-122.011192,0.01,52.0,59.0,30.05,10.0,6.9,...,0,0,0,0,0,0,0,0,0,0
8127,Source2,3,37.326691,-121.940720,0.01,51.1,50.0,30.04,10.0,5.8,...,0,0,1,0,0,0,0,0,0,0
8128,Source2,3,37.930088,-122.324036,0.01,44.1,63.0,30.04,10.0,5.8,...,0,0,0,1,0,0,0,0,0,0


In [20]:
def get_binary_column(df,column):
    if column=='Source':
        return df[column].apply(lambda x:1 if x=='MapQuest' else 0)
    else:
        return df[column].apply(lambda x:1 if x=='Day' else 0)

In [21]:
df['Source']=get_binary_column(df,'Source')
df['Sunrise_Sunset']=get_binary_column(df,'Sunrise_Sunset')
df['Civil_Twilight']=get_binary_column(df,'Civil_Twilight')
df['Nautical_Twilight']=get_binary_column(df,'Nautical_Twilight')
df['Astronomical_Twilight']=get_binary_column(df,'Astronomical_Twilight')

In [22]:
df

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),...,Weather_Condition_Mist,Weather_Condition_Mostly Cloudy,Weather_Condition_Overcast,Weather_Condition_Partly Cloudy,Weather_Condition_Patches of Fog,Weather_Condition_Rain,Weather_Condition_Scattered Clouds,Weather_Condition_Shallow Fog,Weather_Condition_Smoke,Weather_Condition_Snow
0,0,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,...,0,0,1,0,0,0,0,0,0,0
1,0,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,...,0,1,0,0,0,0,0,0,0,0
2,0,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,...,0,1,0,0,0,0,0,0,0,0
3,0,3,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,3.5,...,0,0,0,0,0,0,0,0,0,0
4,0,2,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,3.5,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,0,2,37.066490,-121.219147,0.01,48.2,62.0,30.05,10.0,11.5,...,0,0,0,0,0,0,0,0,0,0
8126,0,2,36.981407,-122.011192,0.01,52.0,59.0,30.05,10.0,6.9,...,0,0,0,0,0,0,0,0,0,0
8127,0,3,37.326691,-121.940720,0.01,51.1,50.0,30.04,10.0,5.8,...,0,0,1,0,0,0,0,0,0,0
8128,0,3,37.930088,-122.324036,0.01,44.1,63.0,30.04,10.0,5.8,...,0,0,0,1,0,0,0,0,0,0


In [23]:
#Spliting and Scaling the dataset
y=df['Severity'].copy()
x=df.drop('Severity',axis=1).copy()


In [24]:
y.unique()

array([2, 3, 1, 4])

In [25]:
x

Unnamed: 0,Source,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Amenity,...,Weather_Condition_Mist,Weather_Condition_Mostly Cloudy,Weather_Condition_Overcast,Weather_Condition_Partly Cloudy,Weather_Condition_Patches of Fog,Weather_Condition_Rain,Weather_Condition_Scattered Clouds,Weather_Condition_Shallow Fog,Weather_Condition_Smoke,Weather_Condition_Snow
0,0,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,False,...,0,0,1,0,0,0,0,0,0,0
1,0,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,False,...,0,1,0,0,0,0,0,0,0,0
2,0,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,False,...,0,1,0,0,0,0,0,0,0,0
3,0,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,3.5,False,...,0,0,0,0,0,0,0,0,0,0
4,0,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,3.5,False,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,0,37.066490,-121.219147,0.01,48.2,62.0,30.05,10.0,11.5,False,...,0,0,0,0,0,0,0,0,0,0
8126,0,36.981407,-122.011192,0.01,52.0,59.0,30.05,10.0,6.9,False,...,0,0,0,0,0,0,0,0,0,0
8127,0,37.326691,-121.940720,0.01,51.1,50.0,30.04,10.0,5.8,False,...,0,0,1,0,0,0,0,0,0,0
8128,0,37.930088,-122.324036,0.01,44.1,63.0,30.04,10.0,5.8,False,...,0,0,0,1,0,0,0,0,0,0


In [26]:
y=y-1

In [27]:
y.unique()

array([1, 2, 0, 3])

In [28]:
x=x.astype(float)

In [29]:
x

Unnamed: 0,Source,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Amenity,...,Weather_Condition_Mist,Weather_Condition_Mostly Cloudy,Weather_Condition_Overcast,Weather_Condition_Partly Cloudy,Weather_Condition_Patches of Fog,Weather_Condition_Rain,Weather_Condition_Scattered Clouds,Weather_Condition_Shallow Fog,Weather_Condition_Smoke,Weather_Condition_Snow
0,0.0,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,3.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,3.5,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,0.0,37.066490,-121.219147,0.01,48.2,62.0,30.05,10.0,11.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8126,0.0,36.981407,-122.011192,0.01,52.0,59.0,30.05,10.0,6.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8127,0.0,37.326691,-121.940720,0.01,51.1,50.0,30.04,10.0,5.8,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8128,0.0,37.930088,-122.324036,0.01,44.1,63.0,30.04,10.0,5.8,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

In [31]:
x

Unnamed: 0,Source,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Amenity,...,Weather_Condition_Mist,Weather_Condition_Mostly Cloudy,Weather_Condition_Overcast,Weather_Condition_Partly Cloudy,Weather_Condition_Patches of Fog,Weather_Condition_Rain,Weather_Condition_Scattered Clouds,Weather_Condition_Shallow Fog,Weather_Condition_Smoke,Weather_Condition_Snow
0,0.0,1.190386,3.218087,-0.027001,-1.440270,1.534788,-0.948900,0.409723,-1.169373,-0.103398,...,-0.027176,-0.270941,2.497907,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
1,0.0,2.079861,3.201961,-0.027001,-1.491268,1.373320,-1.037077,0.017863,-0.930098,-0.103398,...,-0.027176,3.690840,-0.400335,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
2,0.0,1.923987,3.203567,-0.027001,-1.440270,1.090752,-1.007685,-1.157717,-1.169373,-0.103398,...,-0.027176,3.690840,-0.400335,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
3,0.0,2.538285,3.321330,-0.027001,-1.332607,1.413687,-1.066470,-0.765857,-1.169373,-0.103398,...,-0.027176,-0.270941,-0.400335,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
4,0.0,2.093530,3.199637,-0.051822,-1.553598,1.534788,-0.978293,-0.765857,-1.169373,-0.103398,...,-0.027176,-0.270941,2.497907,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,0.0,-1.403775,-0.248776,-0.027001,-0.748964,0.000844,0.168010,0.409723,0.570808,-0.103398,...,-0.027176,-0.270941,-0.400335,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
8126,0.0,-1.514320,-0.322618,-0.027001,-0.533640,-0.120257,0.168010,0.409723,-0.429796,-0.103398,...,-0.027176,-0.270941,-0.400335,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
8127,0.0,-1.065709,-0.316048,-0.027001,-0.584638,-0.483559,0.138618,0.409723,-0.669071,-0.103398,...,-0.027176,-0.270941,2.497907,-0.307660,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356
8128,0.0,-0.281744,-0.351784,-0.027001,-0.981288,0.041211,0.138618,0.409723,-0.669071,-0.103398,...,-0.027176,-0.270941,-0.400335,3.250342,-0.015686,-0.11603,-0.218049,-0.015686,-0.015686,-0.029356


In [32]:
#training the model
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)

In [33]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5691, 214)
(2439, 214)
(5691,)
(2439,)


In [34]:
x.shape

(8130, 214)

In [35]:
#training the model

inputs=tf.keras.Input(shape=(x.shape[1],))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(4,activation='softmax')(x)
model=tf.keras.Model(inputs,outputs)
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [36]:
batch_size=32
epochs=20
history=model.fit(x_train,y_train,validation_split=0.2,batch_size=batch_size,
                 epochs=epochs,callbacks=[tf.keras.callbacks.ReduceLROnPlateau(),
                 tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=3,restore_best_weights=True)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [37]:
print('Test Accuracy',model.evaluate(x_test,y_test,verbose=0))

Test Accuracy [0.4970923960208893, 0.7642476558685303]
