In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# US Accidents Exploratory Data Analysis

TODO - What is Exploratory Data Analysis

TODO - Talk about the dataset

  - Kaggle
  - info about accidents
  - can use inferences to prevent accidents

## Download the data

We can use the dataset directly once it has been added to our Kaggle notebook.

In [None]:
# Read the accidents CSV from the attached Kaggle dataset into a DataFrame
accidents_csv = '../input/us-accidents/US_Accidents_Dec20_Updated.csv'
df = pd.read_csv(accidents_csv)

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

## Data Preparation and Cleaning

In [None]:
# Show the column labels of the accidents DataFrame
df.columns

In [None]:
# Number of columns in the dataset (second entry of the frame's shape)
df.shape[1]

In [None]:
# Print the DataFrame.info() summary to get an overall idea of the dataset

df.info()

In [None]:
# Show the summary statistics table produced by DataFrame.describe()
df.describe()

In [None]:
# Keep only the columns whose dtype is one of the listed integer/float types
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
numeric_df = df.select_dtypes(include=numerics)

In [None]:
# Print the DataFrame.info() summary for the numeric-only subset
numeric_df.info()

Find the missing values and decide how to handle them.

### Percentage of missing values per column

In [None]:
# Fraction of rows that are null in each column, largest first
null_counts = df.isna().sum()
missing_percentages = null_counts.sort_values(ascending=False) / len(df)

# Plot only the columns that actually have missing values
has_missing = missing_percentages != 0
missing_percentages[has_missing].plot(kind='barh')

We can drop the columns that are null most of the time (50% or more), since they can't be analysed properly anyway.

Alternatively, we can restrict the analysis to columns that have few null values.

## Perform exploratory analysis

Columns we'll analyze
1. City
2. Start_Time
3. Start_Lat, Start_Lng
4. Temperature(F)
5. Weather_Condition

### City analysis

In [None]:
# Distinct values found in the City column
cities = df['City'].unique()

# How many distinct values there are
len(cities)

In [None]:
# Number of accident records per city (value_counts orders them from most to fewest)
cities_by_accident = df['City'].value_counts()
# Positions 40-59 of the ranking; not displayed, because only the last expression of a cell is shown
cities_by_accident.iloc[40:60]
# Cities whose accident count is exactly 7328
cities_by_accident.loc[cities_by_accident.eq(7328)]

In [None]:
# Accident count recorded for the city labelled 'New York'
cities_by_accident.loc['New York']

In [None]:
# `in` applied to a Series tests its index labels, not its values,
# so the membership check has to run against the underlying values.
'New York' in df.City.values

In [None]:
# Horizontal bar chart of the 20 cities with the most accidents
cities_by_accident.head(20).plot.barh()

In [None]:
# Count accident records (rows) per state and show the five largest.
# groupby(...).count() on the `Number` column only tallies rows where `Number`
# is non-null, which undercounts accidents whenever that column is missing.
df.State.value_counts().head()

In [None]:
import seaborn as sns
# Use seaborn's 'darkgrid' style for the plots that follow
sns.set_style('darkgrid')

In [None]:
# Distribution of accident counts per city (density histogram plus KDE curve).
# sns.distplot is deprecated; histplot is the replacement already used elsewhere in this notebook.
# A fixed bin count keeps the automatic binning from creating an excessive number
# of bins on this heavily skewed data.
sns.histplot(cities_by_accident, bins=50, stat='density', kde=True)

In [None]:
# Split the cities into those above the 10000-accident mark and the rest
above_threshold = cities_by_accident > 10000
high_accident_cities = cities_by_accident[above_threshold]
low_accident_cities = cities_by_accident[~above_threshold]

In [None]:
# Number of cities in the high-accident group
high_accident_cities.shape[0]

In [None]:
# Histogram of accident counts for the high-accident cities, on a logarithmic axis
sns.histplot(data=high_accident_cities, log_scale=True)

In [None]:
# Histogram of accident counts for the remaining cities, on a logarithmic axis
sns.histplot(data=low_accident_cities, log_scale=True)

### Start Time analysis

In [None]:
# Start_Time value of the row labelled 0, before any conversion
df.loc[0, 'Start_Time']

In [None]:
# Replace the Start_Time column with its parsed datetime equivalent
df['Start_Time'] = pd.to_datetime(df['Start_Time'])

# Same row as before, now shown as a timestamp
df.loc[0, 'Start_Time']

In [None]:
# Normalised histogram of accidents by hour of day (one bin per hour).
# sns.distplot is deprecated; histplot with stat='density' replaces
# distplot(kde=False, norm_hist=True).
sns.histplot(df.Start_Time.dt.hour, bins=24, stat='density')

A high percentage of accidents occur between 6 AM and 10 AM.

This is probably because people are in a hurry to get to work, and possibly also because traffic is heavier around this time.

In [None]:
# Normalised histogram of accidents by day of week (0 = Monday ... 6 = Sunday).
# sns.distplot is deprecated; histplot with stat='density' replaces
# distplot(kde=False, norm_hist=True).
sns.histplot(df.Start_Time.dt.dayofweek, bins=7, stat='density')

On weekends there are fewer accidents than on weekdays, probably because fewer people are travelling.

#### Does the distribution of accidents by hour differ on weekends compared to weekdays?

In [None]:
# Hourly distribution of accidents that started on a Sunday (dayofweek == 6).
sundays_start_time = df.Start_Time[df.Start_Time.dt.dayofweek == 6]
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True)
sns.histplot(sundays_start_time.dt.hour, bins=24, stat='density')

In [None]:
# Hourly distribution of accidents that started on a Monday (dayofweek == 0).
monday_start_time = df.Start_Time[df.Start_Time.dt.dayofweek == 0]
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True)
sns.histplot(monday_start_time.dt.hour, bins=24, stat='density')

On Sundays the peak times are around 10 AM and 3 PM, unlike on weekdays.

In [None]:
# Monthly distribution of accidents, restricted to records that started in 2019.
df_2019 = df[df.Start_Time.dt.year == 2019]
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True)
sns.histplot(df_2019.Start_Time.dt.month, bins=12, stat='density')

In [None]:
# Pie chart of how many records fall into each Severity value
df['Severity'].value_counts().plot.pie()

Most of the recorded accidents have a severity level of 2.

### Start Latitude and Longitude

In [None]:
# Display the Start_Lat column
df['Start_Lat']

In [None]:
# Display the Start_Lng column
df['Start_Lng']

In [None]:
# Random 10% sample of the rows, used to keep the scatter plot light.
# A fixed random_state makes the sample (and every plot built from it)
# identical on each Restart & Run All.
sample_df = df.sample(int(0.1 * len(df)), random_state=42)

In [None]:
# Scatter of accident start locations (longitude on x, latitude on y).
# In seaborn, `size` is a semantic grouping variable rather than the marker size;
# the marker area is passed through to matplotlib as `s`, so a small `s` gives tiny dots.
sns.scatterplot(x=sample_df.Start_Lng, y=sample_df.Start_Lat, s=1)

In [None]:
import folium

In [None]:
# Coordinates of the row labelled 0
lat = df.loc[0, 'Start_Lat']
lon = df.loc[0, 'Start_Lng']
(lat, lon)

In [None]:
# DataFrame.iteritems() was removed in pandas 2.0; items() is the drop-in replacement.
# Iteration is column-wise: each step yields (column name, Series holding the
# 100 sampled values of that column), and the Series is printed.
for _, column_values in df[['Start_Lat', 'Start_Lng']].sample(100).items():
    print(column_values)

In [None]:
from folium.plugins import HeatMap

In [None]:
# Random 0.1% sample of the rows for the heat map; a fixed random_state keeps
# the sample reproducible across runs.
sample_df = df.sample(int(0.001 * len(df)), random_state=42)
# (latitude, longitude) tuples in the order the heat map layer expects;
# zip consumes the Series directly, so no intermediate lists are needed.
lat_lon_pairs = list(zip(sample_df.Start_Lat, sample_df.Start_Lng))

In [None]:
# Build a folium map with a heat-map layer of the sampled accident locations.
# The map is kept in `accident_map` rather than `map`, so the Python builtin
# `map` is not shadowed for the rest of the notebook session.
accident_map = folium.Map()
HeatMap(lat_lon_pairs).add_to(accident_map)
accident_map

## Ask and answer questions about the data

1. Are there more accidents in warmer or colder regions?
2. Which states have the highest number of accidents? How about per capita?
3. Is New York in the data? If so, why is its accident count so low when it is the most populous city?
4. Among the top 100 cities by number of accidents, which states do they mostly belong to?
5. At what time of day do most accidents occur?
6. Which days of the week have the most accidents?
7. Which months of the year have the most accidents?
8. What is the trend of accidents year over year? Decreasing or increasing?
9. When is the number of accidents per unit of traffic the highest?

## Summarize your inferences and write a conclusion

## Insights:
+ Fewer than 5% of the cities in this US dataset have more than 10,000 accidents
+ Over 1,200 cities have reported just 1 accident
+ The number of accidents per city decreases exponentially