# RoadSafe Analytics: US Accidents EDA - Initial Exploration

In [4]:
# Attach Google Drive to the Colab runtime; files then appear under the
# mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Load the raw US Accidents CSV from the mounted Drive folder into a DataFrame.
accidents_csv_path = '/content/drive/MyDrive/Colab Notebooks/US_Accidents_March23.csv'
accidents_df = pd.read_csv(accidents_csv_path)

In [10]:
# Preview the first five rows as a rich (HTML) table.
first_rows = accidents_df.head(n=5)
display(first_rows)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


## Explore Dataset Structure

In [11]:
# Print dataset shape
print(f"\nDataset has {accidents_df.shape[0]} rows and {accidents_df.shape[1]} columns")

# List all columns with their data types
print("\nColumn Data Types:")
print(accidents_df.dtypes)

# More detailed info including non-null counts.
# DataFrame.info() drops the non-null counts by default once the frame has
# more rows than pandas' `display.max_info_rows` option, so they are requested
# explicitly; counting scans every column.
print("\nDataset Info:")
accidents_df.info(show_counts=True)


Dataset has 7728394 rows and 46 columns

Column Data Types:
ID                        object
Source                    object
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Conditi

## Check and Visualize Missing Data

In [12]:
# Per-column count of null cells, and the same count as a share of all rows.
missing_values = accidents_df.isna().sum()
missing_percent = missing_values / len(accidents_df) * 100

# Assemble both measures into one table, drop columns that are fully
# populated, and list the most incomplete columns first.
missing_summary = (
    pd.DataFrame({
        'Missing Values': missing_values,
        'Percent Missing': missing_percent,
    })
    .loc[lambda summary: summary['Missing Values'] > 0]
    .sort_values(by='Missing Values', ascending=False)
)

print("\nColumns with Missing Data:")
display(missing_summary)


Columns with Missing Data:


Unnamed: 0,Missing Values,Percent Missing
End_Lat,3402762,44.029355
End_Lng,3402762,44.029355
Precipitation(in),2203586,28.512858
Wind_Chill(F),1999019,25.865904
Wind_Speed(mph),571233,7.391355
Visibility(mi),177098,2.291524
Wind_Direction,175206,2.267043
Humidity(%),174144,2.253301
Weather_Condition,173459,2.244438
Temperature(F),163853,2.120143


In [13]:
# Explore severity distribution (sort_index orders the counts by severity level)
print("\nAccident Severity Distribution:")
print(accidents_df['Severity'].value_counts().sort_index())

# Check unique weather conditions
print("\nUnique Weather Conditions:")
print(accidents_df['Weather_Condition'].unique())

# Convert Start_Time to datetime and extract hour for future analysis.
# Without an explicit format, pandas 2.x infers a single format from the first
# value and, combined with errors='coerce', silently turns every timestamp
# written differently (e.g. one carrying fractional seconds) into NaT.
# format='ISO8601' parses each value as ISO 8601 independently, so only
# genuinely malformed entries are coerced.
accidents_df['Start_Time'] = pd.to_datetime(
    accidents_df['Start_Time'],
    format='ISO8601',
    errors='coerce',
)

# Surface how many timestamps are still NaT instead of hiding the loss.
unparsed_start_times = accidents_df['Start_Time'].isna().sum()
print(f"\nStart_Time values that could not be parsed: {unparsed_start_times}")

# Hour of day (0-23); NaN wherever Start_Time is NaT.
accidents_df['Hour'] = accidents_df['Start_Time'].dt.hour

print("\nSample data with Hour extracted from Start_Time:")
display(accidents_df[['Start_Time', 'Hour']].head())


Accident Severity Distribution:
Severity
1      67366
2    6156981
3    1299337
4     204710
Name: count, dtype: int64

Unique Weather Conditions:
['Light Rain' 'Overcast' 'Mostly Cloudy' 'Rain' 'Light Snow' 'Haze'
 'Scattered Clouds' 'Partly Cloudy' 'Clear' 'Snow'
 'Light Freezing Drizzle' 'Light Drizzle' 'Fog' 'Shallow Fog' 'Heavy Rain'
 'Light Freezing Rain' 'Cloudy' 'Drizzle' nan 'Light Rain Showers' 'Mist'
 'Smoke' 'Patches of Fog' 'Light Freezing Fog' 'Light Haze'
 'Light Thunderstorms and Rain' 'Thunderstorms and Rain' 'Fair'
 'Volcanic Ash' 'Blowing Sand' 'Blowing Dust / Windy' 'Widespread Dust'
 'Fair / Windy' 'Rain Showers' 'Mostly Cloudy / Windy'
 'Light Rain / Windy' 'Hail' 'Heavy Drizzle' 'Showers in the Vicinity'
 'Thunderstorm' 'Light Rain Shower' 'Light Rain with Thunder'
 'Partly Cloudy / Windy' 'Thunder in the Vicinity' 'T-Storm'
 'Heavy Thunderstorms and Rain' 'Thunder' 'Heavy T-Storm' 'Funnel Cloud'
 'Heavy T-Storm / Windy' 'Blowing Snow' 'Light Thunderstorms and S

Unnamed: 0,Start_Time,Hour
0,2016-02-08 05:46:00,5.0
1,2016-02-08 06:07:59,6.0
2,2016-02-08 06:49:27,6.0
3,2016-02-08 07:23:34,7.0
4,2016-02-08 07:39:07,7.0
