In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# Path to dataset CSV (update as needed)
dataset_path = "/content/drive/MyDrive/data/US_Accidents_March23.csv"

In [9]:
try:
    accidents_df = pd.read_csv(dataset_path)
    print(f'Dataset loaded successfully with shape: {accidents_df.shape}')
except FileNotFoundError:
    print(f'Dataset file not found at the path: {dataset_path}')

Dataset loaded successfully with shape: (7728394, 46)


In [10]:
accidents_df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


In [11]:
accidents_df.shape

(7728394, 46)

In [12]:
accidents_df.dtypes

Unnamed: 0,0
ID,object
Source,object
Severity,int64
Start_Time,object
End_Time,object
Start_Lat,float64
Start_Lng,float64
End_Lat,float64
End_Lng,float64
Distance(mi),float64


In [13]:
accidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)       

In [14]:
missing_values = accidents_df.isnull().sum()
missing_percent = (missing_values/len(accidents_df)*100)

missing_summary = pd.DataFrame({
    'Missing Values' : missing_values,
    'Missing Percent' : missing_percent
})

In [15]:
missing_summary

Unnamed: 0,Missing Values,Missing Percent
ID,0,0.0
Source,0,0.0
Severity,0,0.0
Start_Time,0,0.0
End_Time,0,0.0
Start_Lat,0,0.0
Start_Lng,0,0.0
End_Lat,3402762,44.029355
End_Lng,3402762,44.029355
Distance(mi),0,0.0


In [16]:
missing_summary = missing_summary[missing_summary['Missing Values']>0].sort_values(by='Missing Values', ascending=False)
missing_summary

Unnamed: 0,Missing Values,Missing Percent
End_Lat,3402762,44.029355
End_Lng,3402762,44.029355
Precipitation(in),2203586,28.512858
Wind_Chill(F),1999019,25.865904
Wind_Speed(mph),571233,7.391355
Visibility(mi),177098,2.291524
Wind_Direction,175206,2.267043
Humidity(%),174144,2.253301
Weather_Condition,173459,2.244438
Temperature(F),163853,2.120143


In [17]:
print("\nAccident Severity Distribution:")
print(accidents_df['Severity'].value_counts().sort_index())


Accident Severity Distribution:
Severity
1      67366
2    6156981
3    1299337
4     204710
Name: count, dtype: int64


In [18]:
print("\nUnique Weater Conditions:")
print(accidents_df['Weather_Condition'].unique())


Unique Weater Conditions:
['Light Rain' 'Overcast' 'Mostly Cloudy' 'Rain' 'Light Snow' 'Haze'
 'Scattered Clouds' 'Partly Cloudy' 'Clear' 'Snow'
 'Light Freezing Drizzle' 'Light Drizzle' 'Fog' 'Shallow Fog' 'Heavy Rain'
 'Light Freezing Rain' 'Cloudy' 'Drizzle' nan 'Light Rain Showers' 'Mist'
 'Smoke' 'Patches of Fog' 'Light Freezing Fog' 'Light Haze'
 'Light Thunderstorms and Rain' 'Thunderstorms and Rain' 'Fair'
 'Volcanic Ash' 'Blowing Sand' 'Blowing Dust / Windy' 'Widespread Dust'
 'Fair / Windy' 'Rain Showers' 'Mostly Cloudy / Windy'
 'Light Rain / Windy' 'Hail' 'Heavy Drizzle' 'Showers in the Vicinity'
 'Thunderstorm' 'Light Rain Shower' 'Light Rain with Thunder'
 'Partly Cloudy / Windy' 'Thunder in the Vicinity' 'T-Storm'
 'Heavy Thunderstorms and Rain' 'Thunder' 'Heavy T-Storm' 'Funnel Cloud'
 'Heavy T-Storm / Windy' 'Blowing Snow' 'Light Thunderstorms and Snow'
 'Heavy Snow' 'Low Drifting Snow' 'Light Ice Pellets' 'Ice Pellets'
 'Squalls' 'N/A Precipitation' 'Cloudy / Windy' 

In [19]:
accidents_df['Start_Time'] = pd.to_datetime(accidents_df['Start_Time'], errors='coerce')
accidents_df['Hour'] = accidents_df['Start_Time'].dt.hour

print("\nSample data with Hour Extracted from Start_Time:")
display(accidents_df[['Start_Time', 'Hour']].head())


Sample data with Hour Extracted from Start_Time:


Unnamed: 0,Start_Time,Hour
0,2016-02-08 05:46:00,5.0
1,2016-02-08 06:07:59,6.0
2,2016-02-08 06:49:27,6.0
3,2016-02-08 07:23:34,7.0
4,2016-02-08 07:39:07,7.0


In [20]:
# Convert End_Time to datetime
accidents_df['End_Time'] = pd.to_datetime(accidents_df['End_Time'], errors='coerce')

# Optionally extract hour from End_Time
accidents_df['Hour'] = accidents_df['End_Time'].dt.hour

print("\nSample data with End_Time converted and End_Hour extracted:")
display(accidents_df[['End_Time', 'Hour']].head())



Sample data with End_Time converted and End_Hour extracted:


Unnamed: 0,End_Time,Hour
0,2016-02-08 11:00:00,11.0
1,2016-02-08 06:37:59,6.0
2,2016-02-08 07:19:27,7.0
3,2016-02-08 07:53:34,7.0
4,2016-02-08 08:09:07,8.0
