In [1]:

from datasets import load_dataset

# Open the dataset in streaming mode so records are read on demand
# rather than being saved to disk first.
dataset = load_dataset(
    "yuvidhepe/us-accidents-updated",
    split="train",
    streaming=True,
)

# Print the first five records as a quick sanity check.
for record in dataset.take(5):
    print(record)


{'ID': 'A-1', 'Source': 'Source2', 'Severity': 3, 'Start_Time': '2016-02-08 05:46:00', 'End_Time': '2016-02-08 11:00:00', 'Start_Lat': 39.865147, 'Start_Lng': -84.058723, 'End_Lat': None, 'End_Lng': None, 'Distance(mi)': 0.01, 'Description': 'Right lane blocked due to accident on I-70 Eastbound at Exit 41 OH-235 State Route 4.', 'Street': 'I-70 E', 'City': 'Dayton', 'County': 'Montgomery', 'State': 'OH', 'Zipcode': '45424', 'Country': 'US', 'Timezone': 'US/Eastern', 'Airport_Code': 'KFFO', 'Weather_Timestamp': '2016-02-08 05:58:00', 'Temperature(F)': 36.9, 'Wind_Chill(F)': None, 'Humidity(%)': 91.0, 'Pressure(in)': 29.68, 'Visibility(mi)': 10.0, 'Wind_Direction': 'Calm', 'Wind_Speed(mph)': None, 'Precipitation(in)': 0.02, 'Weather_Condition': 'Light Rain', 'Amenity': False, 'Bump': False, 'Crossing': False, 'Give_Way': False, 'Junction': False, 'No_Exit': False, 'Railway': False, 'Roundabout': False, 'Station': False, 'Stop': False, 'Traffic_Calming': False, 'Traffic_Signal': False, 'T

In [2]:
import pandas as pd

# Pull the first 1000 streamed records and materialize them as a DataFrame.
streamed_data = dataset.take(1000)
df = pd.DataFrame([record for record in streamed_data])

# Show the resulting frame as the cell output.
df


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A-996,Source2,2,2016-06-23 10:31:07,2016-06-23 11:01:07,38.022778,-121.965698,,,0.00,...,False,False,False,False,False,False,Day,Day,Day,Day
996,A-997,Source2,3,2016-06-23 10:22:09,2016-06-23 10:52:09,37.656654,-121.901588,,,0.00,...,False,False,False,False,False,False,Day,Day,Day,Day
997,A-998,Source2,2,2016-06-23 10:31:51,2016-06-23 11:46:51,38.690273,-121.392136,,,0.00,...,False,False,False,False,True,False,Day,Day,Day,Day
998,A-999,Source2,2,2016-06-23 10:32:51,2016-06-23 11:47:51,38.681110,-121.333244,,,0.00,...,False,False,False,False,True,False,Day,Day,Day,Day


# ✅ Task 3: Check for Missing or Dirty Data
Find which columns have missing values.

Decide which ones to remove (if mostly empty) or fix (by filling or dropping rows).

Focus on cleaning important columns like:

- Time
- Location
- Severity
- Weather


In [3]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
df.isna().sum()

ID                          0
Source                      0
Severity                    0
Start_Time                  0
End_Time                    0
Start_Lat                   0
Start_Lng                   0
End_Lat                  1000
End_Lng                  1000
Distance(mi)                0
Description                 0
Street                      0
City                        0
County                      0
State                       0
Zipcode                     0
Country                     0
Timezone                    0
Airport_Code                0
Weather_Timestamp           1
Temperature(F)              1
Wind_Chill(F)             540
Humidity(%)                 1
Pressure(in)                1
Visibility(mi)              3
Wind_Direction              1
Wind_Speed(mph)            47
Precipitation(in)         813
Weather_Condition           2
Amenity                     0
Bump                        0
Crossing                    0
Give_Way                    0
Junction  

In [4]:
# End_Lat / End_Lng are missing in every one of the 1000 sampled rows, so they
# carry no information here and are removed rather than imputed.
# errors="ignore" keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
df.drop(columns=["End_Lat", "End_Lng"], errors="ignore", inplace=True)


In [5]:
# Impute missing numeric values with the mean of their own column.
column_means = df.mean(numeric_only=True)
df.fillna(value=column_means, inplace=True)

In [6]:
# Replace every remaining missing value with a sentinel label.
# NOTE(review): this also writes 'Unknown' into Weather_Timestamp — confirm
# that is acceptable before parsing that column as a datetime.
df.fillna(value='Unknown', inplace=True)

In [7]:
# Re-count missing values per column to verify the cleaning steps.
df.isna().sum()

ID                       0
Source                   0
Severity                 0
Start_Time               0
End_Time                 0
Start_Lat                0
Start_Lng                0
Distance(mi)             0
Description              0
Street                   0
City                     0
County                   0
State                    0
Zipcode                  0
Country                  0
Timezone                 0
Airport_Code             0
Weather_Timestamp        0
Temperature(F)           0
Wind_Chill(F)            0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Wind_Direction           0
Wind_Speed(mph)          0
Precipitation(in)        0
Weather_Condition        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
T

# ✅ Task 4: Extract Useful Time Information
From the accident start time (`Start_Time`), extract:

- Hour of the day
- Day of the week
- Month
- Year

This will help in time-based analysis (e.g., finding when accidents happen most often).


In [8]:

# Parse the accident start timestamps once and store them back on the frame.
start_times = pd.to_datetime(df['Start_Time'])
df['Start_Time'] = start_times

# Hour of day taken from the parsed timestamps.
df['Hour'] = start_times.dt.hour

# Tally accidents by hour, then pick out the busiest hour and its tally.
accidents_per_hour = df['Hour'].value_counts()
peak_hour = accidents_per_hour.idxmax()   # index label = the hour
peak_count = accidents_per_hour.max()     # value = number of accidents

# Report the result.
print(f"The hour with the most accidents is: {peak_hour}:00")
print(f"Number of accidents during this hour: {peak_count}")


The hour with the most accidents is: 7:00
Number of accidents during this hour: 145


In [9]:

# Extract the ISO day of the week from the parsed start time
# (isocalendar().day: 1 = Monday ... 7 = Sunday).
# This value was previously stored under the name 'Week_Number', which
# mislabeled weekday values as week numbers.
df['Day_Of_Week'] = df['Start_Time'].dt.isocalendar().day

# Count accidents per weekday, ordered Monday through Sunday
accidents_per_day_of_week = df['Day_Of_Week'].value_counts().sort_index()

# Display the counts
print(accidents_per_day_of_week)


Week_Number
1    153
2    250
3    259
4    204
5    116
6     11
7      7
Name: count, dtype: Int64


In [10]:
# Month number (1 to 12) of each accident; .dt.month_name() would give the
# month names (January, February, ...) instead.
df['Month'] = df['Start_Time'].dt.month

# Tally accidents per month, then order the tally by month number.
month_counts = df['Month'].value_counts()
accidents_per_month = month_counts.sort_index()

# Show the counts.
print(accidents_per_month)


Month
2    425
3    303
6    272
Name: count, dtype: int64


In [11]:

# Extract the calendar year of each accident.
# .dt.year is used rather than isocalendar().year because the ISO year can
# differ from the calendar year for dates around New Year. The result goes in
# its own 'Year' column instead of overwriting 'Week_Number' with year values.
df['Year'] = df['Start_Time'].dt.year

# Count accidents per year, sorted chronologically
accidents_per_year = df['Year'].value_counts().sort_index()

# Display the counts
print(accidents_per_year)

Week_Number
2016    1000
Name: count, dtype: Int64
