1) DATA DESCRIPTION

-> lOADING & CLEANING

In [18]:
import requests
import zipfile
import os

# URL for the FARS 2022 Accident-Level CSV zip file
url = "https://static.nhtsa.gov/nhtsa/downloads/FARS/2022/National/FARS2022NationalCSV.zip"

# File paths
zip_filename = "FARS2022.zip"
extract_path = "FARS2022_data"

# Step 1: Download the ZIP file
print("Downloading FARS 2022 dataset...")
response = requests.get(url)
with open(zip_filename, "wb") as f:
    f.write(response.content)
print("Download complete.")

# Step 2: Extract ZIP contents
print("Extracting files...")
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Extraction complete.")

# Step 3: List extracted files
print("Extracted files:")
print(os.listdir(extract_path))


Downloading FARS 2022 dataset...
Download complete.
Extracting files...
Extraction complete.
Extracted files:
['FARS2022NationalCSV']


In [19]:
import pandas as pd

# Load the main accident file
df = pd.read_csv("accident.csv")

# Preview the data
df.head()

Unnamed: 0,STATE,STATENAME,ST_CASE,PEDS,PERNOTMVIT,VE_TOTAL,VE_FORMS,PVH_INVL,PERSONS,PERMVIT,...,NOT_MINNAME,ARR_HOUR,ARR_HOURNAME,ARR_MIN,ARR_MINNAME,HOSP_HR,HOSP_HRNAME,HOSP_MN,HOSP_MNNAME,FATALS
0,1,Alabama,10001,0,0,2,2,0,3,3,...,47,13,1:00pm-1:59pm,4,4,13,1:00pm-1:59pm,47,47,1
1,1,Alabama,10002,0,0,2,2,0,5,5,...,Unknown,99,Unknown EMS Scene Arrival Hour,99,Unknown EMS Scene Arrival Minutes,99,Unknown,99,Unknown EMS Hospital Arrival Time,2
2,1,Alabama,10003,0,0,1,1,0,2,2,...,33,1,1:00am-1:59am,50,50,99,Unknown,99,Unknown EMS Hospital Arrival Time,1
3,1,Alabama,10004,0,0,1,1,0,1,1,...,48,15,3:00pm-3:59pm,9,9,15,3:00pm-3:59pm,44,44,1
4,1,Alabama,10005,1,1,1,1,0,1,1,...,48,18,6:00pm-6:59pm,54,54,88,Not Applicable (Not Transported),88,Not Applicable (Not Transported),1


In [20]:
# Get dimensions
print("Dataset dimensions (rows, columns):", df.shape)

Dataset dimensions (rows, columns): (39422, 80)


CLEANING STEPS

In [21]:
#Cleaning
# Step 1: Remove columns with all missing values
df_cleaned = df.dropna(axis=1, how='all')

# Step 2: Drop rows with missing latitude or longitude (required for mapping)
df_cleaned = df_cleaned.dropna(subset=['LATITUDE', 'LONGITUD'])

# Step 3: Convert LATITUDE and LONGITUD to numeric, coerce any issues
df_cleaned['LATITUDE'] = pd.to_numeric(df_cleaned['LATITUDE'], errors='coerce')
df_cleaned['LONGITUD'] = pd.to_numeric(df_cleaned['LONGITUD'], errors='coerce')

# Step 4: Drop rows with invalid lat/lon after conversion
df_cleaned = df_cleaned.dropna(subset=['LATITUDE', 'LONGITUD'])

# Step 5: Filter out obviously incorrect coordinate values
df_cleaned = df_cleaned[
    (df_cleaned['LATITUDE'].between(-90, 90)) &
    (df_cleaned['LONGITUD'].between(-180, 180))
]

#cleaned dimensions
print("Original shape:", df.shape)
print("Cleaned shape:", df_cleaned.shape)

# Save cleaned dataset
df_cleaned.to_csv("accident_cleaned.csv", index=False)


Original shape: (39422, 80)
Cleaned shape: (39208, 80)


2) MAP VISUALIZATION

BUBBLE MAP

In [22]:
import plotly.express as px

# Load cleaned dataset
df = pd.read_csv("accident_cleaned.csv")

# Sample to avoid overloading the map
df_sample = df.sample(n=5000, random_state=42)

# Optional: decode day of week and weather codes
day_map = {
    1: "Sunday", 2: "Monday", 3: "Tuesday", 4: "Wednesday",
    5: "Thursday", 6: "Friday", 7: "Saturday"
}

weather_map = {
    1: "Clear", 2: "Rain", 3: "Sleet", 4: "Snow", 5: "Fog",
    6: "Severe Crosswinds", 7: "Blowing Sand/Snow", 8: "Other"
}

df_sample['DAY_NAME'] = df_sample['DAY_WEEK'].map(day_map)
df_sample['WEATHER_COND'] = df_sample['WEATHER'].map(weather_map)

# Create bubble map
fig = px.scatter_mapbox(
    df_sample,
    lat="LATITUDE",
    lon="LONGITUD",
    hover_name="STATE",
    hover_data={
        "FATALS": True,
        "HOUR": True,
        "DAY_NAME": True,
        "WEATHER_COND": True,
        "LATITUDE": False,
        "LONGITUD": False
    },
    color_discrete_sequence=["crimson"],
    zoom=3,
    height=600
)

# Styling
fig.update_layout(
    mapbox_style="open-street-map",
    title=" Fatal Crashes Across the U.S. (2022) â€“ Sample of 5,000",
    margin={"r":0,"t":40,"l":0,"b":0}
)

fig.show()


choropleth Map

In [23]:

# Group by state and count the number of crashes
state_counts = df['STATE'].value_counts().reset_index()
state_counts.columns = ['STATE', 'NUM_CRASHES']

# FARS uses numeric state codes, we need to map them to state abbreviations
state_code_map = {
    1: "AL", 2: "AK", 4: "AZ", 5: "AR", 6: "CA", 8: "CO", 9: "CT", 10: "DE", 11: "DC",
    12: "FL", 13: "GA", 15: "HI", 16: "ID", 17: "IL", 18: "IN", 19: "IA", 20: "KS",
    21: "KY", 22: "LA", 23: "ME", 24: "MD", 25: "MA", 26: "MI", 27: "MN", 28: "MS",
    29: "MO", 30: "MT", 31: "NE", 32: "NV", 33: "NH", 34: "NJ", 35: "NM", 36: "NY",
    37: "NC", 38: "ND", 39: "OH", 40: "OK", 41: "OR", 42: "PA", 44: "RI", 45: "SC",
    46: "SD", 47: "TN", 48: "TX", 49: "UT", 50: "VT", 51: "VA", 53: "WA", 54: "WV",
    55: "WI", 56: "WY"
}

# Add state abbreviations
state_counts['STATE_ABBR'] = state_counts['STATE'].map(state_code_map)

# Remove any rows without valid state abbreviation
state_counts = state_counts.dropna(subset=['STATE_ABBR'])

# Plot choropleth
fig = px.choropleth(
    state_counts,
    locations="STATE_ABBR",
    locationmode="USA-states",
    color="NUM_CRASHES",
    color_continuous_scale="Reds",
    scope="usa",
    title="Number of Fatal Crashes by U.S. State (2022)"
)

fig.update_layout(margin={"r":0,"t":50,"l":0,"b":0})
fig.show()


In [24]:

# Group by STATE and calculate:
state_summary = df.groupby("STATE").agg(
    NUM_CRASHES=('STATE', 'count'),
    TOTAL_FATALS=('FATALS', 'sum')
).reset_index()

# Map FARS state codes to abbreviations
state_code_map = {
    1: "AL", 2: "AK", 4: "AZ", 5: "AR", 6: "CA", 8: "CO", 9: "CT", 10: "DE", 11: "DC",
    12: "FL", 13: "GA", 15: "HI", 16: "ID", 17: "IL", 18: "IN", 19: "IA", 20: "KS",
    21: "KY", 22: "LA", 23: "ME", 24: "MD", 25: "MA", 26: "MI", 27: "MN", 28: "MS",
    29: "MO", 30: "MT", 31: "NE", 32: "NV", 33: "NH", 34: "NJ", 35: "NM", 36: "NY",
    37: "NC", 38: "ND", 39: "OH", 40: "OK", 41: "OR", 42: "PA", 44: "RI", 45: "SC",
    46: "SD", 47: "TN", 48: "TX", 49: "UT", 50: "VT", 51: "VA", 53: "WA", 54: "WV",
    55: "WI", 56: "WY"
}
state_summary['STATE_ABBR'] = state_summary['STATE'].map(state_code_map)

# State population estimates (2022)
state_pop = {
    "AL": 5074296, "AK": 733583, "AZ": 7359191, "AR": 3025891, "CA": 38801906,
    "CO": 5914528, "CT": 3605597, "DE": 1018396, "DC": 671803, "FL": 22377810,
    "GA": 11021643, "HI": 1449324, "ID": 1952306, "IL": 12582032, "IN": 6847569,
    "IA": 3193075, "KS": 2934582, "KY": 4512310, "LA": 4612217, "ME": 1395385,
    "MD": 6210510, "MA": 7051936, "MI": 10050811, "MN": 5774175, "MS": 2949965,
    "MO": 6219201, "MT": 1138820, "NE": 1961504, "NV": 3238726, "NH": 1395231,
    "NJ": 9267130, "NM": 2115877, "NY": 19376771, "NC": 10811367, "ND": 783926,
    "OH": 11780017, "OK": 4033364, "OR": 4246155, "PA": 12820889, "RI": 1094921,
    "SC": 5424064, "SD": 919318, "TN": 7159392, "TX": 30874007, "UT": 3432847,
    "VT": 647064, "VA": 8899534, "WA": 7911971, "WV": 1775156, "WI": 5892539,
    "WY": 586485
}
state_summary['POP_2022'] = state_summary['STATE_ABBR'].map(state_pop)

# Normalize values
state_summary['CRASHES_PER_100K'] = (state_summary['NUM_CRASHES'] / state_summary['POP_2022']) * 100000
state_summary['FATALS_PER_100K'] = (state_summary['TOTAL_FATALS'] / state_summary['POP_2022']) * 100000

# Remove any rows with missing population/state code
state_summary = state_summary.dropna(subset=['STATE_ABBR'])

# Choropleth 1: Crashes per 100K
fig1 = px.choropleth(
    state_summary,
    locations="STATE_ABBR",
    locationmode="USA-states",
    color="CRASHES_PER_100K",
    color_continuous_scale="Oranges",
    scope="usa",
    title=" Fatal Crashes per 100K Population by U.S. State (2022)"
)

# Choropleth 2: Fatalities per 100K
fig2 = px.choropleth(
    state_summary,
    locations="STATE_ABBR",
    locationmode="USA-states",
    color="FATALS_PER_100K",
    color_continuous_scale="Reds",
    scope="usa",
    title="Total Fatalities per 100K Population by U.S. State (2022)"
)

# Show both maps
fig1.show()
fig2.show()


2) AGGREGATION VISUALIZATIONS

Histogram

In [25]:
# Remap day of week to proper labels
day_map = {
    1: "Sunday", 2: "Monday", 3: "Tuesday", 4: "Wednesday",
    5: "Thursday", 6: "Friday", 7: "Saturday"
}

# Map to a new column if not already there
df['DAY_NAME'] = df['DAY_WEEK'].map(day_map)

# Drop rows where HOUR or DAY_NAME is missing
df_hist = df.dropna(subset=['HOUR', 'DAY_NAME'])


In [26]:

# Clean up: keep valid hours only
df_clean_hour = df.dropna(subset=['HOUR', 'DAY_NAME'])
df_clean_hour = df_clean_hour[df_clean_hour['HOUR'].between(0, 23)]

# Ensure 'HOUR' is integer for proper binning
df_clean_hour['HOUR'] = df_clean_hour['HOUR'].astype(int)

# Replot histogram
fig = px.histogram(
    df_clean_hour,
    x="HOUR",
    color="DAY_NAME",
    nbins=24,
    title=" Hourly Distribution of Fatal Crashes by Day of Week (Valid Hours Only)",
    labels={"HOUR": "Hour of Day", "count": "Number of Crashes"},
    barmode="group"
)

fig.update_layout(
    xaxis=dict(dtick=1),
    bargap=0.05
)
fig.show()


In [27]:

# Step 1: Clean the HOUR and DAY_NAME columns (if not already done)
df_filtered = df.dropna(subset=['HOUR', 'DAY_WEEK'])
df_filtered = df_filtered[df_filtered['HOUR'].between(0, 23)]
df_filtered['HOUR'] = df_filtered['HOUR'].astype(int)

# Step 2: Map day names
day_map = {
    1: "Sunday", 2: "Monday", 3: "Tuesday", 4: "Wednesday",
    5: "Thursday", 6: "Friday", 7: "Saturday"
}
df_filtered['DAY_NAME'] = df_filtered['DAY_WEEK'].map(day_map)

# Step 3: Add new column: Weekday vs Weekend
df_filtered['DAY_TYPE'] = df_filtered['DAY_NAME'].apply(
    lambda x: 'Weekend' if x in ['Saturday', 'Sunday'] else 'Weekday'
)

# Step 4: Plot interactive histogram by day type
fig = px.histogram(
    df_filtered,
    x="HOUR",
    color="DAY_TYPE",
    nbins=24,
    title="Fatal Crashes by Hour: Weekday vs Weekend",
    labels={"HOUR": "Hour of Day", "count": "Number of Crashes"},
    color_discrete_map={
        "Weekday": "#636EFA",  # blue
        "Weekend": "#EF553B"   # orange-red
    },
    barmode="group"
)

fig.update_layout(
    xaxis=dict(dtick=1),
    bargap=0.05
)
fig.show()


Boxplot

In [28]:
# Map weather codes to readable labels
weather_map = {
    1: "Clear", 2: "Rain", 3: "Sleet", 4: "Snow", 5: "Fog",
    6: "Severe Crosswinds", 7: "Blowing Sand/Snow", 8: "Other"
}
df['WEATHER_LABEL'] = df['WEATHER'].map(weather_map)

# Drop missing or unknown weather labels
df_weather = df.dropna(subset=['WEATHER_LABEL', 'FATALS'])

# Optional: sort by count of records per weather condition
weather_order = df_weather['WEATHER_LABEL'].value_counts().index.tolist()

# Custom colors for weather types
color_map = {
    "Clear": "#636EFA", "Rain": "#EF553B", "Fog": "#00CC96",
    "Snow": "#AB63FA", "Sleet": "#FFA15A", "Other": "#19D3F3",
    "Severe Crosswinds": "#FF6692", "Blowing Sand/Snow": "#B6E880"
}

# Plot: Boxplot (not strip) for fatalities by weather
fig = px.box(
    df_weather,
    x="WEATHER_LABEL",
    y="FATALS",
    color="WEATHER_LABEL",
    category_orders={"WEATHER_LABEL": weather_order},
    title=" Distribution of Fatalities per Crash by Weather Condition",
    labels={"FATALS": "Number of Fatalities", "WEATHER_LABEL": "Weather"},
    color_discrete_map=color_map,
    points="all"  # Show dots too
)

fig.update_layout(
    xaxis_tickangle=-30,
    showlegend=False,
    yaxis=dict(tickformat="d")
)

fig.show()


3)INTERACTIVE VISUALIZATIONS

Heat map with hover interactions

In [29]:
# Sample to avoid lag (or use entire set if performance allows)
df_heat = df[['LATITUDE', 'LONGITUD']].dropna()
df_heat = df_heat[
    (df_heat['LATITUDE'].between(24, 50)) & (df_heat['LONGITUD'].between(-125, -65))
]

# Sample for performance
df_sample = df_heat.sample(n=5000, random_state=42)

# Create improved density heatmap
fig = px.density_mapbox(
    df_sample,
    lat="LATITUDE",
    lon="LONGITUD",
    radius=6,  # smaller radius = sharper hotspots
    center={"lat": 37.5, "lon": -95},  # center over continental US
    zoom=3.8,
    mapbox_style="carto-positron",
    title=" Refined Heatmap of Fatal Crash Hotspots in the U.S. (Sample of 5,000)"
)

fig.update_layout(margin={"r":0,"t":50,"l":0,"b":0})
fig.show()



In [30]:
!pip install geopandas




Time-series visualizations with scrubbing

In [31]:
import plotly.express as px

# Group crashes by hour
hourly = df[df['HOUR'].between(0, 23)].groupby('HOUR').size().reset_index(name='crashes')

# Plot interactive time-like series
fig = px.line(
    hourly,
    x="HOUR",
    y="crashes",
    title="Time-Series of Fatal Crashes by Hour of Day",
    labels={"HOUR": "Hour of Day", "crashes": "Number of Crashes"}
)

fig.update_layout(
    xaxis=dict(
        rangeslider=dict(visible=True),  # enable scrubbing
        dtick=1
    )
)

fig.show()


Scatter plot

In [32]:
import plotly.express as px

# Drop missing
scatter_df = df.dropna(subset=["HOUR", "FATALS", "WEATHER"])
scatter_df = scatter_df[scatter_df["HOUR"].between(0, 23)]

# Map weather codes
weather_map = {
    1: "Clear", 2: "Rain", 3: "Sleet", 4: "Snow", 5: "Fog",
    6: "Severe Crosswinds", 7: "Blowing Sand/Snow", 8: "Other"
}
scatter_df["WEATHER_LABEL"] = scatter_df["WEATHER"].map(weather_map)

# Create scatter plot
fig = px.scatter(
    scatter_df.sample(n=3000, random_state=42),
    x="HOUR",
    y="FATALS",
    color="WEATHER_LABEL",
    hover_data=["STATE", "DAY_WEEK"],
    title="Fatalities by Hour Colored by Weather Condition",
    labels={"HOUR": "Hour of Day", "FATALS": "Number of Fatalities", "WEATHER_LABEL": "Weather"}
)

fig.show()


Sample for performance (Interactive map)

In [33]:
import plotly.express as px
import pandas as pd

# Prepare data
df_map = df.dropna(subset=["LATITUDE", "LONGITUD", "FATALS", "WEATHER"])
df_map = df_map[df_map["LATITUDE"].between(-90, 90) & df_map["LONGITUD"].between(-180, 180)]

# Map weather codes to labels
weather_map = {
    1: "Clear", 2: "Rain", 3: "Sleet", 4: "Snow", 5: "Fog",
    6: "Severe Crosswinds", 7: "Blowing Sand/Snow", 8: "Other"
}
df_map["WEATHER_LABEL"] = df_map["WEATHER"].map(weather_map)

# Sample for performance
df_sample = df_map.sample(n=3000, random_state=42)

# Create map
fig = px.scatter_mapbox(
    df_sample,
    lat="LATITUDE",
    lon="LONGITUD",
    color="WEATHER_LABEL",
    size="FATALS",
    size_max=10,
    zoom=3.5,
    height=600,
    mapbox_style="open-street-map",
    hover_name="WEATHER_LABEL",
    hover_data={"FATALS": True, "HOUR": True, "DAY_WEEK": True, "LATITUDE": False, "LONGITUD": False},
    title="Interactive Map of Fatal Crashes by Weather Condition (Sample of 3,000)"
)

fig.show()


Bar chart

In [34]:


# Group the data
df_bar = df.dropna(subset=["DAY_NAME", "WEATHER_LABEL"])
bar_data = df_bar.groupby(["DAY_NAME", "WEATHER_LABEL"]).size().reset_index(name="Count")

# Limit to top 4 most frequent weather types
top_weather = df["WEATHER_LABEL"].value_counts().nlargest(4).index.tolist()
bar_data_filtered = bar_data[bar_data["WEATHER_LABEL"].isin(top_weather)]

# Ensure correct day order
day_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
bar_data_filtered["DAY_NAME"] = pd.Categorical(bar_data_filtered["DAY_NAME"], categories=day_order, ordered=True)

# Plot grouped bar chart
fig = px.bar(
    bar_data_filtered,
    x="DAY_NAME",
    y="Count",
    color="WEATHER_LABEL",
    barmode="group",
    title="Fatal Crashes by Day of Week (Top 4 Weather Conditions)",
    labels={"DAY_NAME": "Day", "Count": "Number of Crashes", "WEATHER_LABEL": "Weather"},
    height=500
)

fig.update_layout(xaxis_tickangle=-30)
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

