<a href="https://colab.research.google.com/github/Salmakhaled204/nyc-collisions-w25/blob/Salma/Member_2_Data_Viz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display



# Notebook-wide display defaults: show up to 100 DataFrame columns and draw
# every figure with seaborn's "whitegrid" style at "notebook" scale.
pd.options.display.max_columns = 100
sns.set(context="notebook", style="whitegrid")


In [None]:
import pandas as pd

# Source CSV exports for the crash-level and person-level tables.
crashes_url = "https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=download"
persons_url = "https://data.cityofnewyork.us/api/views/f55k-p6yu/rows.csv?accessType=download"

# low_memory=False parses each file in one pass, so a column's dtype is inferred
# from the whole column instead of chunk by chunk.
df_crashes = pd.read_csv(crashes_url, low_memory=False)
df_persons = pd.read_csv(persons_url, low_memory=False)

# Only the last expression of a cell is rendered automatically. The crashes
# preview therefore has to be displayed explicitly; as a bare expression in the
# middle of the cell it was silently discarded.
display(df_crashes.head())
df_persons.head()

In [None]:
# MEMBER 2 — Deep EDA + Statistical Exploration
# numpy, matplotlib and seaborn are already imported in the notebook's first
# cell; they are re-imported here together with the IPython display helpers.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

# Re-applies the same seaborn theme that the first cell sets.
sns.set(style="whitegrid", context="notebook")

In [None]:
# 1. EXTRA TIME FEATURES
# read_csv is called without parse_dates, so "CRASH DATE" arrives as plain
# strings and the .dt accessor would raise AttributeError. Parse it first.
# Re-running this cell is safe: an already-parsed datetime column passes through.
# NOTE(review): the format assumes MM/DD/YYYY date strings in the CSV export —
# confirm against the file; values that do not match become NaT.
df_crashes["CRASH DATE"] = pd.to_datetime(
    df_crashes["CRASH DATE"], format="%m/%d/%Y", errors="coerce"
)

# "YYYY-MM" label per crash. strftime leaves missing dates as NaN (rather than
# the literal string "NaT"), so they drop out of later group-bys.
df_crashes["CRASH_MONTH"] = df_crashes["CRASH DATE"].dt.strftime("%Y-%m")

# Hour of day (0-23); times that do not parse as HH:MM become NaN.
df_crashes["CRASH_HOUR"] = pd.to_datetime(
    df_crashes["CRASH TIME"], format="%H:%M", errors="coerce"
).dt.hour

In [None]:
# 2. CRASH PATTERNS OVER TIME (TIME-SERIES)

# --- Daily crashes: one row per crash date with its number of crashes ---
daily_sizes = (
    df_crashes
    .dropna(subset=["CRASH DATE"])
    .groupby("CRASH DATE")
    .size()
)
crashes_daily = daily_sizes.reset_index(name="CRASH_COUNT")

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(crashes_daily["CRASH DATE"], crashes_daily["CRASH_COUNT"])
ax.set_title("Daily Crash Counts Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Crashes")
fig.tight_layout()
plt.show()

# --- Monthly crashes: one row per CRASH_MONTH label, ordered by label ---
monthly_sizes = df_crashes.groupby("CRASH_MONTH").size()
crashes_monthly = (
    monthly_sizes
    .reset_index(name="CRASH_COUNT")
    .sort_values("CRASH_MONTH")
)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(crashes_monthly["CRASH_MONTH"], crashes_monthly["CRASH_COUNT"])
ax.set_title("Monthly Crash Counts")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Crashes")
# Month labels are long strings; turn them vertical so they do not overlap.
ax.tick_params(axis="x", labelrotation=90)
fig.tight_layout()
plt.show()

# --- Crashes by hour of day ---
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df_crashes, x="CRASH_HOUR", ax=ax)
ax.set_title("Crash Distribution by Hour of Day")
ax.set_xlabel("Hour")
ax.set_ylabel("Crash Count")
fig.tight_layout()
plt.show()

In [None]:
# 3. CONTRIBUTING FACTORS
factor_col = "CONTRIBUTING FACTOR VEHICLE 1"

# Fill missing factors once and reuse the same series for both the ranking and
# the borough breakdown. Filtering the raw column instead would drop the rows
# that the ranking counted under "Unspecified", making the two charts disagree.
factor_filled = df_crashes[factor_col].fillna("Unspecified")

top_factors = factor_filled.value_counts().head(10)

plt.figure(figsize=(10,5))
sns.barplot(x=top_factors.values, y=top_factors.index)
plt.title("Top 10 Contributing Factors (Vehicle 1)")
plt.xlabel("Crash Count")
plt.ylabel("Contributing Factor")
plt.tight_layout()
plt.show()

# Grouped bar: top factors by borough (only top 5 factors)
top5_factors = top_factors.index[:5]
in_top5 = factor_filled.isin(top5_factors)

# Build a slim two-column frame rather than copying the whole crash table.
# groupby drops rows whose BOROUGH is missing by default, so no separate
# dropna step is needed afterwards.
factor_borough = (
    pd.DataFrame({
        "BOROUGH": df_crashes.loc[in_top5, "BOROUGH"],
        factor_col: factor_filled[in_top5],
    })
    .groupby(["BOROUGH", factor_col])
    .size()
    .reset_index(name="COUNT")
)

plt.figure(figsize=(12,6))
sns.barplot(
    data=factor_borough,
    x="BOROUGH",
    y="COUNT",
    hue=factor_col
)
plt.title("Top Contributing Factors by Borough")
plt.xlabel("Borough")
plt.ylabel("Crash Count")
plt.xticks(rotation=45)
plt.legend(title="Factor")
plt.tight_layout()
plt.show()


In [None]:
# 4. INJURIES & FATALITIES DISTRIBUTION
# Column names follow "NUMBER OF <group> <outcome>"; generate them in the order
# injured/killed for persons, pedestrians, cyclists, then motorists.
injury_cols = [
    f"NUMBER OF {road_user} {outcome}"
    for road_user in ("PERSONS", "PEDESTRIANS", "CYCLIST", "MOTORIST")
    for outcome in ("INJURED", "KILLED")
]

# Coerce every injury/fatality column that exists to numeric; values that
# cannot be parsed become NaN.
for col_name in (c for c in injury_cols if c in df_crashes.columns):
    df_crashes[col_name] = pd.to_numeric(df_crashes[col_name], errors="coerce")

# Histograms of persons injured and persons killed per crash. The y-axis is
# logarithmic so the long tail stays visible.
histogram_specs = [
    (
        "NUMBER OF PERSONS INJURED",
        "Distribution of Number of Persons Injured per Crash",
        "Persons Injured",
    ),
    (
        "NUMBER OF PERSONS KILLED",
        "Distribution of Number of Persons Killed per Crash",
        "Persons Killed",
    ),
]
for outcome_col, outcome_title, outcome_label in histogram_specs:
    fig, ax = plt.subplots(figsize=(8, 4))
    df_crashes[outcome_col].dropna().hist(bins=40, ax=ax)
    ax.set_title(outcome_title)
    ax.set_xlabel(outcome_label)
    ax.set_ylabel("Frequency")
    ax.set_yscale("log")
    fig.tight_layout()
    plt.show()

# Kernel density plots: injured vs killed, overlaid on the same axes.
fig, ax = plt.subplots(figsize=(8, 4))
for outcome_col, legend_label in (
    ("NUMBER OF PERSONS INJURED", "Injured"),
    ("NUMBER OF PERSONS KILLED", "Killed"),
):
    sns.kdeplot(
        df_crashes[outcome_col].dropna(),
        label=legend_label,
        fill=True,
        ax=ax,
    )
ax.set_title("KDE – Injury vs Fatality Severity")
ax.set_xlabel("Count")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# 5. VEHICLE TYPES
veh_col = "VEHICLE TYPE CODE 1"

# Fill missing vehicle types once and reuse the same series for both the ranking
# and the borough breakdown. Filtering the raw column instead would drop the
# rows that the ranking counted under "Unknown".
veh_filled = df_crashes[veh_col].fillna("Unknown")

top_veh = veh_filled.value_counts().head(10)

plt.figure(figsize=(10,5))
sns.barplot(x=top_veh.values, y=top_veh.index)
plt.title("Top 10 Vehicle Types Involved in Crashes")
plt.xlabel("Crash Count")
plt.ylabel("Vehicle Type")
plt.tight_layout()
plt.show()

# Grouped bar: vehicle type vs BOROUGH (top 5 types)
top5_veh = top_veh.index[:5]
in_top5_veh = veh_filled.isin(top5_veh)

# Build a slim two-column frame rather than copying the whole crash table.
# groupby drops rows whose BOROUGH is missing by default, so no separate
# dropna step is needed afterwards.
veh_borough = (
    pd.DataFrame({
        "BOROUGH": df_crashes.loc[in_top5_veh, "BOROUGH"],
        veh_col: veh_filled[in_top5_veh],
    })
    .groupby(["BOROUGH", veh_col])
    .size()
    .reset_index(name="COUNT")
)

plt.figure(figsize=(12,6))
sns.barplot(
    data=veh_borough,
    x="BOROUGH",
    y="COUNT",
    hue=veh_col
)
plt.title("Top Vehicle Types by Borough")
plt.xlabel("Borough")
plt.ylabel("Crash Count")
plt.xticks(rotation=45)
plt.legend(title="Vehicle Type")
plt.tight_layout()
plt.show()

In [None]:
# 6. DEMOGRAPHIC PATTERNS (PERSONS TABLE)

# Age distribution over every non-missing PERSON_AGE value (no range filter).
fig, ax = plt.subplots(figsize=(10, 4))
df_persons["PERSON_AGE"].dropna().hist(bins=40, ax=ax)
ax.set_title("Age Distribution of Persons in Crashes")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# KDE by injury status: keep ages within 0-100 (inclusive) that also have a
# recorded PERSON_INJURY value.
age_in_range = df_persons["PERSON_AGE"].between(0, 100)
injury_recorded = df_persons["PERSON_INJURY"].notna()
persons_age = df_persons[age_in_range & injury_recorded]

fig, ax = plt.subplots(figsize=(10, 4))
sns.kdeplot(
    data=persons_age,
    x="PERSON_AGE",
    hue="PERSON_INJURY",
    fill=True,
    common_norm=False,  # normalise each injury outcome's density separately
    ax=ax,
)
ax.set_title("Age Density by Injury Outcome")
ax.set_xlabel("Age")
fig.tight_layout()
plt.show()

# Horizontal count plots: gender, then person type (Driver/Passenger/Pedestrian).
count_specs = [
    ((6, 4), "PERSON_SEX", "Gender Distribution of Persons in Crashes", "Gender"),
    ((8, 4), "PERSON_TYPE", "Person Type Distribution", "Type"),
]
for fig_size, category_col, chart_title, axis_label in count_specs:
    fig, ax = plt.subplots(figsize=fig_size)
    sns.countplot(data=df_persons, y=category_col, ax=ax)
    ax.set_title(chart_title)
    ax.set_ylabel(axis_label)
    fig.tight_layout()
    plt.show()

In [None]:
# 7. CORRELATION HEATMAP (INJURIES & FATALITIES)
# Restrict to the injury/fatality columns that are actually present, then
# compute their pairwise correlation matrix.
corr_cols = [name for name in injury_cols if name in df_crashes.columns]
injury_frame = df_crashes[corr_cols]
corr = injury_frame.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap – Injuries & Fatalities")
fig.tight_layout()
plt.show()


# ⭐ Member 2 — Deep EDA Findings Summary

## 1. Crash Patterns Over Time
A time-series exploration of crash occurrences revealed clear temporal trends:

- **Daily trends:** Crash counts show consistent day-to-day fluctuations, with occasional spikes likely related to weather, traffic surges, or special events.
- **Monthly trends:** Certain months exhibit noticeably higher crash volumes, reflecting seasonal driving conditions or population activity patterns.
- **Hourly distribution:** Crashes peak during **morning rush hours (7–9 AM)** and **evening rush hours (4–7 PM)**. The lowest crash frequencies occur after midnight, when road activity is minimal.

These findings indicate strong temporal patterns that can help inform traffic control and public safety planning.

---

## 2. Contributing Factors (Reason Analysis)
Analysis of the *CONTRIBUTING FACTOR VEHICLE 1* field shows that a small set of reasons dominate overall crash causation:

- The most common causes include **driver inattention/distraction**, **following too closely**, **failure to yield**, and **improper passing or lane usage**.
- When disaggregated by borough, some factors appear more frequently in specific areas—for example, **driver inattention** is more prevalent in dense boroughs with heavy traffic.

This highlights how human behavior remains the primary root cause of most collisions.

---

## 3. Injuries & Fatalities Distribution
Exploring injury-related columns uncovered key severity patterns:

- Most crashes involve **zero to a small number of injuries**, producing a highly right-skewed distribution.
- Fatalities are very rare but significant—most fatal crashes involve **one death**, with very few high-fatality events.
- Kernel density curves show a clear difference between injury counts and fatality counts, with injuries distributed across a wider range.

Understanding these severity profiles helps prioritize high-impact safety interventions.

---

## 4. Vehicle Types
Analysis of the vehicle categories involved in crashes shows:

- The majority of collisions involve **passenger vehicles** such as sedans, SUVs, vans, and taxis.
- Trucks, buses, and motorcycles appear less frequently but exhibit distinct patterns in certain boroughs.
- Grouped bar charts reveal borough-specific differences (e.g., more taxis and buses in central/dense boroughs compared to more private vehicles in outer areas).

Vehicle-type concentration reflects underlying transportation characteristics in each borough.

---

## 5. Demographic Patterns (Persons Table)
Demographic exploration of the persons dataset uncovered:

- **Age distribution:** Despite the presence of outliers and inconsistent values, the overall pattern suggests that individuals aged **20–50 years** are most represented in crash involvement.
- **Gender distribution:** Males are more frequently involved than females across nearly all categories of crash involvement.
- **Person type:** Drivers make up the largest share, followed by passengers and then pedestrians, aligning with typical traffic exposure levels.

These demographic insights reveal which population segments are most affected by crashes.

---

## 6. Correlation Patterns
A correlation heatmap of injury-related variables shows:

- Strong positive correlations between **NUMBER OF PERSONS INJURED** and its sub-group columns: **NUMBER OF PEDESTRIANS INJURED**, **NUMBER OF CYCLIST INJURED**, and **NUMBER OF MOTORIST INJURED**.
- Fatality-related variables show weaker correlations due to their rarity.

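These correlations suggest that multi-victim crash events may affect several road-user categories simultaneously, although part of the relationship is expected by construction if the overall injured count aggregates the sub-group counts.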

---

## 7. Key Observations & Handover to Member 3
- Crash occurrences follow strong **time-based patterns**, heavily influenced by commuting behavior.
- **Human factors**, such as distraction and failure to yield, dominate crash causation.
- Injury severity is highly skewed, with most crashes being minor but a small number being extremely severe.
- Vehicle involvement patterns align with borough-level transportation characteristics.
- Demographic patterns show that working-age adults and males are the most frequently affected groups.
- Raw demographic fields (especially **PERSON_AGE**) contain outliers and inconsistent values.  
  These data-quality issues are explicitly handed off to **Member 3**, whose role is to perform
  systematic cleaning, outlier treatment, and standardization before integration.