In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the dataset
# The CSV location can be overridden with the US_ACCIDENTS_CSV environment
# variable so the notebook also runs on machines other than the original
# author's. When the variable is unset or empty, the original path is used.
dataset_path = (
    os.environ.get("US_ACCIDENTS_CSV")
    or "C:/Users/win10/Desktop/US_Accidents_March23.csv"
)
accidents_df = pd.read_csv(dataset_path)

### Univariate Analysis
**Definition:**

Univariate analysis examines and describes a single variable or feature in a dataset. The goal is to understand the distribution, central tendency (mean, median, mode), spread (variance, standard deviation), and shape (skewness, kurtosis) of that variable. Common tools include frequency counts, histograms, bar charts, and boxplots.

In [None]:
# --- UNIVARIATE ANALYSIS ---
# Bar chart of how many rows fall into each 'Severity' value.
# NOTE(review): newer seaborn releases deprecate `palette` without `hue`;
# confirm the installed version before changing this call.
fig, ax = plt.subplots()
sns.countplot(data=accidents_df, x='Severity', palette='pastel', ax=ax)
ax.set_title('Univariate: Accident Severity Distribution')
plt.show()

### Bivariate Analysis
**Definition:**

Bivariate analysis explores the relationship between exactly two variables (X and Y) to discover whether they are related and, if so, how. It helps analyze the nature, strength, and direction of the association—using methods such as correlation, scatter plots, cross-tabulation, boxplots, and regression. Examples include height vs. weight, or accident severity vs. weather condition.

In [None]:
# --- BIVARIATE ANALYSIS ---
# Box plot of Severity (y axis) for each of the eight most frequent
# Weather_Condition values (x axis).
top_weather = accidents_df['Weather_Condition'].value_counts().nlargest(8)

# Boolean mask selecting only rows whose weather condition is in the top eight.
in_top_weather = accidents_df['Weather_Condition'].isin(top_weather.index)

fig, ax = plt.subplots()
sns.boxplot(data=accidents_df[in_top_weather], x='Weather_Condition', y='Severity', ax=ax)
ax.set_title('Bivariate: Severity by Top Weather Conditions')
plt.xticks(rotation=30)
plt.show()

### Multivariate Analysis
**Definition:**

Multivariate analysis studies three or more variables simultaneously to uncover complex relationships, patterns, and interactions among them. It can show how multiple factors together influence an outcome—using techniques like heatmaps, pairplots, multiple regression, and principal component analysis. Examples include examining how weather, hour, and traffic conditions together affect accident severity.

In [None]:
# --- MULTIVARIATE ANALYSIS ---
# Heatmap of mean Severity for every (Hour, Weather_Condition) pair, limited
# to the weather conditions listed in `top_weather`.

# Add the hour-of-day column only if it is not already present, so re-running
# this cell does not recompute it. Start_Time values that cannot be parsed
# become NaT because of errors='coerce'.
if 'Hour' not in accidents_df.columns:
    start_times = pd.to_datetime(accidents_df['Start_Time'], errors='coerce')
    accidents_df['Hour'] = start_times.dt.hour

top_weather_rows = accidents_df[accidents_df['Weather_Condition'].isin(top_weather.index)]
pivot = top_weather_rows.pivot_table(
    index='Hour',
    columns='Weather_Condition',
    values='Severity',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt=".2f", cmap='YlGnBu', ax=ax)
ax.set_title('Multivariate: Mean Severity by Hour and Weather Condition')
ax.set_ylabel('Hour')
ax.set_xlabel('Weather Condition')
plt.show()

### Basic Statistics

In [None]:
# --- Summary statistics for all numerical columns ---
# The describe() table is transposed so each column of the frame becomes a row.
print("Summary statistics for numerical columns:")
summary_stats = accidents_df.describe()
display(summary_stats.T)

In [None]:
# --- Additional statistics: median for each numerical column ---
# numeric_only=True skips non-numeric columns.
print("\nMedian values for numerical columns:")
column_medians = accidents_df.median(numeric_only=True)
display(column_medians)

In [None]:
# --- Explore distributions and ranges for key features ---

# 1. Severity
# Row counts per Severity value, ordered by the Severity value itself.
severity_counts = accidents_df['Severity'].value_counts().sort_index()
print("\nSeverity value counts:")
print(severity_counts)

fig, ax = plt.subplots(figsize=(7, 4))
sns.countplot(data=accidents_df, x='Severity', palette='Blues', ax=ax)
ax.set_title('Distribution of Accident Severity')
plt.show()

In [None]:
# 2. Temperature
# Histogram with a KDE overlay of the non-missing temperature values,
# followed by the observed minimum and maximum.
temperature_f = accidents_df['Temperature(F)']

fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(temperature_f.dropna(), kde=True, color='coral', ax=ax)
ax.set_title('Distribution of Temperature (F)')
ax.set_xlabel('Temperature (F)')
plt.show()

print("Temperature (F): min =", temperature_f.min(),
      "| max =", temperature_f.max())

In [None]:
# 3. Visibility
# Histogram with a KDE overlay of the non-missing visibility values,
# followed by the observed minimum and maximum.
visibility_mi = accidents_df['Visibility(mi)']

fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(visibility_mi.dropna(), kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Visibility (mi)')
ax.set_xlabel('Visibility (mi)')
plt.show()

print("Visibility (mi): min =", visibility_mi.min(),
      "| max =", visibility_mi.max())

In [None]:
# 4. Distance
# A log axis cannot represent zero or negative values, so only strictly
# positive distances are plotted. The number of rows left out of the figure
# is printed so the reader knows how much data the histogram omits.
distance_mi = accidents_df['Distance(mi)'].dropna()
positive_distance_mi = distance_mi[distance_mi > 0]
excluded_count = len(distance_mi) - len(positive_distance_mi)

plt.figure(figsize=(7, 4))
# Use log scale due to highly skewed distance distribution
sns.histplot(positive_distance_mi, kde=True, color='orange', log_scale=True)
plt.title('Distribution of Accident Distance (mi) [Log Scale]')
plt.xlabel('Distance (mi)')
plt.show()
if excluded_count:
    print("Non-positive distances excluded from log-scale plot:", excluded_count)
# The min/max are still reported over the full column, including any zeros.
print("Distance (mi): min =", accidents_df['Distance(mi)'].min(),
      "| max =", accidents_df['Distance(mi)'].max())

In [None]:
# 5. Explore Weather Condition categories
# Frequency table of the ten most common weather conditions (missing values
# excluded), reused as the bar order for the chart below.
top10_weather_counts = accidents_df['Weather_Condition'].value_counts(dropna=True).head(10)
print("\nMost frequent Weather Conditions:")
print(top10_weather_counts)

fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(
    data=accidents_df,
    y='Weather_Condition',
    order=top10_weather_counts.index,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Weather Conditions in Accidents')
plt.show()

In [None]:
# 6. Explore distribution over latitude and longitude (range)
# Print the smallest and largest start coordinates found in the data.
start_lat = accidents_df['Start_Lat']
start_lng = accidents_df['Start_Lng']
print("\nLatitude range:", start_lat.min(), "to", start_lat.max())
print("Longitude range:", start_lng.min(), "to", start_lng.max())