<a href="https://colab.research.google.com/github/SaquibKhan-DS/311-Customer-Service-Optimization/blob/main/notebooks/02_exploratory_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 02_exploratory_data_analysis.ipynb

# -----------------------------
# Notebook 02: Exploratory Data Analysis
# -----------------------------
# This notebook focuses on analyzing complaint patterns,
# identifying top complaint types, and examining city-level distributions.
# -----------------------------

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# -----------------------------
# Step 1: Load dataset
# -----------------------------
# Read the raw 311 service-request export from the Kaggle input mount.
df = pd.read_csv('/kaggle/input/311-service-requests-nyc/311_Service_Requests_from_2010_to_Present.csv')

# Parse the timestamp columns for later use. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
df['created_dt'] = pd.to_datetime(df['Created Date'], errors='coerce')
df['closed_dt'] = pd.to_datetime(df['Closed Date'], errors='coerce')

# Keep only rows where 'Closed Date' exists. The explicit .copy() makes the
# filtered result an independent frame, so the column assignment below does
# not write into a slice of the original (SettingWithCopyWarning).
df = df[df['Closed Date'].notna()].copy()

# Fill missing city names. A vectorized fillna replaces the previous row-wise
# apply(axis=1): same result, one pass over a single column, and it also
# works when the filtered frame is empty.
df['City'] = df['City'].fillna('Unknown City')

In [None]:
# -----------------------------
# Step 2: Filter specific city for concentration study (Brooklyn example)
# -----------------------------
df_brooklyn = df.loc[df['City'] == 'BROOKLYN']

# Both plots below only need the coordinate columns.
brooklyn_coords = df_brooklyn[['Latitude','Longitude']]

# Scatter plot of complaints in Brooklyn by location.
# Labels are set on the Axes returned by pandas so they always land on the
# plot that was just drawn; x is Longitude and y is Latitude (the labels
# were previously swapped).
scatter_ax = brooklyn_coords.plot(
    kind='scatter',
    figsize=(15,10),
    x='Longitude',
    y='Latitude',
    title='Complaint Concentration Across BROOKLYN'
)
scatter_ax.set_xlabel('Longitude in Degrees')
scatter_ax.set_ylabel('Latitude in Degrees')

# Hexbin plot of complaints in Brooklyn.
# mincnt=1 hides hexagons that contain no complaints.
hexbin_ax = brooklyn_coords.plot(
    kind='hexbin',
    figsize=(15,10),
    colormap='Paired',
    mincnt=1,
    x='Longitude',
    y='Latitude',
    gridsize=25,
    title='Complaint Concentration Across BROOKLYN'
)
hexbin_ax.set_xlabel('Longitude in Degrees')
hexbin_ax.set_ylabel('Latitude in Degrees')

# Render both figures and suppress the Text(...) repr of the last call.
plt.show()

In [None]:
# -----------------------------
# Step 3: Complaint type analysis
# -----------------------------
# Compute the frequency table once and reuse it for both the printed
# summary and the bar chart.
complaint_type_counts = df['Complaint Type'].value_counts()

# Only the last expression of a cell is displayed automatically, so the
# summaries are printed explicitly instead of being silently discarded.
print(df['Complaint Type'].unique())  # List of unique complaint types
print(complaint_type_counts)          # Frequency of each complaint type

# Plot overall complaint type distribution
counts_ax = complaint_type_counts.plot(kind='bar', figsize=(20, 10))
counts_ax.set_title('Complaint Type VS Count')
counts_ax.set_xlabel("Complaint Type")
counts_ax.set_ylabel("Count")
plt.show()

In [None]:
# -----------------------------
# Step 4: City-specific complaint analysis
# -----------------------------
# Select the rows whose City is exactly "NEW YORK".
is_new_york = df['City'] == "NEW YORK"
df_ny = df.loc[is_new_york]

# Bar chart of how often each complaint type occurs in New York.
ny_type_counts = df_ny['Complaint Type'].value_counts()
ny_ax = ny_type_counts.plot(kind='bar', figsize=(20,10))
ny_ax.set_title('Count by Complaint Type for New York')
ny_ax.set_xlabel('Complaint Type')
ny_ax.set_ylabel('Count')
plt.show()

# The ten most frequent complaint types across the whole frame,
# displayed as the cell's output.
complaints_per_type = df.groupby(['Complaint Type']).size()
df_top10_ctypes = complaints_per_type.nlargest(10)
df_top10_ctypes

In [None]:
# -----------------------------
# Step 5: Complaint type plots for each city
# -----------------------------
# Distinct cities in order of first appearance (kept as a module-level name).
city_list = df['City'].unique()

# A single groupby pass replaces one full-frame boolean scan per city.
# sort=False keeps the same first-appearance order as city_list.
for c, df_c in df.groupby('City', sort=False):
    city_type_counts = df_c['Complaint Type'].value_counts()
    if city_type_counts.empty:
        # No complaint types recorded for this city: nothing to draw.
        continue

    # Draw on an explicit figure so it can be closed after display;
    # otherwise one open figure per city can accumulate in memory.
    fig, ax = plt.subplots(figsize=(20,10))
    city_type_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Count by Complaint Type for {c}')
    ax.set_xlabel('Complaint Type')
    ax.set_ylabel('Count')
    plt.show()
    plt.close(fig)
    print('================================================')

In [None]:
# -----------------------------
# Step 6: Stacked bar chart of complaints per city
# -----------------------------
# Count rows for every (City, Complaint Type) pair, then pivot the complaint
# types into columns; pairs that never occur become 0.
city_type_sizes = df.groupby(['City','Complaint Type']).size()
df_complainttypes = city_type_sizes.unstack().fillna(0)

# One bar per city, stacked by complaint type.
stacked_ax = df_complainttypes.plot(
    kind='bar',
    figsize=(20,10),
    stacked=True,
    colormap='Paired',
)
stacked_ax.set_ylabel('Number of Complaints')
stacked_ax.set_title('Number of Complaints vs. City')