# Chicago Crime Data Exploration

This notebook explores the theft crime data from the Chicago Data Portal, focusing on pocket-picking and purse-snatching incidents.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the project's `src` package importable by appending the parent of the
# current working directory to the module search path.
# NOTE(review): assumes the notebook is run from a folder directly under the
# project root — confirm.
sys.path.append(os.path.abspath('..'))

from src.data.data_loader import ChicagoCrimeDataLoader

# Apply seaborn's "whitegrid" style and set matplotlib's default font size to 12
sns.set(style="whitegrid")
plt.rcParams.update({'font.size': 12})

## Load the Data

First, we'll load the theft crime data from the Chicago Data Portal.

In [None]:
# Build the loader object used to fetch the theft records
data_loader = ChicagoCrimeDataLoader()

# Request theft records, capped at 5000 rows (raise or lower `limit` as needed)
df = data_loader.fetch_theft_data(limit=5000)

# Report the frame's dimensions, then each column's dtype
print(f"Dataset shape: {df.shape}")
print("\nColumn Data Types:", df.dtypes, sep="\n")

# Preview the first five rows as the cell's rich output
df.head(5)

## Data Overview

Let's get an overview of the dataset and check for missing values.

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include="all")

In [None]:
# Null count per column, and the same count as a percentage of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.mul(100).div(len(df))

# Place both measures side by side, one row per column of `df`
missing_data = pd.concat(
    [missing_values, missing_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage'],
)

# Show the ten columns with the most missing entries
missing_data.sort_values(by='Missing Values', ascending=False).iloc[:10]

## Data Preprocessing

Let's preprocess the data for analysis.

In [None]:
# Parse the raw date column into pandas datetimes
df['date'] = pd.to_datetime(df['date'])

# Derive calendar features through a single .dt accessor
when = df['date'].dt
df['hour'] = when.hour
df['day_of_week'] = when.day_name()
df['month'] = when.month
df['year'] = when.year
df['day'] = when.day
df['is_weekend'] = when.dayofweek >= 5  # dayofweek counts from Monday=0, so 5/6 are Sat/Sun

# Coerce whichever coordinate columns are present to numbers;
# values that cannot be parsed become NaN
coordinate_cols = ['latitude', 'longitude', 'x_coordinate', 'y_coordinate']
for col in coordinate_cols:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Normalise the arrest flag: accepts the strings 'true'/'false' as well as real
# booleans; any other value maps to NaN
arrest_lookup = {'true': True, 'false': False, True: True, False: False}
df['arrest'] = df['arrest'].map(arrest_lookup)

# Flag rows lacking either coordinate, report how many, then drop them
missing_coords = df[['latitude', 'longitude']].isna().any(axis=1)
print(f"Records with missing coordinates: {missing_coords.sum()} ({missing_coords.sum()/len(df)*100:.2f}%)")
df = df.dropna(subset=['latitude', 'longitude'])
print(f"Records after dropping missing coordinates: {df.shape[0]}")

## Exploratory Data Analysis

Now, let's explore the data to understand patterns in theft crimes.

In [None]:
# Number of records under each theft description, most frequent first
theft_counts = df['description'].value_counts()

# Draw the counts as a bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
theft_counts.plot.bar(ax=ax, color='steelblue')
ax.set_title('Distribution of Theft Types')
ax.set_xlabel('Theft Type')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the category labels
fig.tight_layout()
plt.show()

In [None]:
# Crimes by hour of day
# Reindex onto all 24 hours so that hours with no records show up as zero-height
# bars. seaborn draws bars at categorical positions 0..k-1, so if an hour were
# dropped the bars would no longer sit at their hour on the 0-23 axis set below.
hour_counts = df['hour'].value_counts().reindex(range(24), fill_value=0)

plt.figure(figsize=(12, 6))
sns.barplot(x=hour_counts.index, y=hour_counts.values, color='steelblue')
plt.title('Theft Crimes by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Count')
plt.xticks(range(24))
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Fix the weekday order so the bars run Monday through Sunday rather than by frequency
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = df['day_of_week'].value_counts().reindex(day_order)

# Bar chart of record counts per weekday
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=day_counts.index, y=day_counts.values, color='steelblue', ax=ax)
ax.set(title='Theft Crimes by Day of Week', xlabel='Day of Week', ylabel='Count')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Crimes by month
plt.figure(figsize=(12, 6))
month_counts = df['month'].value_counts().sort_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Cast to int before indexing: the month column can be float (e.g. when a date
# is missing), and a float cannot be used as a list index.
month_labels = [month_names[int(i) - 1] for i in month_counts.index]
sns.barplot(x=month_labels, y=month_counts.values, color='steelblue')
plt.title('Theft Crimes by Month')
plt.xlabel('Month')
plt.ylabel('Count')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Arrest rate for theft crimes
plt.figure(figsize=(10, 6))
# Pin the order to [False, True] so every count lines up with its label and
# colour: value_counts() sorts by frequency and leaves out outcomes that never
# occur, which would otherwise swap the labels or break the lookup below.
arrest_counts = df['arrest'].value_counts().reindex([False, True], fill_value=0)
arrest_percent = 100 * arrest_counts / arrest_counts.sum()

plt.pie(arrest_counts, labels=['Not Arrested', 'Arrested'], autopct='%1.1f%%', startangle=90,
        colors=['#FF9999', '#66B2FF'], wedgeprops={'edgecolor': 'w'})
plt.axis('equal')
plt.title('Arrest Rate for Theft Crimes')
plt.tight_layout()
plt.show()

print(f"Arrest rate: {arrest_percent[True]:.2f}%")

In [None]:
# The ten most frequent location descriptions (value_counts is already sorted descending)
location_counts = df['location_description'].value_counts().iloc[:10]

# Horizontal bars: location on the y-axis, record count on the x-axis
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(y=location_counts.index, x=location_counts.values, color='steelblue', ax=ax)
ax.set(title='Top 10 Locations for Theft Crimes', xlabel='Count', ylabel='Location')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## Time Series Analysis

Let's analyze how theft crimes have changed over time.

In [None]:
# Index the records by timestamp so they can be bucketed by calendar period
df_time = df.set_index('date')

# Number of records in each monthly bucket
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of 'ME' —
# confirm against the project's pinned pandas version before changing it.
monthly_counts = df_time.resample('M').size()

# Line chart of the monthly totals
fig, ax = plt.subplots(figsize=(14, 7))
monthly_counts.plot(ax=ax)
ax.set_title('Monthly Theft Crimes')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Crimes')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## Spatial Analysis

Let's examine the spatial distribution of theft crimes.

In [None]:
# Keep only the rows that carry both a latitude and a longitude
df_map = df.dropna(subset=['latitude', 'longitude'])

# Build a folium heatmap when folium is importable; otherwise fall back to a
# static matplotlib scatter plot
try:
    import folium
    from folium.plugins import HeatMap

    # Centre the base map on Chicago
    chicago_coords = [41.8781, -87.6298]
    m = folium.Map(
        location=chicago_coords,
        zoom_start=11,
        tiles='CartoDB positron',
    )

    # One [latitude, longitude] pair per record feeds the heat layer
    heat_data = df_map[['latitude', 'longitude']].to_numpy().tolist()
    heat_layer = HeatMap(heat_data, radius=15, blur=10, max_zoom=13)
    heat_layer.add_to(m)

    # Write the map to an HTML file (this cell does not render it inline)
    m.save('theft_heatmap.html')
except ImportError:
    # folium is unavailable: plot the raw coordinates instead
    fig, ax = plt.subplots(figsize=(12, 10))
    ax.scatter(df_map['longitude'], df_map['latitude'], alpha=0.5, s=5)
    ax.set_title('Spatial Distribution of Theft Crimes')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()

## Correlation Analysis

Let's examine pairwise correlations between the derived time features (hour, day, month, year, weekend flag) and the arrest flag.

In [None]:
# Columns that enter the correlation: derived time features plus the arrest flag
numerical_cols = ['hour', 'month', 'year', 'day', 'is_weekend', 'arrest']
numerical_df = df.loc[:, numerical_cols]

# Pairwise correlation between the selected columns
corr_matrix = numerical_df.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix of Features')
fig.tight_layout()
plt.show()

## Arrest Analysis

Let's analyze factors related to arrests.

In [None]:
# Mean of the arrest flag within each hour, i.e. the share of records with an arrest
arrest_by_hour = df.groupby('hour')['arrest'].mean().sort_index()

# Line chart with one marker per hour; the y-axis starts at zero and leaves
# 10% headroom above the highest rate
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=arrest_by_hour.index, y=arrest_by_hour.values, marker='o', ax=ax)
ax.set(title='Arrest Rate by Hour of Day', xlabel='Hour', ylabel='Arrest Rate')
ax.set_xticks(range(24))
ax.set_ylim(0, arrest_by_hour.max() * 1.1)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Mean of the arrest flag per weekday, arranged in the `day_order` sequence
# defined in the earlier day-of-week cell
arrest_by_day = df.groupby('day_of_week')['arrest'].mean().reindex(day_order)

# Bar chart of the per-weekday arrest rate
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=arrest_by_day.index, y=arrest_by_day.values, color='steelblue', ax=ax)
ax.set(title='Arrest Rate by Day of Week', xlabel='Day of Week', ylabel='Arrest Rate')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Mean of the arrest flag within each theft description
arrest_by_type = df.groupby('description')['arrest'].mean()

# Bar chart of the per-type arrest rate with tilted category labels
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=arrest_by_type.index, y=arrest_by_type.values, color='steelblue', ax=ax)
ax.set(title='Arrest Rate by Theft Type', xlabel='Theft Type', ylabel='Arrest Rate')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## Summary of Findings

Key insights from the exploratory data analysis:

1. Temporal patterns: **TODO** — record your observations about time patterns (hour of day, day of week, month).
2. Spatial distribution: **TODO** — record your observations about location patterns.
3. Arrest rates: **TODO** — record your observations about arrests.
4. Location types: **TODO** — record your observations about common locations.

These insights will guide our feature engineering and model development in the next phase.