# Chicago Crime Analysis - Feature Engineering and Model Preparation

This notebook focuses on feature engineering, data preparation for modeling, and initial model setup based on our exploratory data analysis from notebook 1.

In [None]:
# Core data-handling and plotting stack used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import sys

# Add project root to path for imports
# (the absolute path of the current working directory's parent is appended,
# which is what lets the `src.*` imports below resolve)
sys.path.append(os.path.abspath('..'))

# Project-local helpers: data loader, preprocessor and feature engineering
from src.data.data_loader import ChicagoCrimeDataLoader
from src.data.data_preprocessor import CrimeDataPreprocessor
from src.features.feature_engineering import CrimeFeatureEngineering

# Set plotting style
sns.set(style="whitegrid")
plt.rcParams.update({'font.size': 12})

## 1. Load the Preprocessed Data

First, we'll load the theft crime data that we analyzed in notebook 1.

In [None]:
# Find the most recent data file
# NOTE(review): `data_dir` is resolved against the current working directory,
# whereas the project imports are resolved against its parent ('..') — confirm
# that the CSV files really live in ./data next to this notebook.
data_dir = 'data'
data_files = [
    f for f in os.listdir(data_dir)
    if f.startswith('chicago_theft_data_') and f.endswith('.csv')
]

# Every later cell needs `df`. Stop here with a clear message instead of
# letting the next cell fail with a NameError when no export is present.
if not data_files:
    raise FileNotFoundError(
        f"No 'chicago_theft_data_*.csv' files found in {os.path.abspath(data_dir)!r}"
    )

# "Most recent" is decided by os.path.getctime of each candidate file
latest_file = max(data_files, key=lambda x: os.path.getctime(os.path.join(data_dir, x)))
print(f"Loading data from {latest_file}")

df = pd.read_csv(os.path.join(data_dir, latest_file))

# Convert date to datetime
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])

print(f"Dataset shape: {df.shape}")
print(df.head())

## 2. Feature Engineering

Now we'll apply systematic feature engineering to create meaningful features for our models.

In [None]:
# Initialize feature engineering class
feature_eng = CrimeFeatureEngineering()

# Add temporal features
print("Adding temporal features...")
df_temp = feature_eng.add_temporal_features(df)

# New features = columns present after the call that were not there before
new_temp_features = set(df_temp.columns) - set(df.columns)
print(f"Added {len(new_temp_features)} temporal features:")
print(sorted(new_temp_features))

print(df_temp.shape)

# Preview the data with new features.
# This has to be the last expression of the cell: a bare expression in the
# middle of a cell is evaluated and thrown away without being displayed.
# Columns are sorted so the preview layout is the same on every run.
df_temp[sorted(new_temp_features)].head()

In [None]:
# Add spatial features
print("Adding spatial features...")
df_spatial = feature_eng.add_spatial_features(df_temp)

# New features = columns present after the call that were not there before
new_spatial_features = set(df_spatial.columns) - set(df_temp.columns)
print(f"Added {len(new_spatial_features)} spatial features:")
print(sorted(new_spatial_features))

print(df_spatial.shape)

# Preview the data with new features.
# Kept as the last expression so the notebook actually renders it; columns are
# sorted so the preview layout is the same on every run.
df_spatial[sorted(new_spatial_features)].head()

In [None]:
# Add crime-specific features
print("Adding crime-specific features...")
df_featured = feature_eng.add_crime_specific_features(df_spatial)

# New features = columns present after the call that were not there before
new_crime_features = set(df_featured.columns) - set(df_spatial.columns)
print(f"Added {len(new_crime_features)} crime-specific features:")
print(sorted(new_crime_features))

print(df_featured.shape)

# Preview the data with new features.
# Kept as the last expression so the notebook actually renders it; columns are
# sorted so the preview layout is the same on every run.
df_featured[sorted(new_crime_features)].head()

## 3. Feature Importance Analysis

Let's examine the relationship between our engineered features and the target variable (arrest) using feature importance techniques.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory importance ranking: fit a random forest on every numeric column
# of the engineered frame (missing values replaced by 0) and rank the columns
# by the forest's impurity-based importances.
numeric_cols = df_featured.select_dtypes(include=['number']).columns.tolist()
X_numeric = df_featured[numeric_cols].fillna(0)

if 'arrest' in df_featured.columns:
    # True/False target mapped to 1/0
    y = df_featured['arrest'].map({True: 1, False: 0})

    # The target itself must never be among the predictors
    X_numeric = X_numeric.drop(columns='arrest', errors='ignore')

    # Small forest, fixed seed so the ranking is repeatable
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_numeric, y)

    # One row per feature, most important first
    importances = rf.feature_importances_
    feature_importance = pd.DataFrame(
        {'Feature': X_numeric.columns, 'Importance': importances}
    ).sort_values('Importance', ascending=False)

    # Horizontal bar chart of the 15 highest-ranked features
    plt.figure(figsize=(12, 8))
    sns.barplot(data=feature_importance.head(15), x='Importance', y='Feature')
    plt.gca().set_title('Top 15 Feature Importances for Predicting Arrests')
    plt.tight_layout()
    plt.show()

## 4. Data Preparation for Classification (Arrest Prediction)

Now, let's prepare the data for our classification task of predicting whether an arrest will be made.

In [None]:
# Initialize preprocessor
preprocessor = CrimeDataPreprocessor()

# Preprocess data for classification
# The project preprocessor returns the train/test feature sets, the matching
# targets and the list of feature names. 20% of the rows are requested for the
# test set (test_size=0.2) with a fixed seed (random_state=42).
X_train, X_test, y_train, y_test, feature_names = preprocessor.preprocess_classification_data(
    df_featured, target='arrest', test_size=0.2, random_state=42
)

# Report the split sizes and the relative class frequencies of each target
# (value_counts(normalize=True) prints proportions rather than raw counts).
print("Classification data preparation:")
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"Class distribution in training set: \n{y_train.value_counts(normalize=True)}")
print(f"Class distribution in testing set: \n{y_test.value_counts(normalize=True)}")

### Addressing Class Imbalance

From our EDA, we observed that arrests are very rare (only about 0.3% of cases). Let's address this class imbalance using various techniques.

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
# (everything that is not a numeric dtype is treated as categorical)
categorical_cols = X_train.select_dtypes(exclude=['number']).columns
numeric_cols = X_train.select_dtypes(include=['number']).columns

# One-hot encode all categorical features
# SMOTE interpolates between samples, so it needs a fully numeric matrix.
# NOTE: only X_train is encoded in this cell; X_test is left untouched here.
if len(categorical_cols) > 0:
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_cats = encoder.fit_transform(X_train[categorical_cols])
    
    # Create encoded feature names
    encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
    
    # Create a DataFrame with encoded features
    # (the original index is reused so the concat below lines rows up correctly)
    encoded_df = pd.DataFrame(encoded_cats, columns=encoded_feature_names, index=X_train.index)
    
    # Combine with numeric features
    X_train_encoded = pd.concat([X_train[numeric_cols], encoded_df], axis=1)
else:
    X_train_encoded = X_train.copy()

# Now apply SMOTE on the fully encoded dataset
# A float sampling_strategy is the requested minority/majority ratio after
# resampling (see the imbalanced-learn documentation), here 0.8.
smote = SMOTE(sampling_strategy=0.8, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded, y_train)

# Apply combination of over and under sampling
# Step 1 oversamples the minority class up to a 0.1 ratio; step 2 then
# undersamples the majority class until the ratio reaches 0.8.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=42)
steps = [('over', over), ('under', under)]
pipeline = Pipeline(steps=steps)
X_train_combined, y_train_combined = pipeline.fit_resample(X_train_encoded, y_train)

# Print class distribution after resampling
print("Original class distribution:")
print(Counter(y_train))

print("\nAfter SMOTE oversampling:")
print(Counter(y_train_smote))

print("\nAfter combined over and under sampling:")
print(Counter(y_train_combined))

## 5. Data Preparation for Time Series (Crime Count Prediction)

Now, let's prepare the data for our time series task of predicting crime counts over time.

In [None]:
# Preprocess data for time series (weekly aggregation)
# The project preprocessor returns a dict; the keys read in this notebook are
# 'X_train', 'X_test', 'original_data' and 'dates'.
# seq_length=8 requests input windows of 8 periods (here: weeks).
ts_data_weekly = preprocessor.preprocess_time_series_data(df_featured, freq='W', seq_length=8)

print("Time series data preparation (weekly):")
print(f"X_train shape: {ts_data_weekly['X_train'].shape}")  # [samples, sequence_length, features]
print(f"X_test shape: {ts_data_weekly['X_test'].shape}")

# Also prepare monthly aggregation
# Same call with monthly frequency and windows of 6 periods.
# NOTE(review): if `freq` is forwarded to pandas resampling, recent pandas
# versions prefer the alias 'ME' over 'M' — confirm against the preprocessor.
ts_data_monthly = preprocessor.preprocess_time_series_data(df_featured, freq='M', seq_length=6)

print("\nTime series data preparation (monthly):")
print(f"X_train shape: {ts_data_monthly['X_train'].shape}")
print(f"X_test shape: {ts_data_monthly['X_test'].shape}")

### Visualize the Prepared Time Series Data

In [None]:
# Plot original weekly data
plt.figure(figsize=(14, 6))
plt.plot(ts_data_weekly['original_data']['date'], ts_data_weekly['original_data']['crime_count'])
plt.title('Weekly Theft Crime Counts')
plt.xlabel('Date')
plt.ylabel('Number of Crimes')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Plot train-test split for weekly data
train_size = len(ts_data_weekly['X_train'])
total_size = train_size + len(ts_data_weekly['X_test'])
dates = ts_data_weekly['dates'][:total_size]

# The first `seq_length` observations only ever serve as model input, so the
# plotted targets start after them. The window length is read from X_train
# (laid out as [samples, sequence_length, features]) instead of repeating the
# literal 8, which would silently go stale if the window size is changed.
seq_length = ts_data_weekly['X_train'].shape[1]
original_values = ts_data_weekly['original_data']['crime_count'].values[
    seq_length:seq_length + total_size
]

plt.figure(figsize=(14, 6))
plt.plot(dates[:train_size], original_values[:train_size], label='Training Data')
plt.plot(dates[train_size:], original_values[train_size:], label='Testing Data')
plt.title('Train-Test Split for Weekly Crime Counts')
plt.xlabel('Date')
plt.ylabel('Number of Crimes')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Initial Model Setup

Let's set up our initial models for both tasks.

### 6.1 Classification Models (Arrest Prediction)

In [None]:

def align_features(X_train, X_test):
    """
    Align features between training and test datasets

    Only the columns present in both frames are kept. The shared columns are
    returned in the order in which they appear in ``X_train`` so that the
    feature order (and therefore any model fitted on it) is identical on every
    run; building the list from a set intersection would make the order depend
    on string hashing.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Training features
    X_test : pandas DataFrame
        Test features

    Returns:
    --------
    X_train_aligned, X_test_aligned : aligned DataFrames
        Both restricted to the shared columns, in training-set column order.
    """
    train_columns = set(X_train.columns)
    test_columns = set(X_test.columns)

    # dict.fromkeys drops repeated labels while preserving first-seen order
    common_features = [
        col for col in dict.fromkeys(X_train.columns) if col in test_columns
    ]

    # Print out missing features for debugging (in column order)
    train_missing = [
        col for col in dict.fromkeys(X_train.columns) if col not in test_columns
    ]
    test_missing = [
        col for col in dict.fromkeys(X_test.columns) if col not in train_columns
    ]

    if train_missing:
        print("Features in training set but not in test set:")
        print(train_missing)

    if test_missing:
        print("Features in test set but not in training set:")
        print(test_missing)

    # Align DataFrames
    X_train_aligned = X_train[common_features]
    X_test_aligned = X_test[common_features]

    return X_train_aligned, X_test_aligned
# Encode the test set with the encoder that was fitted on the training data.
# X_train_combined is one-hot encoded, whereas X_test still carries the raw
# categorical columns; aligning them as-is would intersect away every encoded
# feature and leave the models with the numeric columns only.
if len(categorical_cols) > 0:
    encoded_test_cats = pd.DataFrame(
        encoder.transform(X_test[categorical_cols]),
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X_test.index,
    )
    X_test_encoded = pd.concat([X_test[numeric_cols], encoded_test_cats], axis=1)
else:
    X_test_encoded = X_test.copy()

# Align features first
X_train_combined_aligned, X_test_aligned = align_features(X_train_combined, X_test_encoded)

print("Aligned training features shape:", X_train_combined_aligned.shape)
print("Aligned test features shape:", X_test_aligned.shape)

# Verify alignment
print("\nTraining features:", X_train_combined_aligned.columns.tolist())
print("\nTest features:", X_test_aligned.columns.tolist())

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

# Two baseline tree ensembles, both seeded for repeatable results
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=None, min_samples_split=2, random_state=42
)
gb_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)

# Fit on the resampled, aligned training features
rf_model.fit(X_train_combined_aligned, y_train_combined)
gb_model.fit(X_train_combined_aligned, y_train_combined)

# Hard labels for the reports, positive-class probabilities for ROC AUC
rf_pred = rf_model.predict(X_test_aligned)
rf_proba = rf_model.predict_proba(X_test_aligned)[:, 1]
gb_pred = gb_model.predict(X_test_aligned)
gb_proba = gb_model.predict_proba(X_test_aligned)[:, 1]

# Per-model text evaluation
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, rf_pred))
print(f"ROC AUC: {roc_auc_score(y_test, rf_proba):.4f}")

print("\nGradient Boosting - Classification Report:")
print(classification_report(y_test, gb_pred))
print(f"ROC AUC: {roc_auc_score(y_test, gb_proba):.4f}")

# Side-by-side confusion matrices, one panel per model
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

for panel_ax, panel_pred, panel_title in (
    (ax1, rf_pred, 'Random Forest Confusion Matrix'),
    (ax2, gb_pred, 'Gradient Boosting Confusion Matrix'),
):
    sns.heatmap(
        confusion_matrix(y_test, panel_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted Label')
    panel_ax.set_ylabel('True Label')

plt.tight_layout()
plt.show()

### 6.2 Feature Importance from the Best Model

In [None]:
# The models were fitted on the aligned training frame, so its columns are the
# authoritative feature names for the importances below.
feature_names = X_train_combined_aligned.columns.tolist()

# Importances as reported by the fitted Gradient Boosting model
gb_importances = gb_model.feature_importances_

# One row per feature, most important first
gb_feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': gb_importances}
).sort_values('Importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(12, 8))
sns.barplot(data=gb_feature_importance.head(15), x='Importance', y='Feature')
plt.gca().set(
    title='Top 15 Feature Importances (Gradient Boosting)',
    xlabel='Feature Importance',
    ylabel='Features',
)
plt.tight_layout()
plt.show()

# Same ranking as text, for reference alongside the chart
print("Top 15 Most Important Features:")
print(gb_feature_importance.head(15))

### 6.3 Setup for Deep Learning Models

Let's prepare the structures for our deep learning models.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Set random seed for reproducibility
# (seeds PyTorch's random number generator; this also fixes the initial
# weights of the models created below)
torch.manual_seed(42)

# Check if GPU is available
# Falls back to the CPU when CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# MLP model architecture for classification
class MLPClassifier(nn.Module):
    """Feed-forward binary classifier.

    The network is a stack of ``Linear -> ReLU -> BatchNorm1d -> Dropout``
    stages, one per entry in ``hidden_dims``, followed by a single-unit
    ``Linear`` layer and a ``Sigmoid``. The output is therefore a value in
    [0, 1] of shape ``(batch, 1)``.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dims : sequence of int, default (128, 64)
        Width of each hidden stage, in order. Must not be empty.
    dropout_rate : float, default 0.3
        Dropout probability applied after every hidden stage.

    Raises
    ------
    ValueError
        If ``hidden_dims`` is empty.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64), dropout_rate=0.3):
        super().__init__()

        # Immutable default plus a local copy: the caller's sequence is never
        # shared with, or modified by, the model.
        hidden_dims = list(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width")

        self.layers = nn.ModuleList()

        # Consecutive (in, out) width pairs: input -> h0 -> h1 -> ...
        widths = [input_dim] + hidden_dims
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            self.layers.append(nn.Linear(in_features, out_features))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.BatchNorm1d(out_features))
            self.layers.append(nn.Dropout(dropout_rate))

        # Output layer: one unit squashed to a probability
        self.layers.append(nn.Linear(hidden_dims[-1], 1))
        self.layers.append(nn.Sigmoid())

    def forward(self, x):
        """Apply every layer in order and return the sigmoid output."""
        for layer in self.layers:
            x = layer(x)
        return x

# LSTM model architecture for time series
class LSTMPredictor(nn.Module):
    """Sequence-to-one regressor: stacked LSTM followed by a linear head.

    Parameters
    ----------
    input_size : int, default 1
        Number of features per time step.
    hidden_size : int, default 64
        Size of the LSTM hidden state.
    num_layers : int, default 2
        Number of stacked LSTM layers.
    dropout : float, default 0.2
        Dropout passed to ``nn.LSTM``.

    The input is expected batch-first, ``(batch, seq_length, input_size)``
    (``batch_first=True``); the output has shape ``(batch, 1)``.
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )

        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the LSTM over ``x`` and regress from the last time step."""
        # nn.LSTM starts from all-zero hidden and cell states when none are
        # supplied. Relying on that default, instead of building float32 zero
        # tensors by hand, keeps the states on the input's device and dtype,
        # so the model also works after .double() or .half().
        out, _ = self.lstm(x)  # out: batch_size, seq_length, hidden_size

        # Decode the hidden state of the last time step
        return self.fc(out[:, -1, :])

# Print model architectures
# The MLP input width must match the feature matrix the classifiers are
# actually trained on: the encoded, aligned frame. The raw X_train still has
# its pre-encoding column count, which would produce a shape mismatch as soon
# as the network is fed the prepared data.
input_dim = X_train_combined_aligned.shape[1]
mlp_model = MLPClassifier(input_dim=input_dim).to(device)
print("MLP Model Architecture:")
print(mlp_model)

# Default LSTM configuration (one input feature per time step)
lstm_model = LSTMPredictor().to(device)
print("\nLSTM Model Architecture:")
print(lstm_model)

## 7. Save Prepared Data for Modeling

Finally, let's save the prepared data for use in our next notebook focused on model training and evaluation.

In [None]:
import joblib

# Create directory for processed data
processed_dir = 'data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Take the column list straight from the aligned training frame so that this
# cell does not depend on an earlier cell having reassigned `feature_names`.
aligned_feature_names = X_train_combined_aligned.columns.tolist()

# Save classification data
# Train and test matrices are stored with the same (aligned) columns so a
# model fitted on 'X_train' can be applied to 'X_test' without re-aligning.
# NOTE(review): 'X_train'/'y_train' hold the resampled training data, the same
# objects as the '*_combined' entries; the un-resampled split is not saved.
# Confirm whether the next notebook needs it.
classification_data = {
    'X_train': X_train_combined_aligned,
    'X_test': X_test_aligned,
    'y_train': y_train_combined,
    'y_test': y_test,
    'X_train_combined': X_train_combined_aligned,
    'y_train_combined': y_train_combined,
    'feature_names': aligned_feature_names
}

classification_path = os.path.join(processed_dir, 'classification_data.joblib')
joblib.dump(classification_data, classification_path)
print(f"Classification data saved to {classification_path}")

# Save time series data
weekly_path = os.path.join(processed_dir, 'time_series_weekly_data.joblib')
joblib.dump(ts_data_weekly, weekly_path)
print(f"Weekly time series data saved to {weekly_path}")

monthly_path = os.path.join(processed_dir, 'time_series_monthly_data.joblib')
joblib.dump(ts_data_monthly, monthly_path)
print(f"Monthly time series data saved to {monthly_path}")

## 8. Summary of Findings and Next Steps

In this notebook, we have:

1. Conducted feature engineering to create meaningful features for our models
2. Analyzed feature importance to understand which factors most strongly influence arrests
3. Prepared data for both classification (arrest prediction) and time series (crime count prediction) tasks
4. Addressed the class imbalance issue through resampling techniques
5. Set up initial machine learning models and evaluated their performance
6. Prepared the architecture for deep learning models

Key insights so far:

- We've confirmed the severe class imbalance in the arrest data (only about 0.3% of theft cases result in arrests)
- **TODO:** The most important features for predicting arrests appear to be related to [to be filled in from the feature-importance results in sections 3 and 6.2]
- **TODO:** Our initial machine learning models achieved [to be filled in from the classification reports and ROC AUC scores in section 6.1]

Next steps in notebook 3:

1. Train and optimize our deep learning models (MLP for classification, LSTM for time series)
2. Perform hyperparameter tuning for all models to improve performance
3. Compare the performance of traditional machine learning vs. deep learning approaches
4. Conduct in-depth error analysis to understand model limitations
5. Develop final predictions and create visualizations to communicate results
