# 1. Data cleaning 

In [37]:
import warnings
import itertools

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from torch.utils.data import TensorDataset

# import statsmodels.api as sm
from sklearn.metrics import r2_score

#########################################################################################################
df = pd.read_csv("archive-1/crime_60_100.csv")

numeric_cols = ['violent_crime', 'murder', 'rape', 'robbery', 'population'] 

for col in numeric_cols:
    # if col in numeric_cols:
        df[col] = df[col].astype(str).str.replace(',', '')  
        df[col] = pd.to_numeric(df[col], errors='coerce')  

df_clean = df.dropna(subset=numeric_cols)

# verify cleaned data
print("Cleaned Data Types:")
print(df_clean[numeric_cols].dtypes)
print("\nCleaned DataFrame Shape:", df_clean.shape)

Cleaned Data Types:
violent_crime    float64
murder           float64
rape             float64
robbery          float64
population         int64
dtype: object

Cleaned DataFrame Shape: (214, 12)


# 2. Data processing 

In [11]:
# Calculate per capita crime rates 
features = df_clean[['population', 'states']]
targets = df_clean[['violent_crime', 'murder', 'rape', 'robbery']].div(df_clean['population'], axis=0)

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(
#     features, targets, test_size=0.2, random_state=42
# )

# Preprocessing
preprocessor = ColumnTransformer([
    ('state_encoder', OneHotEncoder(handle_unknown='ignore'), ['states'])
], remainder='passthrough')

X_processed = preprocessor.fit_transform(features) 
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, targets.values, test_size=0.2, random_state=42
)

# convert to tensors
X_train_tensor = torch.FloatTensor(X_train.toarray() if hasattr(X_train, 'toarray') else X_train)
X_test_tensor = torch.FloatTensor(X_test.toarray() if hasattr(X_test, 'toarray') else X_test)
y_train_tensor = torch.FloatTensor(y_train)
y_test_tensor = torch.FloatTensor(y_test)

# 3. PyTorch Model definition¶

In [51]:
class CrimePredictor(nn.Module):
    def __init__(self, input_size):
        super(CrimePredictor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 64), # linear @params
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(), 
            nn.Linear(32, 4) # we want 4 output features
        )
    def forward(self, x):
        return self.net(x)

# 4. Model setup

In [52]:
# get input size from preprocessed data
# input_size = X_train_tensor.shape[1]
# model = CrimePredictor(input_size)
model = CrimePredictor(input_size=X_train_tensor.shape[1]) 

criterion = nn.L1Loss()  # MAE loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# create dataloader
class CrimeDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
        
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 5. Training loop

In [56]:
num_epochs = 200
best_test_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)

    # validate
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor) 
    train_loss = train_loss / len(train_loader.dataset) 

    # early stop check
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), 'best_model.pth')

    if (epoch+1) % 20 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}')  
        print(f'Train MAE: {train_loss:.4f} | Test MAE: {test_loss:.4f}')  

Epoch 20/200
Train MAE: 2.5104 | Test MAE: 1.6287
Epoch 40/200
Train MAE: 1.4737 | Test MAE: 1.6058
Epoch 60/200
Train MAE: 0.9660 | Test MAE: 0.6373
Epoch 80/200
Train MAE: 0.3106 | Test MAE: 0.3872
Epoch 100/200
Train MAE: 0.2913 | Test MAE: 0.2509
Epoch 120/200
Train MAE: 0.1401 | Test MAE: 0.0517
Epoch 140/200
Train MAE: 0.0419 | Test MAE: 0.0448
Epoch 160/200
Train MAE: 0.0500 | Test MAE: 0.0066
Epoch 180/200
Train MAE: 0.0658 | Test MAE: 0.0050
Epoch 200/200
Train MAE: 0.1299 | Test MAE: 0.0087


# 6. Evaluation

In [55]:
# #  Load best model
# model.load_state_dict(torch.load('best_model.pth'))
# model.eval()

# with torch.no_grad():
#     train_pred = model(X_train_tensor).numpy()
#     test_pred = model(X_test_tensor).numpy()

# print(f"\nFinal Train MAE: {mean_absolute_error(y_train, train_pred):.4f}")
# print(f"Final Test MAE: {mean_absolute_error(y_test, test_pred):.4f}")


# # # use for predicting 
# # def predict_crime_rates(population: int, state: str):
# #     input_df = pd.DataFrame([[population, state]], columns=['population', 'states'])
# #     processed = preprocessor.transform(input_df)
# #     input_tensor = torch.FloatTensor(processed)
# #     with torch.no_grad():
# #         prediction = model(input_tensor).numpy()[0]
# #     return {
# #         'violent_crime_rate': prediction[0],
# #         'murder_rate': prediction[1],
# #         'rape_rate': prediction[2],
# #         'robbery_rate': prediction[3]
# #     }