In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Data preprocessing with Label encoding
X = pd.read_csv("train.csv")
y = X['Crime_Category']
X = X.drop('Crime_Category', axis=1)

# Remove column with 80% null values
X.drop('Cross_Street', axis=1, inplace=True)

# Handle missing data
X['Victim_Sex'] = X['Victim_Sex'].replace(['H', 'X'], 'Unknown')
X['Victim_Descent'] = X['Victim_Descent'].fillna('Unknown')
X['Weapon_Description'] = X['Weapon_Description'].fillna('No Weapon')
X['Weapon_Used_Code'] = X['Weapon_Used_Code'].fillna(0)  # Weapon_Used_Code is in the range [1,3990], 0 is for missing code
X['Modus_Operandi'] = X['Modus_Operandi'].fillna('Unknown')

# Date processing
X['Date_Reported'] = pd.to_datetime(X['Date_Reported'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
X['Date_Occurred'] = pd.to_datetime(X['Date_Occurred'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
X['Year_Reported'] = X.Date_Reported.dt.year
X['Year_Occurred'] = X.Date_Occurred.dt.year
X['Month_Reported'] = X.Date_Reported.dt.month
X['Month_Occurred'] = X.Date_Occurred.dt.month
X['Day_Reported'] = X.Date_Reported.dt.day
X['Day_Occurred'] = X.Date_Occurred.dt.day
X.drop(['Date_Reported', 'Date_Occurred'], axis=1, inplace=True)

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
categorical_columns = [col for col in categorical_columns if col != 'Modus_Operandi']

numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

def label_encoding_column(df, column):
    lab_encoder = LabelEncoder()
    df[column] = lab_encoder.fit_transform(df[column])
    return df

for col in categorical_columns:
  X = label_encoding_column(X, col)

modus_operandi_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    CountVectorizer(preprocessor=lambda x:x[0])
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('modus_operandi', modus_operandi_pipeline, ['Modus_Operandi'])
  ])

In [3]:
# full pipeline
pipe = make_pipeline(preprocessor)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = pipe.transform(X_test)

# label encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [9]:
# convert from sparse to dense matrices
X_train = X_train.toarray()
X_test = X_test.toarray()

# convert all the data in pytorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

# data loader for training and testing
training_data = TensorDataset(X_train_tensor, y_train_tensor)
testing_data = TensorDataset(X_test_tensor, y_test_tensor)

training_loader = DataLoader(training_data, batch_size=64, shuffle=True)
testing_loader = DataLoader(testing_data, batch_size=64, shuffle=False)

In [10]:
# define the neural network
class CrimeCategoryNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=6):
        super(CrimeCategoryNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.2)

        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.dropout2 = nn.Dropout(p=0.3)

        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # first layer
        x = self.dropout1(self.relu(self.bn1(self.fc1(x))))
        # second layer
        x = self.dropout2(self.relu(self.bn2(self.fc2(x))))
        # output layer
        x = self.fc3(x)
        return x

# number of features in the training set
input_dim = X_train.shape[1]
model = CrimeCategoryNN(input_dim=input_dim)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
# training the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in training_loader:
        # zero the gradients
        optimizer.zero_grad()
        # forward pass
        outputs = model(inputs)
        # compute loss
        loss = criterion(outputs, labels)
        # backward pass
        loss.backward()
        # update weights
        optimizer.step()

        running_loss += loss.item()
    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(training_loader):.4f}')

Epoch 1, Loss: 0.5047
Epoch 2, Loss: 0.2097
Epoch 3, Loss: 0.1774
Epoch 4, Loss: 0.1636
Epoch 5, Loss: 0.1500
Epoch 6, Loss: 0.1422
Epoch 7, Loss: 0.1328
Epoch 8, Loss: 0.1267
Epoch 9, Loss: 0.1225
Epoch 10, Loss: 0.1124


In [12]:
# evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in testing_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy on the test images: {100 * correct / total:.2f}%')

Accuracy on the test images: 94.80%
