In [3]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import joblib
import os


In [4]:
# Establish a connection to the SQLite database
conn = sqlite3.connect('../db/incidents.db')
# Load data
query = "SELECT * FROM incidents"
df = pd.read_sql_query(query, conn)
conn.close()

In [5]:
# Preprocess data
df['incident_datetime'] = pd.to_datetime(df['incident_datetime'])
df['hour'] = df['incident_datetime'].dt.hour
df['day_of_week'] = df['incident_datetime'].dt.dayofweek
df['month'] = df['incident_datetime'].dt.month

  df['incident_datetime'] = pd.to_datetime(df['incident_datetime'])


In [6]:
# Filter columns
df_filtered = df[['neighborhood', 'incident_type_primary', 'hour', 'day_of_week', 'month']]
df_filtered.dropna(inplace=True)  # Ensure no missing values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.dropna(inplace=True)  # Ensure no missing values


In [7]:
current_dir = os.getcwd()
# Directory to save models
model_dir = os.path.join(current_dir, '..', 'data', 'trends')
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [8]:
# Train a model for each neighborhood
neighborhoods = df_filtered['neighborhood'].unique()
for neighborhood in neighborhoods:
    data = df_filtered[df_filtered['neighborhood'] == neighborhood]
    X = data[['hour', 'day_of_week', 'month']]
    y = data['incident_type_primary']  # Keep as string

    if len(np.unique(y)) > 1:  # Ensure there is more than one class
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train SVM
        svm_model = SVC(probability=True, random_state=42)
        svm_model.fit(X_train_scaled, y_train)

        # Save model and scaler
        joblib.dump(svm_model, os.path.join(model_dir, f'svm_{neighborhood}.pkl'))
        joblib.dump(scaler, os.path.join(model_dir, f'scaler_{neighborhood}.pkl'))