In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
import sqlite3
import os

# Connect to the SQLite database
conn = sqlite3.connect('..\db\incidents.db')

# Load data from the database
query = "SELECT * FROM incidents WHERE incident_datetime >= '2009-01-01'"
df = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

# Preprocess the data
df['incident_datetime'] = pd.to_datetime(df['incident_datetime'])
df['day_of_week'] = df['incident_datetime'].dt.dayofweek
df['hour'] = df['incident_datetime'].dt.hour

# Prepare features and target
X = df[['day_of_week', 'hour', 'neighborhood']]
y = df['incident_type_primary']

# Encode categorical variables
le_neighborhood = LabelEncoder()
le_incident_type = LabelEncoder()

X['neighborhood'] = le_neighborhood.fit_transform(X['neighborhood'])
y = le_incident_type.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get the current working directory
current_dir = os.getcwd()

# Create the 'data/risk' folder if it doesn't exist
risk_directory = os.path.join(current_dir, '..', 'data', 'risk')
if not os.path.exists(risk_directory):
    os.makedirs(risk_directory)

# Save the model and label encoders in the 'data/risk' folder
crime_risk_model_path = os.path.join(risk_directory, 'crime_risk_model.pkl')
joblib.dump(rf, crime_risk_model_path)

le_neighborhood_path = os.path.join(risk_directory, 'le_neighborhood.pkl')
joblib.dump(le_neighborhood, le_neighborhood_path)

le_incident_type_path = os.path.join(risk_directory, 'le_incident_type.pkl')
joblib.dump(le_incident_type, le_incident_type_path)
print("Model and encoders saved successfully.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['neighborhood'] = le_neighborhood.fit_transform(X['neighborhood'])


Model and encoders saved successfully.
