# Crime Prediction using Random Forest
Updated with MSE, RMSE, and Cross-Validation

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

# Load the dataset from A_train_balanced.csv, remove the columns that are not
# used as model inputs, and keep only the rows that have a target label.
unused_columns = [
    'CCN',
    'REPORT_DAT',
    'OFFENSE',
    'BLOCK',
    'VOTING_PRECINCT',
    'BID',
    'START_DATE',
    'END_DATE',
    'OBJECTID',
    'OCTO_RECORD_ID',
]
df = (
    pd.read_csv('A_train_balanced.csv')
    .drop(columns=unused_columns)
    .dropna(subset=['crime_label'])  # rows without a target cannot be used
)

# Separate the target column from the candidate feature columns
target = df['crime_label']
features = df.drop(columns=['crime_label'])

# Numeric features: fill missing values with the per-column median
numeric_medians = features.median(numeric_only=True)
features_numeric = (
    features
    .select_dtypes(include=[np.number])
    .fillna(numeric_medians)
)

# Object-dtype features: one-hot encode, dropping the first level of each
# column (drop_first=True) so the indicator columns are not redundant
categorical_cols = features.select_dtypes(include=['object']).columns
features_categorical = pd.get_dummies(
    features[categorical_cols],
    drop_first=True,
)

# Final design matrix: numeric columns followed by the indicator columns
features_final = pd.concat(
    [features_numeric, features_categorical],
    axis=1,
)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    features_final,
    target,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training portion.
# fit() returns the estimator itself, so `rf` is the fitted model.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out validation rows
y_pred = rf.predict(X_val)

# Metrics
# Accuracy and the per-class report are computed first: they work for any
# label type and are the primary measures for a classifier.
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

# MSE/RMSE treat the class labels as numbers. That is only computable when
# the labels are numeric (mean_squared_error raises on string labels), and the
# value is only meaningful if the label codes are ordered.
# NOTE(review): the dtype of crime_label is not visible here -- confirm the
# labels are ordinal before relying on these two figures.
if pd.api.types.is_numeric_dtype(y_val):
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
else:
    # Keep the names defined so the report below (and any later code that
    # reads them) still runs; NaN marks "not applicable".
    mse = rmse = np.nan

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Accuracy Score:", accuracy)
print("Classification Report:\n", report)

# Cross-validation
# 5-fold accuracy of the same random-forest configuration, evaluated over the
# full feature matrix and target rather than only the training split.
cv_scores = cross_val_score(
    estimator=rf,
    X=features_final,
    y=target,
    cv=5,
    scoring='accuracy',
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation CV Accuracy:", cv_scores.std())
