## Crime Prediction using Random Forest Classifier

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

# Read the CSV dataset into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='A_train_balanced.csv')
df.head(n=5)


## Train-Test splitting

In [None]:

# Predictor columns and the target label
feature_columns = ['Day_Shift', 'Place']
y = df['Crime_Occurred']

# One-hot encode any categorical predictors, dropping the first level of each
X = pd.get_dummies(df[feature_columns], drop_first=True)

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)


## looking for correlations

In [None]:

# Pairwise correlations between the encoded training features and the target
corr = X_train.join(y_train).corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()


## Checking the target class distribution

In [None]:

# Bar chart of how many training rows fall into each target class
class_counts = y_train.value_counts()
class_counts.plot.bar()
plt.title('Target Class Distribution')
plt.show()


## Training the Random Forest model

In [None]:

# Fit a 100-tree Random Forest classifier on the training split (seeded for repeatability)
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)


## Evaluating predictions on the validation set

In [None]:

# Predict the held-out validation rows, then report per-class metrics and overall accuracy
y_pred = rf.predict(X_val)

report = classification_report(y_val, y_pred)
print("Classification Report:\n", report)

accuracy = accuracy_score(y_val, y_pred)
print("Accuracy Score:", accuracy)


## evaluating the model

In [None]:

# Confusion matrix of validation predictions.
# scikit-learn lays the matrix out with true classes on the rows and predicted
# classes on the columns, so the axes are labelled accordingly; without labels
# the reader cannot tell false positives from false negatives.
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()


## using better evaluation technique - cross validation

In [None]:

# 5-fold cross-validation of the same estimator configuration over the full X, y
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)
mean_cv_accuracy = cv_scores.mean()

print("Cross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", mean_cv_accuracy)


## Feature importance and ROC curve (validation set)

In [None]:

# Feature Importance: one horizontal bar per encoded feature column
feature_names = X.columns
importances = rf.feature_importances_

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(y=feature_names, x=importances, palette="viridis", ax=ax)
ax.set_title("Feature Importance - Random Forest")
plt.show()

# ROC Curve (binary target assumed: only the second probability column is used)
# predict_proba columns follow rf.classes_, so column 1 holds the scores for
# rf.classes_[1]; that class is therefore the positive class of the curve.
positive_class = rf.classes_[1]
y_probs = rf.predict_proba(X_val)[:, 1]
# Pass the positive class explicitly: roc_curve can only infer it when the labels
# are {0, 1} or {-1, 1} and raises otherwise (e.g. for string labels). For 0/1
# labels this is identical to the default behaviour.
fpr, tpr, _ = roc_curve(y_val, y_probs, pos_label=positive_class)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: the ROC of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.legend(loc="lower right")
plt.show()


## using the model

In [None]:

# Example Prediction: draw one encoded row (seeded) and classify it with the fitted forest
sample = X.sample(n=1, random_state=42)
print("Sample Data:", sample)

predicted_label = rf.predict(sample)
print("Predicted Crime Occurrence:", predicted_label)
