In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
# Load the raw training data.
train_df = pd.read_csv("train.csv")

# Step 1: Data preprocessing
# Swap the string class labels for integer codes. The fitted encoder is kept in
# `label_encoder` so the codes can be mapped back with inverse_transform().
label_encoder = LabelEncoder()
train_df = train_df.assign(
    Crime_Category=label_encoder.fit_transform(train_df['Crime_Category'])
)

# Target vector, then the feature matrix (every column except the target).
y = train_df['Crime_Category']
X = train_df.drop('Crime_Category', axis=1)

# Step 2: Split data into training and test sets
# The split happens BEFORE any imputation statistic, encoder or scaler is
# fitted: fitting them on the full frame would leak information from the
# held-out rows into the training features and inflate the test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the column assignments below never write into a view of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Split features into numerical and categorical columns
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Handle missing values: medians are learned from the training rows only and
# then reused for the test rows.
train_medians = X_train[numerical_columns].median()
X_train[numerical_columns] = X_train[numerical_columns].fillna(train_medians)
X_test[numerical_columns] = X_test[numerical_columns].fillna(train_medians)
X_train[categorical_columns] = X_train[categorical_columns].fillna('Unknown')
X_test[categorical_columns] = X_test[categorical_columns].fillna('Unknown')

# Convert categorical features to numeric
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # le.transform() raises on categories it has not seen, which can happen for
    # values that occur only in the test rows. pd.Categorical gives known
    # categories their position in le.classes_ (the same code le.transform
    # would produce) and maps unseen ones to -1.
    X_test[col] = pd.Categorical(X_test[col], categories=le.classes_).codes.astype('int64')
    label_encoders[col] = le

# Scale numerical features (fit on train, apply to both). Skipped when there
# are no numeric columns, because StandardScaler rejects a zero-column input.
scaler = StandardScaler()
if len(numerical_columns) > 0:
    X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
    X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# NOTE(review): the cross-validation folds drawn from X_train later still share
# these training-set statistics; moving the preprocessing into a Pipeline would
# remove that smaller leak as well.

# Step 3: Configure and train the XGBoost classifier
# GridSearchCV (5-fold cross-validation, scored by accuracy) picks the number
# of boosted trees and the maximum tree depth from the grid below.
param_grid = {
    'n_estimators': [42, 44, 46, 48],
    'max_depth': [8, 9, 10],
}
# The seed is pinned so repeated runs of the search are comparable.
grid_search = GridSearchCV(XGBClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Show what the search selected; otherwise the tuning result stays hidden
# inside the fitted object.
print("Best parameters:", grid_search.best_params_)
print(f"Best CV accuracy: {grid_search.best_score_:.4f}")

# predict() uses the best estimator, which GridSearchCV refits on all of
# X_train by default (refit=True).
y_pred = grid_search.predict(X_test)

# Report with the original class names instead of the integer codes produced by
# `label_encoder`. Passing the full list of codes as `labels` keeps the report
# and the matrix the same shape even if a rare class is absent from the test
# split (target_names alone would then raise a length-mismatch error).
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

# Classification report
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,  # classes with no predictions/support score 0 instead of warning
))

# Confusion matrix (rows = true class, columns = predicted class, in class_ids order)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Labelled percentage rather than a bare fraction.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3%}")

Accuracy: 91.475%