In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Logging in first lets the download in the next cell access private Kaggle
# data sources.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition files. The result is kept in a *_path variable --
# presumably the local directory that holds the downloaded data (confirm
# against the kagglehub documentation).
crime_cast_forecasting_crime_categories_path = kagglehub.competition_download('crime-cast-forecasting-crime-categories')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields (directory, sub-directories, files) for every directory below
# /kaggle/input; the sub-directory list is not needed here, hence the `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing Libraries

In [None]:
# Basic Libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
plt.style.use("seaborn-v0_8-notebook")
plt.rcParams["figure.figsize"] = (10, 6)

# Preprocessing and Imputation required Libraries

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA


# Importing Models

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Importing Model selection libraries

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Importing metrics

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score,roc_auc_score,log_loss,roc_curve,auc
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
import warnings

# Suppress all warnings
# NOTE(review): this is a global filter, so it also hides scikit-learn
# convergence and deprecation warnings raised by the modelling cells below.
warnings.filterwarnings('ignore')


# Columns Description:
* Location: Street address of the crime incident.
* Cross_Street: Cross street of the rounded address.
* Latitude: Latitude coordinates of the crime incident.
* Longitude: Longitude coordinates of the crime incident.
* Date_Reported: Date the incident was reported.
* Date_Occurred: Date the incident occurred.
* Time_Occurred: Time the incident occurred in 24-hour military time.
* Area_ID: LAPD's Geographic Area number.
* Area_Name: Name designation of the LAPD Geographic Area.
* Reporting_District_no: Reporting district number.
* Part 1-2: Crime classification.
* Modus_Operandi: Activities associated with the suspect.
* Victim_Age: Age of the victim.
* Victim_Sex: Gender of the victim.
* Victim_Descent: Descent code of the victim.
* Premise_Code: Premise code indicating the location of the crime.
* Premise_Description: Description of the premise code.
* Weapon_Used_Code: Weapon code indicating the type of weapon used.
* Weapon_Description: Description of the weapon code.
* Status: Status of the case.
* Status_Description: Description of the status code.
* Crime_Category: The category of the crime (Target Variable)

In [None]:
# Seed NumPy's global random number generator so that every step drawing
# from it produces the same numbers on each run.
np.random.seed(219)

# Loading Data

In [None]:
# Directory holding the competition CSV files.
# Prefer the location reported by the kagglehub download cell, so the notebook
# also works outside Kaggle. If that cell was deleted or never run (its own
# comment allows deleting it), fall back to the directory Kaggle mounts
# competition data under.
try:
    DATA_DIR = crime_cast_forecasting_crime_categories_path
except NameError:
    DATA_DIR = '/kaggle/input/crime-cast-forecasting-crime-categories'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')  # labelled records (includes Crime_Category)
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')    # records to predict
sample = pd.read_csv(f'{DATA_DIR}/sample.csv')   # sample file shipped with the competition

# Exploratory Data Analysis(EDA)

In [None]:
# Preview the first five records of the training data frame
# (DataFrame.head() defaults to five rows).
train_df.head()

In [None]:
# Shape of the training data frame as (number of rows, number of columns)
train_df.shape

In [None]:
# Column names, non-null counts and dtypes of the training data frame
train_df.info()

Using the output of the ***train_df.info()*** method and the feature descriptions provided by the competition, the features can be segmented as follows:

**Categorical Features**:
{Location, Cross_Street, Area_ID, Area_Name, Reporting_District_no,
Part 1-2,  Modus_Operandi,  Victim_Sex, Victim_Descent,  Premise_Code,  Premise_Description, Weapon_Used_Code, Weapon_Description, Status, Status_Description, Crime_Category}

**Numerical Features**: {Latitude, Longitude, Victim_Age, Time_Occurred}

**Observations**

1. In the above list there are some features which are containing float values like : Reporting_District_no, Part 1-2, Area_ID, Weapon_Used_Code.
   But these features are still categorical in nature because of the given description.
  
2. Also, some of the features are redundant because they provide the same information.

Example: Latitude and Longitude can actually help in providing the information about particular Location, but the same information can be found in Location and Cross_Street fields also.

3. The Target Variable 'Crime Category' is  categorical in nature, thus a classification problem.


In [None]:
# Number of missing (NaN) values in each column of train_df

train_df.isna().sum()

**Observation**:  The below features have more than 50% null values in the training dataset.

1. Cross_Street

2. Weapon_Used_Code

3. Weapon_Description  

In [None]:
# Distinct class labels present in the target variable

train_df['Crime_Category'].unique()

In [None]:
# Ask scikit-learn how it classifies the target vector (the string returned by
# type_of_target, e.g. binary or multiclass, is printed below).

from sklearn.utils.multiclass import type_of_target

y= train_df['Crime_Category']

print(type_of_target(y))

In [None]:
# Count plot of the target variable: one bar per crime category, bar height is
# the number of training records in that category.

sns.countplot(data=train_df, x="Crime_Category")

plt.xticks(rotation=45)  # tilt the category names so they stay readable

plt.show()

In [None]:
# Count plot of crime categories per police area: one group of bars per
# Area_Name, one bar colour per Crime_Category.

sns.countplot(data=train_df, x='Area_Name', hue='Crime_Category')

plt.title('Breakdown of Crime distribution across different Areas')

plt.xticks(rotation=45, ha='right')  # slanted, right-aligned area names

plt.show()

**Observation**

1. Property Crime seems to be the most frequent type of crime committed across all 21 areas, with the Pacific area having the highest number of crimes of any area.

2. Foothill and Hollenbeck have some of the lowest numbers of crimes of any area.

In [None]:
# Count plot of crime categories within each value of the "Part 1-2" feature.

sns.countplot(data=train_df, x='Part 1-2', hue='Crime_Category')

plt.title('Crime Category distribution by Part 1-2')

plt.xticks(rotation=45, ha='right')

plt.show()

**Observation**:
1. The above plot provides an inference about the relation between Part 1-2 feature and the Target Variable.

2. In Part 1-2 feature the crimes which have been assigned with class 2 contain all the categories of crime.

3. While the crimes assigned to class 1 contain property,violent and crimes against public order; with property crimes being the predominant crime category.

**Imputation**

In [None]:
# A missing Weapon_Used_Code is replaced by the constant 0, in the training
# and the test data frame alike.

for frame in (train_df, test_df):
    frame['Weapon_Used_Code'] = frame['Weapon_Used_Code'].fillna(0)


In [None]:
# Pairwise relationships between the numerical columns of the training data:
# scatter plots off the diagonal, kernel density estimates on the diagonal.

sns.pairplot(train_df, kind='scatter', diag_kind='kde')
plt.show()

**Observations**

1. The above pair plot displays a high density amongst most of the numerical feature variables and lot of sub populations are revealed within some of the features like Part 1-2.

2. While the density/histogram plots presented on the diagonal line suggest that the feature variables might belong to Gaussian family of distributions.

3. Most pairplots between the features hint at a potential redundancy indicating a low variance, which means such features can be removed as they do not increase the predictive power of the model.    

# Preprocessing

#### Transformations to be done feature wise
---------------------------
* Location is ignored as it is captured in Latitude and Longitude
* Cross_Street is ignored as it is captured in Latitude and Longitude
* Latitude MinMaxScaler
* Longitude MinMaxScaler
* Date_Reported to be modified as Date Object
* Date_Occurred to be modified as Date Object and Difference between two dates ("Time_Difference") is calculated
* Time_Difference MinMaxScaler
* Time_Occurred MinMaxScaler
* Area_ID OneHotEncoding
* Area_Name is ignored, as it is captured in 'Area_ID'
* Reporting_District_no OneHot encoding
* "Part 1-2" no transformation required
* Separate Modus_Operandi column values into 10 different columns and if the activity is present it will have the code, otherwise "0"
* Victim_Age Min Max Scaling
* Victim_Sex, replace 'NaN' with 'U' (Unknown) and do OneHot encoding
* Victim_Descent, replace 'NaN' with 'N'(None) and do OneHot encoding
* Premise_Code , OneHot encoding
* Weapon_Used_Code , replace 'nan' with 0 and do OneHot encoding
* Status , OneHot encoding
* Ignore "Premise_Description", "Weapon_Description", "Status_Description"




In [None]:
# Separate the target from the features of train_df.

y = train_df['Crime_Category']  # label vector

X = train_df.drop(columns=['Crime_Category'])  # feature matrix (every other column)

In [None]:
# Peek at the raw Modus_Operandi values before splitting them into columns
train_df['Modus_Operandi'].head()

* Looking at the samples of the Modus_Operandi feature, each value consists of numerical codes separated by spaces, stored as an object (string) type.
* The number of codes varies from record to record, with the highest being 10, hence the Modus_Operandi feature is split into 10 new columns to accommodate each code.

In [None]:
# Names of the ten columns that receive the individual Modus_Operandi codes.
MODUS_COLUMNS = [f'Modus{position}' for position in range(1, 11)]


def add_modus_columns(df):
    """Split ``df['Modus_Operandi']`` into the integer columns Modus1..Modus10.

    The source column holds space-separated numeric codes. Each code goes to
    its own column; positions with no code (shorter lists or a missing
    Modus_Operandi value) are filled with 0. ``df`` is modified in place.

    Raises:
        ValueError: if a record holds more than ten codes, because the tenth
            column then contains several codes and cannot be cast to int.
    """
    codes = (
        df['Modus_Operandi']
        .str.split(' ', n=len(MODUS_COLUMNS) - 1, expand=True)
        # expand=True only creates as many columns as the longest record in
        # THIS frame; pad to exactly ten so the assignment below cannot fail
        # when a frame happens to contain only shorter code lists.
        .reindex(columns=range(len(MODUS_COLUMNS)), fill_value='0')
        .fillna('0')
        .astype(int)
    )
    codes.columns = MODUS_COLUMNS
    df[MODUS_COLUMNS] = codes


# Apply the same split to the training and the test data frame.
add_modus_columns(train_df)
add_modus_columns(test_df)

In [None]:
# For the training and the test data frame:
#   1. parse Date_Reported and Date_Occurred into datetime objects,
#   2. add Time_Difference = whole days between occurrence and report.

for frame in (train_df, test_df):
    for date_col in ('Date_Reported', 'Date_Occurred'):
        frame[date_col] = pd.to_datetime(frame[date_col])

    reporting_delay = frame['Date_Reported'] - frame['Date_Occurred']
    frame.loc[:, 'Time_Difference'] = reporting_delay.dt.days

In [None]:
# Remove columns that are no longer needed from both data frames. The target
# column is additionally removed from train_df (it was saved in `y` earlier).

_redundant_columns = [
    "Location", "Cross_Street", "Area_Name",
    "Date_Reported", "Date_Occurred", "Modus_Operandi",
    "Premise_Description", "Weapon_Description", "Status_Description",
]

train_df = train_df.drop(columns=_redundant_columns + ["Crime_Category"])

test_df = test_df.drop(columns=_redundant_columns)

In [None]:
# Stack the training rows on top of the test rows with a fresh 0..n-1 index.
# The row order matters: a later cell splits the encoded matrix back apart at
# len(train_df).
combined_train_test_df=pd.concat([train_df,test_df], ignore_index=True)

**Note** :

Because of one-hot encoding, the number of dummy features generated for the training and test data sets can differ,

which will cause issues when submitting the model.

In [None]:
# Column-wise preprocessing:
#   * numeric features            -> MinMaxScaler
#   * Modus1..Modus10 codes       -> StandardScaler
#   * Victim_Sex / Victim_Descent -> constant imputation ('U' / 'N'), then one-hot
#   * remaining categorical codes -> one-hot
# Categories unseen during fitting are ignored by the one-hot encoders, and
# columns not listed in any transformer are passed through unchanged.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        (
            'MinMax',
            MinMaxScaler(),
            ['Time_Difference', 'Time_Occurred', 'Victim_Age', 'Latitude', 'Longitude'],
        ),
        (
            'StanScalar',
            StandardScaler(),
            ['Modus1', 'Modus2', 'Modus3', 'Modus4', 'Modus5',
             'Modus6', 'Modus7', 'Modus8', 'Modus9', 'Modus10'],
        ),
        (
            'VicGen',
            Pipeline(steps=[
                ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='U')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
            ]),
            ['Victim_Sex'],
        ),
        (
            'VicDes',
            Pipeline(steps=[
                ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='N')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
            ]),
            ['Victim_Descent'],
        ),
        (
            'encoder',
            OneHotEncoder(handle_unknown='ignore'),
            ['Area_ID', 'Reporting_District_no', 'Premise_Code', 'Weapon_Used_Code', 'Status'],
        ),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the preprocessing on the training rows only. This matrix is what a later
# cell splits into the train/validation sets used to compare models.
train_transformed = preprocessing_pipeline.fit_transform(train_df) # Validation Set

# Re-fit the SAME transformer object on the train+test rows so both share one
# set of encoded columns. After this line preprocessing_pipeline is fitted on
# the combined frame, not on train_df alone.
combined_train_test = preprocessing_pipeline.fit_transform(combined_train_test_df) # Original Train and Test combined Set

In [None]:
# Cut the jointly encoded matrix back into its training and test parts.
# The first len(train_df) rows are the training records, the rest are test.

_n_train_rows = len(train_df)

train_encoded, test_encoded = (
    combined_train_test[:_n_train_rows],
    combined_train_test[_n_train_rows:],
)

In [None]:
# Hold out 33% of the encoded training rows (and their labels) as a validation
# set; random_state fixes the shuffle so the split is the same on every run.

X_train, X_val, y_train, y_val = train_test_split(train_transformed, y, test_size=0.33, random_state=42)

# Feature Engineering

## 1. Feature Extraction using PCA

In [None]:
# Standardise the densified features, then project them with PCA.
# n_components=0.25 is a fraction, i.e. PCA keeps as many components as are
# needed to explain 25% of the variance.
scaler = StandardScaler()
pca = PCA(n_components=0.25)

# Scaler and PCA are both fitted on the training split only ...
X_train_scaled = scaler.fit_transform(X_train.toarray())
X_train_pca = pca.fit(X_train_scaled).transform(X_train_scaled)

# ... and then merely applied to the validation split.
X_val_scaled = scaler.transform(X_val.toarray())
X_val_pca = pca.transform(X_val_scaled)

## 2. Feature Selection using SelectKBest

In [None]:
# Keep the 1000 features with the highest mutual information with the target.
# The selector is fitted on the training split only.

selector = SelectKBest(score_func=mutual_info_classif, k=1000).fit(X_train, y_train)

# Apply the fitted selection to the training and validation sets
X_train_selected, X_val_selected = (
    selector.transform(features) for features in (X_train, X_val)
)


In [None]:
# Label Encoding
#
# Every label vector is one-hot encoded and then collapsed to one integer code
# per sample with argmax. The encoder is fitted ONCE, on the full label vector,
# and only applied to the train/validation splits. Re-fitting it per split (as
# was done before) lets the same integer stand for different classes in
# different splits whenever a split lacks a class, and leaves `y_encoder`
# fitted on the validation labels only.

y_reshaped = y.values.reshape(-1, 1)  # Reshaping Original Label

y_train_reshaped = y_train.values.reshape(-1, 1)  # Reshaping Train Label

y_val_reshaped = y_val.values.reshape(-1, 1)  # Reshaping Validation Label

y_encoder = OneHotEncoder(sparse_output=False)  # Creating OneHotEncoder instance

y_transformed = y_encoder.fit_transform(y_reshaped)  # Fit on all labels, encode them

y_train_transformed = y_encoder.transform(y_train_reshaped)  # Encode Train Label (no re-fit)

y_val_transformed = y_encoder.transform(y_val_reshaped)  # Encode Validation Label (no re-fit)

# The model cells translate integer predictions back to class names through
# np.unique(y_train); that is only valid while the training split contains
# every class the encoder knows about.
assert np.array_equal(np.unique(y_train), y_encoder.categories_[0]), (
    "Training split is missing at least one Crime_Category class; "
    "integer codes would not match np.unique(y_train)."
)

y_single_label = np.argmax(y_transformed, axis=1)  # Integer codes, Original Label

y_train_single_label = np.argmax(y_train_transformed, axis=1)  # Integer codes, Train Label

y_val_single_label = np.argmax(y_val_transformed, axis=1)  # Integer codes, Validation Label

# MODELS

1. Logistic Regression

2. KNN Model

3. Decision Tree Classifier

4. Bagging Classifier Model

5. Multi Layer Perceptron (MLP)

6. Gradient Boosting Classifier (Best scoring model - Final submission for competition)

--------------------------------------------

## Logistic Regression

In [None]:
# Hyperparameter tuning for Logistic Regression with RandomizedSearchCV
reg_model = LogisticRegression()

# Search space: two solvers x five regularisation strengths spaced
# logarithmically between 1e-4 and 1e4.
param_dist = dict(
    solver=['liblinear', 'newton-cg'],
    C=np.logspace(-4, 4, 5),
)

# 5-fold cross-validated random search, fitted on the training data
random_search = RandomizedSearchCV(
    estimator=reg_model,
    param_distributions=param_dist,
    n_iter=1000,
    cv=5,
).fit(X_train, y_train_single_label)

best_model, best_params, best_score = (
    random_search.best_estimator_,
    random_search.best_params_,
    random_search.best_score_,
)

print(best_params, best_model, best_score, sep='\n')

***Result*** :

After running RandomizedSearchCV on the Logistic Regression model, these were the best parameters:

{penalty="l2", C=1, multi_class='ovr', solver='newton-cg', max_iter=2000}

In [None]:
# Logistic Regression on the full feature set (no feature engineering):
# validation metrics.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version.

reg_model1 = LogisticRegression(
    penalty="l2",
    C=1,
    multi_class='ovr',
    solver='newton-cg',
    max_iter=2000,
)
reg_model1.fit(X_train, y_train_single_label)
pred_Logreg = reg_model1.predict(X_val)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_original = [label_mapping[code] for code in pred_Logreg]

# Precision, recall and F1 are averaged over the classes, weighted by support.
accuracy_Log = accuracy_score(y_val, y_pred_original)
precision_Log, recall_Log, f1_Log = (
    score(y_val, y_pred_original, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_val, y_pred_original)

print("Accuracy:", accuracy_Log)
print("Precision:", precision_Log)
print("Recall:", recall_Log)
print("F1-score:",f1_Log)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
# Logistic Regression on the PCA-projected features: validation metrics.

reg_model2 = LogisticRegression(
    penalty="l2",
    C=1,
    multi_class='ovr',
    solver='newton-cg',
    max_iter=2000,
)
reg_model2.fit(X_train_pca, y_train_single_label)
pred_Logreg = reg_model2.predict(X_val_pca)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_original = [label_mapping[code] for code in pred_Logreg]

# Precision, recall and F1 are averaged over the classes, weighted by support.
accuracy = accuracy_score(y_val, y_pred_original)
precision, recall, f1 = (
    score(y_val, y_pred_original, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_val, y_pred_original)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:",f1)
print("Confusion Matrix:\n", conf_matrix)


In [None]:
# Logistic Regression on the SelectKBest features: validation metrics.

reg_model3 = LogisticRegression(
    penalty="l2",
    C=1,
    multi_class='ovr',
    solver='newton-cg',
    max_iter=2000,
)
reg_model3.fit(X_train_selected, y_train_single_label)
pred_Logreg = reg_model3.predict(X_val_selected)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_original = [label_mapping[code] for code in pred_Logreg]

# Precision, recall and F1 are averaged over the classes, weighted by support.
accuracy = accuracy_score(y_val, y_pred_original)
precision, recall, f1 = (
    score(y_val, y_pred_original, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_val, y_pred_original)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:",f1)
print("Confusion Matrix:\n", conf_matrix)


-----------------------------------------------------------

## KNN Model

In [None]:
# Hyperparameter tuning for KNN with RandomizedSearchCV

knn_model = KNeighborsClassifier()

# Search space: three neighbourhood sizes x two distance metrics.
param_grid = dict(
    n_neighbors=[9, 11, 15],
    metric=['euclidean', 'manhattan'],
)

# 5-fold cross-validated random search, fitted on the training data
random_search = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
)
random_search.fit(X_train, y_train_single_label)

best_random_params, best_random_score = (
    random_search.best_params_,
    random_search.best_score_,
)

In [None]:
# KNN on the full feature set (no feature engineering): validation metrics.

knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=15)
knn_model.fit(X_train, y_train_single_label)
y_pred_knn = knn_model.predict(X_val)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_original = [label_mapping[code] for code in y_pred_knn]

# Precision, recall and F1 are averaged over the classes, weighted by support.
accuracy_knn = accuracy_score(y_val, y_pred_original)
precision_knn, recall_knn, f1_knn = (
    score(y_val, y_pred_original, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_val, y_pred_original)

print("Accuracy:", accuracy_knn)
print("Precision:", precision_knn)
print("Recall:", recall_knn)
print("F1-score:",f1_knn)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
# KNN on the PCA-projected features: validation metrics.

knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=15)
knn_model.fit(X_train_pca, y_train_single_label)
y_pred_knn = knn_model.predict(X_val_pca)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_original = [label_mapping[code] for code in y_pred_knn]

# Precision, recall and F1 are averaged over the classes, weighted by support.
accuracy = accuracy_score(y_val, y_pred_original)
precision, recall, f1 = (
    score(y_val, y_pred_original, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_val, y_pred_original)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:",f1)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
# KNN on the SelectKBest features: validation metrics.

knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=15)
knn_model.fit(X_train_selected, y_train_single_label)
y_pred_knn = knn_model.predict(X_val_selected)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_original = [label_mapping[code] for code in y_pred_knn]

# Precision, recall and F1 are averaged over the classes, weighted by support.
accuracy = accuracy_score(y_val, y_pred_original)
precision, recall, f1 = (
    score(y_val, y_pred_original, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_val, y_pred_original)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:",f1)
print("Confusion Matrix:\n", conf_matrix)

----------------------------------------------------
## Decision Tree Classifier Model

In [None]:
# Decision tree classifier on the full feature set (no feature engineering)

# The tree is trained and evaluated on dense copies of the feature matrices.
X_dense, X_val_dense = X_train.toarray(), X_val.toarray()

# Shallow tree: depth capped at 5, with minimum sample counts per split/leaf.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
)
clf.fit(X_dense, y_train_single_label)
predictions = clf.predict(X_val_dense)

# Integer class code -> original label name (codes follow the sorted order of
# the training labels).
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_tree = [label_mapping[code] for code in predictions]

# Validation metrics; precision, recall and F1 are support-weighted averages.
accuracy_tree = accuracy_score(y_val, y_pred_tree)
precision_tree, recall_tree, f1_tree = (
    score(y_val, y_pred_tree, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix_tree = confusion_matrix(y_val, y_pred_tree)
class_report_tree = classification_report(y_val, y_pred_tree)

# Report
print(f'Accuracy: {accuracy_tree}')
print(f'Precision: {precision_tree}')
print(f'Recall: {recall_tree}')
print(f'F1 Score: {f1_tree}')
print(f'Confusion Matrix:\n{conf_matrix_tree}')
print(f'Classification Report:\n{class_report_tree}')

In [None]:
# Decision tree classifier with Feature Engineering (PCA)

clf_PCA = DecisionTreeClassifier()

# Train on the integer-encoded labels. Training on the string labels in
# `y_train` makes predict() return strings, and the int-keyed `label_mapping`
# lookup below then fails with a KeyError.
clf_PCA.fit(X_train_pca, y_train_single_label)

# Predicted integer class codes for the validation set
predictions = clf_PCA.predict(X_val_pca)

# Get unique original labels (sorted, so position i is the name of code i)
original_labels = np.unique(y_train)

# Create mapping: integer class code -> original label name
label_mapping = {i: label for i, label in enumerate(original_labels)}

# Convert predicted indices to original labels
y_pred_tree = [label_mapping[pred] for pred in predictions]


# Calculate metrics (precision, recall and F1 are support-weighted averages)
accuracy_tree_pca = accuracy_score(y_val, y_pred_tree)
precision_tree_pca = precision_score(y_val, y_pred_tree, average='weighted')
recall_tree_pca = recall_score(y_val, y_pred_tree, average='weighted')
f1_tree_pca = f1_score(y_val, y_pred_tree, average='weighted')
conf_matrix_tree_pca = confusion_matrix(y_val, y_pred_tree)
class_report_tree_pca = classification_report(y_val, y_pred_tree)

# Print metrics
print(f'Accuracy: {accuracy_tree_pca}')
print(f'Precision: {precision_tree_pca}')
print(f'Recall: {recall_tree_pca}')
print(f'F1 Score: {f1_tree_pca}')
print(f'Confusion Matrix:\n{conf_matrix_tree_pca}')
print(f'Classification Report:\n{class_report_tree_pca}')

In [None]:
# Decision tree classifier on the SelectKBest features

# This tree is trained on the original string labels, so its predictions are
# already label names and need no code-to-name mapping.
clf_Sel = DecisionTreeClassifier().fit(X_train_selected, y_train)

predictions = y_pred_tree = clf_Sel.predict(X_val_selected)

# Validation metrics; precision, recall and F1 are support-weighted averages.
accuracy_tree_val = accuracy_score(y_val, y_pred_tree)
precision_tree_val, recall_tree_val, f1_tree_val = (
    score(y_val, y_pred_tree, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix_tree_val = confusion_matrix(y_val, y_pred_tree)
class_report_tree_val = classification_report(y_val, y_pred_tree)

# Report
print(f'Accuracy: {accuracy_tree_val}')
print(f'Precision: {precision_tree_val}')
print(f'Recall: {recall_tree_val}')
print(f'F1 Score: {f1_tree_val}')
print(f'Confusion Matrix:\n{conf_matrix_tree_val}')
print(f'Classification Report:\n{class_report_tree_val}')


---

# Bagging Classifier Model

In [None]:
# Bagging ensemble of decision trees, without Feature Engineering


class BaseModel(BaseEstimator):
    """Placeholder estimator whose fit and predict do nothing.

    NOTE(review): not referenced by the visible notebook code; kept only so
    that anything relying on the name keeps working.
    """

    def fit(self, X_train, y_train_single_label):

        pass

    def predict(self, X_train):

        pass


base_estimator = DecisionTreeClassifier()

n_estimators = 100

# `estimator` is the current name of this argument: scikit-learn 1.2 renamed
# it from `base_estimator` and 1.4 removed the old spelling.
bagging_model = BaggingClassifier(estimator=base_estimator, n_estimators=n_estimators)

bagging_model.fit(X_train, y_train_single_label)

# Predicted integer class codes for the validation set
predictions_bagging = bagging_model.predict(X_val)

# Get unique original labels (sorted, so position i is the name of code i)
original_labels = np.unique(y_train)

# Create mapping: integer class code -> original label name
label_mapping = {i: label for i, label in enumerate(original_labels)}

# Convert predicted indices to original labels
y_pred_bagging = [label_mapping[pred] for pred in predictions_bagging]

# Bagging Validation Metrics (precision, recall and F1 are support-weighted)

accuracy_bag = accuracy_score(y_val, y_pred_bagging)
precision_bag = precision_score(y_val, y_pred_bagging, average='weighted')
recall_bag = recall_score(y_val, y_pred_bagging, average='weighted')
f1_bag = f1_score(y_val, y_pred_bagging, average='weighted')
cm_bag = confusion_matrix(y_val, y_pred_bagging)
# One-vs-rest ROC-AUC from the class probabilities.
# NOTE(review): assumes the probability columns (ordered by integer code) line
# up with the sorted string labels in y_val -- confirm.
roc_auc_bag = roc_auc_score(y_val, bagging_model.predict_proba(X_val), multi_class='ovr')

print(f"Accuracy: {accuracy_bag:.4f}")
print(f"Precision: {precision_bag:.4f}")
print(f"Recall: {recall_bag:.4f}")
print(f"F1 Score: {f1_bag:.4f}")
print("Confusion Matrix:")
print(cm_bag)
print(f"ROC-AUC: {roc_auc_bag:.4f}")

In [None]:
# Bagging ensemble of decision trees with Feature Engineering (PCA)


class BaseModel(BaseEstimator):
    """Placeholder estimator whose fit and predict do nothing.

    NOTE(review): not referenced by the visible notebook code; kept only so
    that anything relying on the name keeps working.
    """

    def fit(self, X_train_pca, y_train_single_label):

        pass

    def predict(self, X_train_pca):

        pass


base_estimator = DecisionTreeClassifier()

n_estimators = 100

# `estimator` is the current name of this argument: scikit-learn 1.2 renamed
# it from `base_estimator` and 1.4 removed the old spelling.
bagging_model = BaggingClassifier(estimator=base_estimator, n_estimators=n_estimators)

bagging_model.fit(X_train_pca, y_train_single_label)

# Predicted integer class codes for the validation set
predictions_bagging = bagging_model.predict(X_val_pca)

# Get unique original labels (sorted, so position i is the name of code i)
original_labels = np.unique(y_train)

# Create mapping: integer class code -> original label name
label_mapping = {i: label for i, label in enumerate(original_labels)}

# Convert predicted indices to original labels
y_pred_bagging = [label_mapping[pred] for pred in predictions_bagging]

# Bagging Validation Metrics (precision, recall and F1 are support-weighted)

accuracy_bagging = accuracy_score(y_val, y_pred_bagging)
precision_bagging = precision_score(y_val, y_pred_bagging, average='weighted')
recall_bagging = recall_score(y_val, y_pred_bagging, average='weighted')
f1_bagging = f1_score(y_val, y_pred_bagging, average='weighted')
cm_bagging = confusion_matrix(y_val, y_pred_bagging)
# One-vs-rest ROC-AUC from the class probabilities.
# NOTE(review): assumes the probability columns (ordered by integer code) line
# up with the sorted string labels in y_val -- confirm.
roc_auc_bagging = roc_auc_score(y_val, bagging_model.predict_proba(X_val_pca), multi_class='ovr')

print(f"Accuracy: {accuracy_bagging:.4f}")
print(f"Precision: {precision_bagging:.4f}")
print(f"Recall: {recall_bagging:.4f}")
print(f"F1 Score: {f1_bagging:.4f}")
print("Confusion Matrix:")
print(cm_bagging)
print(f"ROC-AUC: {roc_auc_bagging:.4f}")

In [None]:
# Bagging ensemble of decision trees with Feature Engineering (SelectKBest)


class BaseModel(BaseEstimator):
    """Placeholder estimator whose fit and predict do nothing.

    NOTE(review): not referenced by the visible notebook code; kept only so
    that anything relying on the name keeps working.
    """

    def fit(self, X_train_selected, y_train_single_label):

        pass

    def predict(self, X_train_selected):

        pass


base_estimator = DecisionTreeClassifier()

n_estimators = 100

# `estimator` is the current name of this argument: scikit-learn 1.2 renamed
# it from `base_estimator` and 1.4 removed the old spelling.
bagging_model = BaggingClassifier(estimator=base_estimator, n_estimators=n_estimators)

bagging_model.fit(X_train_selected, y_train_single_label)

# Predicted integer class codes for the validation set
predictions_bagging = bagging_model.predict(X_val_selected)

# Get unique original labels (sorted, so position i is the name of code i)
original_labels = np.unique(y_train)

# Create mapping: integer class code -> original label name
label_mapping = {i: label for i, label in enumerate(original_labels)}

# Convert predicted indices to original labels
y_pred_bagging = [label_mapping[pred] for pred in predictions_bagging]

# Bagging Validation Metrics (precision, recall and F1 are support-weighted)

accuracy_bagging = accuracy_score(y_val, y_pred_bagging)
precision_bagging = precision_score(y_val, y_pred_bagging, average='weighted')
recall_bagging = recall_score(y_val, y_pred_bagging, average='weighted')
f1_bagging = f1_score(y_val, y_pred_bagging, average='weighted')
cm_bagging = confusion_matrix(y_val, y_pred_bagging)
# One-vs-rest ROC-AUC from the class probabilities.
# NOTE(review): assumes the probability columns (ordered by integer code) line
# up with the sorted string labels in y_val -- confirm.
roc_auc_bagging = roc_auc_score(y_val, bagging_model.predict_proba(X_val_selected), multi_class='ovr')

print(f"Accuracy: {accuracy_bagging:.4f}")
print(f"Precision: {precision_bagging:.4f}")
print(f"Recall: {recall_bagging:.4f}")
print(f"F1 Score: {f1_bagging:.4f}")
print("Confusion Matrix:")
print(cm_bagging)
print(f"ROC-AUC: {roc_auc_bagging:.4f}")

---

## Multi-Layer Perceptron Classifier

In [None]:
# MLPClassifier trained on the full feature set (no feature engineering)

# Two hidden layers (100 and 50 units), ReLU activations, Adam optimiser.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    random_state=42,
)
mlp.fit(X_train, y_train_single_label)

predictions_mlp = mlp.predict(X_val)

# Translate the predicted class indices into the original labels:
# position k in the sorted unique labels stands for class index k.
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_mlp = [label_mapping[class_index] for class_index in predictions_mlp]

# MLPClassifier validation metrics (weighted averages across classes)
accuracy_mlp1 = accuracy_score(y_val, y_pred_mlp)
precision_mlp = precision_score(y_val, y_pred_mlp, average='weighted')
recall_mlp = recall_score(y_val, y_pred_mlp, average='weighted')
f1_mlp = f1_score(y_val, y_pred_mlp, average='weighted')
cm_mlp = confusion_matrix(y_val, y_pred_mlp)
roc_auc_mlp = roc_auc_score(
    y_val,
    mlp.predict_proba(X_val),
    multi_class='ovr',
)

# Report every metric, one per line.
print(
    f"Accuracy: {accuracy_mlp1:.4f}",
    f"Precision: {precision_mlp:.4f}",
    f"Recall: {recall_mlp:.4f}",
    f"F1 Score: {f1_mlp:.4f}",
    "Confusion Matrix:",
    cm_mlp,
    f"ROC-AUC: {roc_auc_mlp:.4f}",
    sep="\n",
)

In [None]:
# MLPClassifier trained on the SelectKBest feature subset

# Two hidden layers (100 and 50 units), ReLU activations, Adam optimiser.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    random_state=42,
)
mlp.fit(X_train_selected, y_train_single_label)

predictions_mlp = mlp.predict(X_val_selected)

# Translate the predicted class indices into the original labels:
# position k in the sorted unique labels stands for class index k.
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_mlp = [label_mapping[class_index] for class_index in predictions_mlp]

# MLPClassifier validation metrics (weighted averages across classes)
accuracy_mlp = accuracy_score(y_val, y_pred_mlp)
precision_mlp = precision_score(y_val, y_pred_mlp, average='weighted')
recall_mlp = recall_score(y_val, y_pred_mlp, average='weighted')
f1_mlp = f1_score(y_val, y_pred_mlp, average='weighted')
cm_mlp = confusion_matrix(y_val, y_pred_mlp)
roc_auc_mlp = roc_auc_score(
    y_val,
    mlp.predict_proba(X_val_selected),
    multi_class='ovr',
)

# Report every metric, one per line.
print(
    f"Accuracy: {accuracy_mlp:.4f}",
    f"Precision: {precision_mlp:.4f}",
    f"Recall: {recall_mlp:.4f}",
    f"F1 Score: {f1_mlp:.4f}",
    "Confusion Matrix:",
    cm_mlp,
    f"ROC-AUC: {roc_auc_mlp:.4f}",
    sep="\n",
)

In [None]:
# MLPClassifier trained on the PCA-transformed features

# Two hidden layers (100 and 50 units), ReLU activations, Adam optimiser.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    random_state=42,
)
mlp.fit(X_train_pca, y_train_single_label)

predictions_mlp = mlp.predict(X_val_pca)

# Translate the predicted class indices into the original labels:
# position k in the sorted unique labels stands for class index k.
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_mlp = [label_mapping[class_index] for class_index in predictions_mlp]

# MLPClassifier validation metrics (weighted averages across classes)
accuracy_mlp = accuracy_score(y_val, y_pred_mlp)
precision_mlp = precision_score(y_val, y_pred_mlp, average='weighted')
recall_mlp = recall_score(y_val, y_pred_mlp, average='weighted')
f1_mlp = f1_score(y_val, y_pred_mlp, average='weighted')
cm_mlp = confusion_matrix(y_val, y_pred_mlp)
roc_auc_mlp = roc_auc_score(
    y_val,
    mlp.predict_proba(X_val_pca),
    multi_class='ovr',
)

# Report every metric, one per line.
print(
    f"Accuracy: {accuracy_mlp:.4f}",
    f"Precision: {precision_mlp:.4f}",
    f"Recall: {recall_mlp:.4f}",
    f"F1 Score: {f1_mlp:.4f}",
    "Confusion Matrix:",
    cm_mlp,
    f"ROC-AUC: {roc_auc_mlp:.4f}",
    sep="\n",
)

---
# Gradient Boosting Classifier

### Hyperparameter Tuning for Gradient Boosting Classifier

In [None]:
from scipy.stats import randint

# Search space for the randomized search. scipy's randint excludes its upper
# bound, so n_estimators is drawn from 200..299 and max_depth from {4, 5}.
param_dist = {
    'n_estimators': randint(200, 300),
    'learning_rate': [0.05, 0.01],
    'max_depth': randint(4, 6)
}

# Both the estimator and the search are seeded so that re-running the notebook
# samples the same 3 candidates and selects the same winner.
# NOTE(review): the hard-coded parameters used by the later Gradient Boosting
# cells were recorded from a previous run of this search; a seeded run may pick
# a different combination - re-check them after running this cell.
GBCmodel = GradientBoostingClassifier(random_state=42)
GBCrandom_search = RandomizedSearchCV(
    estimator=GBCmodel,
    param_distributions=param_dist,
    n_iter=3,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
GBCrandom_search.fit(X_train_selected, y_train_single_label)


# Get the best model, its parameters and its mean cross-validated accuracy
GBCbest_model = GBCrandom_search.best_estimator_
GBCbest_params = GBCrandom_search.best_params_
GBCbest_score = GBCrandom_search.best_score_

print(GBCbest_model)
print(GBCbest_params)
print(GBCbest_score)

**Result** :

After running `RandomizedSearchCV`, these were the best parameters obtained:


GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=273)


{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 273}


## Gradient Boosting Classifier Model for Validation

In [None]:
# Gradient Boosting on the full feature set (no feature engineering)

# Hyperparameters taken from the randomized-search result recorded above.
best_gbc = GradientBoostingClassifier(
    n_estimators=273,
    max_depth=5,
    learning_rate=0.01,
    random_state=42,
)
best_gbc.fit(X_train, y_train_single_label)

# Predicted class indices for the validation split
predictions_GBC = best_gbc.predict(X_val)

# Translate the predicted class indices into the original labels:
# position k in the sorted unique labels stands for class index k.
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_GBC = [label_mapping[class_index] for class_index in predictions_GBC]

# Validation metrics for GradientBoostingClassifier.
# The scalar scores and GBCconf_matrix compare encoded targets with encoded
# predictions; cm_GBC and the ROC-AUC use y_val together with the mapped
# labels / predicted probabilities.
GBCaccuracy = accuracy_score(y_val_single_label, predictions_GBC)
GBCprecision = precision_score(y_val_single_label, predictions_GBC, average='weighted')
GBCrecall = recall_score(y_val_single_label, predictions_GBC, average='weighted')
GBCf1 = f1_score(y_val_single_label, predictions_GBC, average='weighted')
GBCconf_matrix = confusion_matrix(y_val_single_label, predictions_GBC)
cm_GBC = confusion_matrix(y_val, y_pred_GBC)
roc_auc_GBC = roc_auc_score(
    y_val,
    best_gbc.predict_proba(X_val),
    multi_class='ovr',
)

print("Accuracy:", GBCaccuracy)
print("Precision:", GBCprecision)
print("Recall:", GBCrecall)
print("F1-score:", GBCf1)
print("Confusion Matrix:\n", GBCconf_matrix)
print(cm_GBC)
print(f"ROC-AUC: {roc_auc_GBC:.4f}")

In [None]:
# Gradient Boosting on the SelectKBest feature subset

# Hyperparameters taken from the randomized-search result recorded above.
best_gbc = GradientBoostingClassifier(
    n_estimators=273,
    max_depth=5,
    learning_rate=0.01,
    random_state=42,
)
best_gbc.fit(X_train_selected, y_train_single_label)

# Predicted class indices for the validation split
predictions_GBC = best_gbc.predict(X_val_selected)

# Translate the predicted class indices into the original labels:
# position k in the sorted unique labels stands for class index k.
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_GBC = [label_mapping[class_index] for class_index in predictions_GBC]

# Validation metrics for GradientBoostingClassifier.
# The scalar scores and GBCconf_matrix compare encoded targets with encoded
# predictions; cm_GBC and the ROC-AUC use y_val together with the mapped
# labels / predicted probabilities.
GBCaccuracy = accuracy_score(y_val_single_label, predictions_GBC)
GBCprecision = precision_score(y_val_single_label, predictions_GBC, average='weighted')
GBCrecall = recall_score(y_val_single_label, predictions_GBC, average='weighted')
GBCf1 = f1_score(y_val_single_label, predictions_GBC, average='weighted')
GBCconf_matrix = confusion_matrix(y_val_single_label, predictions_GBC)
cm_GBC = confusion_matrix(y_val, y_pred_GBC)
roc_auc_GBC = roc_auc_score(
    y_val,
    best_gbc.predict_proba(X_val_selected),
    multi_class='ovr',
)

print("Accuracy:", GBCaccuracy)
print("Precision:", GBCprecision)
print("Recall:", GBCrecall)
print("F1-score:", GBCf1)
print("Confusion Matrix:\n", GBCconf_matrix)
print(cm_GBC)
print(f"ROC-AUC: {roc_auc_GBC:.4f}")

In [None]:
# Gradient Boosting on the PCA-transformed features

# Hyperparameters taken from the randomized-search result recorded above.
best_gbc = GradientBoostingClassifier(
    n_estimators=273,
    max_depth=5,
    learning_rate=0.01,
    random_state=42,
)
best_gbc.fit(X_train_pca, y_train_single_label)

# Predicted class indices for the validation split
predictions_GBC = best_gbc.predict(X_val_pca)

# Translate the predicted class indices into the original labels:
# position k in the sorted unique labels stands for class index k.
original_labels = np.unique(y_train)
label_mapping = dict(enumerate(original_labels))
y_pred_GBC = [label_mapping[class_index] for class_index in predictions_GBC]

# Validation metrics for GradientBoostingClassifier.
# The scalar scores and GBCconf_matrix compare encoded targets with encoded
# predictions; cm_GBC and the ROC-AUC use y_val together with the mapped
# labels / predicted probabilities.
GBCaccuracy = accuracy_score(y_val_single_label, predictions_GBC)
GBCprecision = precision_score(y_val_single_label, predictions_GBC, average='weighted')
GBCrecall = recall_score(y_val_single_label, predictions_GBC, average='weighted')
GBCf1 = f1_score(y_val_single_label, predictions_GBC, average='weighted')
GBCconf_matrix = confusion_matrix(y_val_single_label, predictions_GBC)
cm_GBC = confusion_matrix(y_val, y_pred_GBC)
roc_auc_GBC = roc_auc_score(
    y_val,
    best_gbc.predict_proba(X_val_pca),
    multi_class='ovr',
)

print("Accuracy:", GBCaccuracy)
print("Precision:", GBCprecision)
print("Recall:", GBCrecall)
print("F1-score:", GBCf1)
print("Confusion Matrix:\n", GBCconf_matrix)
print(cm_GBC)
print(f"ROC-AUC: {roc_auc_GBC:.4f}")

----------------------------------

In [None]:
# Model names and their validation accuracy, in matching order.
# The fourth entry is fed by accuracy_bag, which comes from the bagging
# ensemble, so it is labelled "Bagging".
# NOTE(review): GBCaccuracy is reassigned by every Gradient Boosting validation
# cell, so at this point it holds the score of the last one executed (the PCA
# variant when run top to bottom), whereas accuracy_mlp1 is the score without
# feature engineering - confirm the bars compare like with like.
model_names = ["Logistic Reg", "KNN", "Decision Tree", "Bagging", "MLPC", "Gradient Boosting"]
accuracy_scores = [accuracy_Log, accuracy_knn, accuracy_tree, accuracy_bag, accuracy_mlp1, GBCaccuracy]

# Horizontal bar chart: accuracy runs along the x-axis, one bar per model on
# the y-axis, so the axis labels are assigned accordingly.
plt.figure(figsize=(8, 6))
plt.barh(model_names, accuracy_scores, color=['blue', 'green', 'orange', 'red', 'purple', 'yellow'])
plt.xlabel("Accuracy Score")
plt.ylabel("Model Name")
plt.title("Model Performance Comparison")

# Display the chart
plt.tight_layout()
plt.show()

## Gradient Boosting Classifier Model (Best Model submitted for Predictions)

In [None]:
# Final Gradient Boosting model: fit on the full training data, predict the
# test data and write the competition submission file.

best_gbc = GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=273, random_state=42)

best_gbc.fit(train_encoded, y_single_label)

# Predicted class indices for the test set
predictions_GBC = best_gbc.predict(test_encoded)

# Map predicted class indices back to the original labels.
# NOTE(review): this assumes y_single_label holds integer codes 0..K-1 assigned
# in the sorted order returned by np.unique(y) - confirm against the cell that
# builds y_single_label.
original_labels = np.unique(y)
label_mapping = {i: label for i, label in enumerate(original_labels)}
y_pred_GBC = [label_mapping[pred] for pred in predictions_GBC]

# Build the submission: IDs run from 1 to the number of predictions, so the
# frame always has matching column lengths regardless of the test-set size
# (the previous hard-coded 1..5000 range raised ValueError for any other size).
submission = pd.DataFrame({"ID": np.arange(1, len(y_pred_GBC) + 1),
                           "Crime_Category": y_pred_GBC,
                            })
submission.to_csv('/kaggle/working/submission.csv', index=False)
