In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for file_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv
/kaggle/input/crime-cast-forecasting-crime-categories/train.csv
/kaggle/input/crime-cast-forecasting-crime-categories/test.csv


<h1 style="color:purple;">Crime Category Prediction Challenge</h1>

## Introduction
This project aims to develop predictive models capable of accurately predicting crime categories based on various attributes related to criminal activities. The dataset provides a comprehensive snapshot of criminal activities within the city, including details such as incident locations, victim demographics, and more. By leveraging machine learning techniques, the goal is to enhance law enforcement strategies and bolster public safety measures.

## Dataset Description
The dataset consists of the following files:
- **train.csv**: The training set, inclusive of the target variable 'crime_category' and relevant feature attributes.
  - Shape: (20000, 22)
- **test.csv**: The test set, containing similar feature attributes but excluding the target variable 'crime_category'.
  - Shape: (5000, 21)
- **sample.csv**: A sample submission file provided in the correct format for competition submissions.

### Features in the Dataset
- **Location**: Street address of the crime incident.
- **Cross_Street**: Cross street of the rounded address.
- **Latitude**: Latitude coordinates of the crime incident.
- **Longitude**: Longitude coordinates of the crime incident.
- **Date_Reported**: Date the incident was reported.
- **Date_Occurred**: Date the incident occurred.
- **Time_Occurred**: Time the incident occurred in 24-hour military time.
- **Area_ID**: LAPD's Geographic Area number.
- **Area_Name**: Name designation of the LAPD Geographic Area.
- **Reporting_District_no**: Reporting district number.
- **Part 1-2**: Crime classification.
- **Modus_Operandi**: Activities associated with the suspect.
- **Victim_Age**: Age of the victim.
- **Victim_Sex**: Gender of the victim.
- **Victim_Descent**: Descent code of the victim.
- **Premise_Code**: Premise code indicating the location of the crime.
- **Premise_Description**: Description of the premise code.
- **Weapon_Used_Code**: Weapon code indicating the type of weapon used.
- **Weapon_Description**: Description of the weapon code.
- **Status**: Status of the case.
- **Status_Description**: Description of the status code.
- **Crime_Category**: The category of the crime (Target Variable, only in train.csv)

## Libraries Used
The following libraries were utilized in this project:
- **NumPy**: For numerical operations and handling arrays.
- **Pandas**: For data manipulation and analysis.
- **Scikit-learn**: For machine learning algorithms and model evaluation.
- **XGBoost**: For gradient boosting algorithms.
- **LightGBM**: For gradient boosting algorithms.
- **Imblearn**: For handling imbalanced datasets.
- **SciPy**: For scientific computations.
- **Seaborn**: For statistical data visualization.
- **Matplotlib**: For plotting and data visualization.



<h2 style="color:purple;">Required Libraries Import</h2>

In [None]:
import warnings

# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this filter has no category or module restriction, so it also
# hides deprecation warnings and any warnings raised while fitting models;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from scipy.stats import randint, uniform
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score






<h2 style="color:purple;">Data Loading And Data Exploration</h2>

In [None]:
# Read the competition's training and test splits
train_df = pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/train.csv")
test_df = pd.read_csv('/kaggle/input/crime-cast-forecasting-crime-categories/test.csv')

# Strip spaces out of the column names of both frames so that they can be
# referenced without embedded blanks
for frame in (test_df, train_df):
    frame.columns = frame.columns.str.replace(' ', '')

In [None]:
# Preview the first rows of the training data (head() defaults to five rows)
train_df.head()

Unnamed: 0,Location,Cross_Street,Latitude,Longitude,Date_Reported,Date_Occurred,Time_Occurred,Area_ID,Area_Name,Reporting_District_no,...,Victim_Age,Victim_Sex,Victim_Descent,Premise_Code,Premise_Description,Weapon_Used_Code,Weapon_Description,Status,Status_Description,Crime_Category
0,4500 CARPENTER AV,,34.1522,-118.391,03/09/2020 12:00:00 AM,03/06/2020 12:00:00 AM,1800.0,15.0,N Hollywood,1563.0,...,75.0,M,W,101.0,STREET,,,IC,Invest Cont,Property Crimes
1,45TH ST,ALAMEDA ST,34.0028,-118.2391,02/27/2020 12:00:00 AM,02/27/2020 12:00:00 AM,1345.0,13.0,Newton,1367.0,...,41.0,M,H,216.0,SWAP MEET,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,Property Crimes
2,600 E MARTIN LUTHER KING JR BL,,34.0111,-118.2653,08/21/2020 12:00:00 AM,08/21/2020 12:00:00 AM,605.0,13.0,Newton,1343.0,...,67.0,M,B,501.0,SINGLE FAMILY DWELLING,,,IC,Invest Cont,Property Crimes
3,14900 ORO GRANDE ST,,34.2953,-118.459,11/08/2020 12:00:00 AM,11/06/2020 12:00:00 AM,1800.0,19.0,Mission,1924.0,...,61.0,M,H,101.0,STREET,,,IC,Invest Cont,Property Crimes
4,7100 S VERMONT AV,,33.9787,-118.2918,02/25/2020 12:00:00 AM,02/25/2020 12:00:00 AM,1130.0,12.0,77th Street,1245.0,...,0.0,X,X,401.0,MINI-MART,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,Property Crimes


In [None]:
# Report the (rows, columns) dimensions of the training data
train_df.shape

(20000, 22)

In [None]:
# Print a concise summary of the training DataFrame: column names, non-null counts, and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Location               20000 non-null  object 
 1   Cross_Street           3448 non-null   object 
 2   Latitude               20000 non-null  float64
 3   Longitude              20000 non-null  float64
 4   Date_Reported          20000 non-null  object 
 5   Date_Occurred          20000 non-null  object 
 6   Time_Occurred          20000 non-null  float64
 7   Area_ID                20000 non-null  float64
 8   Area_Name              20000 non-null  object 
 9   Reporting_District_no  20000 non-null  float64
 10  Part1-2                20000 non-null  float64
 11  Modus_Operandi         17259 non-null  object 
 12  Victim_Age             20000 non-null  float64
 13  Victim_Sex             17376 non-null  object 
 14  Victim_Descent         17376 non-null  object 
 15  Pr

In [None]:
# Count the missing values in each column of the training data
print(train_df.isna().sum())

Location                     0
Cross_Street             16552
Latitude                     0
Longitude                    0
Date_Reported                0
Date_Occurred                0
Time_Occurred                0
Area_ID                      0
Area_Name                    0
Reporting_District_no        0
Part1-2                      0
Modus_Operandi            2741
Victim_Age                   0
Victim_Sex                2624
Victim_Descent            2624
Premise_Code                 0
Premise_Description          5
Weapon_Used_Code         12665
Weapon_Description       12665
Status                       0
Status_Description           0
Crime_Category               0
dtype: int64


In [None]:
# Preview the first rows of the test data (head() defaults to five rows)
test_df.head()

Unnamed: 0,Location,Cross_Street,Latitude,Longitude,Date_Reported,Date_Occurred,Time_Occurred,Area_ID,Area_Name,Reporting_District_no,...,Modus_Operandi,Victim_Age,Victim_Sex,Victim_Descent,Premise_Code,Premise_Description,Weapon_Used_Code,Weapon_Description,Status,Status_Description
0,1500 LEIGHTON AV,,34.0128,-118.3045,03/03/2020 12:00:00 AM,03/03/2020 12:00:00 AM,2000.0,3.0,Southwest,376.0,...,0416 1241 1243 1813 1821 2000,28.0,F,H,501.0,SINGLE FAMILY DWELLING,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont
1,100 S NORMANDIE AV,,34.0726,-118.3029,06/01/2020 12:00:00 AM,04/25/2020 12:00:00 AM,1700.0,20.0,Olympic,2014.0,...,0344 0394,26.0,M,B,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont
2,300 E 111TH ST,,33.9348,-118.2695,08/28/2020 12:00:00 AM,08/27/2020 12:00:00 AM,900.0,18.0,Southeast,1844.0,...,1822 0701 1914 0355 1202 0100,62.0,F,B,721.0,HIGH SCHOOL,,,IC,Invest Cont
3,1300 S LA BREA AV,,34.0497,-118.3442,12/23/2020 12:00:00 AM,12/03/2020 12:00:00 AM,2200.0,7.0,Wilshire,765.0,...,,0.0,,,108.0,PARKING LOT,,,IC,Invest Cont
4,11000 MORRISON ST,,34.1611,-118.3704,08/30/2020 12:00:00 AM,08/29/2020 12:00:00 AM,130.0,15.0,N Hollywood,1555.0,...,1501,37.0,F,W,501.0,SINGLE FAMILY DWELLING,,,AO,Adult Other


In [None]:
# Report the (rows, columns) dimensions of the test data
test_df.shape

(5000, 21)

In [None]:
# Print a concise summary of the test DataFrame: column names, non-null counts, and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Location               5000 non-null   object 
 1   Cross_Street           810 non-null    object 
 2   Latitude               5000 non-null   float64
 3   Longitude              5000 non-null   float64
 4   Date_Reported          5000 non-null   object 
 5   Date_Occurred          5000 non-null   object 
 6   Time_Occurred          5000 non-null   float64
 7   Area_ID                5000 non-null   float64
 8   Area_Name              5000 non-null   object 
 9   Reporting_District_no  5000 non-null   float64
 10  Part1-2                5000 non-null   float64
 11  Modus_Operandi         4316 non-null   object 
 12  Victim_Age             5000 non-null   float64
 13  Victim_Sex             4357 non-null   object 
 14  Victim_Descent         4357 non-null   object 
 15  Prem

In [None]:
# Count the missing values in each column of the test data
print(test_df.isna().sum())

Location                    0
Cross_Street             4190
Latitude                    0
Longitude                   0
Date_Reported               0
Date_Occurred               0
Time_Occurred               0
Area_ID                     0
Area_Name                   0
Reporting_District_no       0
Part1-2                     0
Modus_Operandi            684
Victim_Age                  0
Victim_Sex                643
Victim_Descent            643
Premise_Code                0
Premise_Description         1
Weapon_Used_Code         3153
Weapon_Description       3153
Status                      0
Status_Description          0
dtype: int64


<h2 style="color:purple;">Feature Engineering</h2>

In [None]:
# Feature engineering: replace the raw date strings with numeric date parts.
def feature_engineering(df):
    """Expand the raw date columns into numeric year/month/day features.

    For each of ``Date_Reported`` and ``Date_Occurred`` that is present, the
    column is parsed with ``pd.to_datetime(..., errors='coerce')`` and three
    new columns ``<name>_Year``, ``<name>_Month`` and ``<name>_Day`` are
    appended. The original date columns are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended and the raw date
        columns dropped. Values that cannot be parsed become NaT, so their
        derived year/month/day entries are missing.
    """
    date_cols = ['Date_Reported', 'Date_Occurred']

    # Work on a copy so the caller's DataFrame is never mutated as a side effect.
    df = df.copy()

    # Only handle date columns that still exist. This makes the function
    # idempotent: re-running the cell on an already-processed frame is a no-op
    # instead of raising KeyError on the dropped columns.
    present_cols = [col for col in date_cols if col in df.columns]

    for date_col in present_cols:
        parsed = pd.to_datetime(df[date_col], errors='coerce')
        df[f'{date_col}_Year'] = parsed.dt.year
        df[f'{date_col}_Month'] = parsed.dt.month
        df[f'{date_col}_Day'] = parsed.dt.day

    return df.drop(columns=present_cols)
# Apply the same date expansion to both splits so that they end up with the
# same derived date columns
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

<h2 style="color:purple;">Distribution of Target Variable</h2>

In [None]:
'''
# Plot a count plot for the 'Crime_Category' column to show the frequency of each category
sns.countplot(x='Crime_Category', data=train_df)
# Rotate the x-axis labels by 90 degrees for better readability
plt.xticks(rotation=90)
plt.show()
'''


"\n# Plot a count plot for the 'Crime_Category' column to show the frequency of each category\nsns.countplot(x='Crime_Category', data=train_df)\n# Rotate the x-axis labels by 90 degrees for better readability\nplt.xticks(rotation=90)\nplt.show()\n"

**As we can see, the classes of the target column are imbalanced. We will first fit the models without any resampling, and then check whether sampling techniques that address this imbalance improve the accuracy further.**

<h2 style="color:purple;">Exploratory Data Analysis</h2>

In [None]:
# Name of the label column; it is excluded from both feature lists below
target_col = 'Crime_Category'

# Treat the Part 1/2 classification as a categorical label rather than a number
for frame in (train_df, test_df):
    frame['Part1-2'] = frame['Part1-2'].astype('category')

# Split the training columns into numerical and categorical features by dtype.
# NOTE(review): in the recorded output the Date_*_Year/Month/Day columns appear
# in neither list - presumably their dtype is not float64/int64; confirm
# whether leaving them out is intended.
numerical_cols = [
    col
    for col in train_df.select_dtypes(include=['float64', 'int64']).columns
    if col != target_col
]
categorical_cols = [
    col
    for col in train_df.select_dtypes(include=['object', 'category']).columns
    if col != target_col
]

# Show which columns landed in each group
print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

Numerical columns: ['Latitude', 'Longitude', 'Time_Occurred', 'Area_ID', 'Reporting_District_no', 'Victim_Age', 'Premise_Code', 'Weapon_Used_Code']
Categorical columns: ['Location', 'Cross_Street', 'Area_Name', 'Part1-2', 'Modus_Operandi', 'Victim_Sex', 'Victim_Descent', 'Premise_Description', 'Weapon_Description', 'Status', 'Status_Description']


In [None]:
def perform_eda1(df, target_col):
    """Print ``describe()`` summaries for the numerical and categorical features.

    The column groups come from the notebook-level ``numerical_cols`` and
    ``categorical_cols`` lists. ``target_col`` is not used by this function.
    """
    sections = (
        ("Summary Statistics for Numerical Features:", numerical_cols),
        ("\nSummary Statistics for Categorical Features:", categorical_cols),
    )
    for heading, columns in sections:
        print(heading)
        print(df[columns].describe())
def perform_eda2(df, target_col):
    """Plot a histogram with a KDE overlay for every numerical feature.

    One panel is drawn per column of the notebook-level ``numerical_cols``
    list, laid out in a three-column grid.

    Parameters
    ----------
    df : DataFrame containing every column named in ``numerical_cols``.
    target_col : not used by this function.
    """
    n_grid_cols = 3
    n_numerical = len(numerical_cols)
    # Ceiling division with a minimum of one row. The previous `n // 3 + 1`
    # reserved an empty extra row whenever the count was a multiple of three.
    n_grid_rows = max(1, -(-n_numerical // n_grid_cols))

    plt.figure(figsize=(20, 5 * n_grid_rows))
    for i, col in enumerate(numerical_cols):
        plt.subplot(n_grid_rows, n_grid_cols, i + 1)
        sns.histplot(df[col], kde=True)
        plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.show()
def perform_eda3(df, target_col):
    """Draw a count plot for every categorical feature.

    One panel is drawn per column of the notebook-level ``categorical_cols``
    list, laid out in a three-column grid with x tick labels rotated 90 degrees.

    Parameters
    ----------
    df : DataFrame containing every column named in ``categorical_cols``.
    target_col : not used by this function.
    """
    n_grid_cols = 3
    n_categorical = len(categorical_cols)
    # Ceiling division with a minimum of one row. The previous `n // 3 + 1`
    # reserved an empty extra row whenever the count was a multiple of three.
    n_grid_rows = max(1, -(-n_categorical // n_grid_cols))

    plt.figure(figsize=(20, 5 * n_grid_rows))
    for i, col in enumerate(categorical_cols):
        plt.subplot(n_grid_rows, n_grid_cols, i + 1)
        sns.countplot(x=col, data=df)
        plt.title(f'Count of {col}')
        plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
def perform_eda4(df, target_col):
    """Show an annotated heatmap of the correlations between numerical features.

    The columns come from the notebook-level ``numerical_cols`` list.
    ``target_col`` is not used by this function.
    """
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        df[numerical_cols].corr(),
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
    )
    plt.title('Correlation Matrix')
    plt.show()
def perform_eda5(df, target_col):
    """Scatter-plot every numerical feature against the target column.

    One panel is drawn per column of the notebook-level ``numerical_cols``
    list, laid out in a three-column grid.

    Parameters
    ----------
    df : DataFrame containing ``target_col`` and every column in ``numerical_cols``.
    target_col : name of the column plotted on the y axis.
    """
    n_grid_cols = 3
    n_numerical = len(numerical_cols)
    # Ceiling division with a minimum of one row. The previous `n // 3 + 1`
    # reserved an empty extra row whenever the count was a multiple of three.
    n_grid_rows = max(1, -(-n_numerical // n_grid_cols))

    plt.figure(figsize=(20, 5 * n_grid_rows))
    for i, col in enumerate(numerical_cols):
        plt.subplot(n_grid_rows, n_grid_cols, i + 1)
        sns.scatterplot(x=col, y=target_col, data=df)
        plt.title(f'{col} vs {target_col}')
    plt.tight_layout()
    plt.show()
def perform_eda6(df, target_col):
    """Draw count plots of every categorical feature, split by the target column.

    One panel is drawn per column of the notebook-level ``categorical_cols``
    list, laid out in a three-column grid with x tick labels rotated 90 degrees.

    Parameters
    ----------
    df : DataFrame containing ``target_col`` and every column in ``categorical_cols``.
    target_col : name of the column used as the ``hue`` grouping.
    """
    n_grid_cols = 3
    n_categorical = len(categorical_cols)
    # Ceiling division with a minimum of one row. The previous `n // 3 + 1`
    # reserved an empty extra row whenever the count was a multiple of three.
    n_grid_rows = max(1, -(-n_categorical // n_grid_cols))

    plt.figure(figsize=(20, 5 * n_grid_rows))
    for i, col in enumerate(categorical_cols):
        plt.subplot(n_grid_rows, n_grid_cols, i + 1)
        sns.countplot(x=col, hue=target_col, data=df)
        plt.title(f'{col} vs {target_col}')
        plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
'''
# Call the function
perform_eda1(train_df, 'Crime_Category')
'''

"\n# Call the function\nperform_eda1(train_df, 'Crime_Category')\n"

In [None]:
'''
# Distribution Of Numerical Features
perform_eda2(train_df, 'Crime_Category')
'''

"\n# Distribution Of Numerical Features \nperform_eda2(train_df, 'Crime_Category')\n"

### Distribution of Numerical Features

#### Distribution of Latitude
- **Observation**: The majority of latitude values are concentrated between 34 and 35.
- **Insight**: This suggests that most of the crimes are occurring within a narrow range of latitude, indicating a specific geographic concentration.

#### Distribution of Longitude
- **Observation**: The majority of longitude values are concentrated around -118.
- **Insight**: Similar to latitude, this indicates that the crimes are geographically concentrated in a specific region.

#### Distribution of Time_Occurred
- **Observation**: The distribution of the time occurred is relatively uniform, with some peaks around early morning (0-600), noon (1200-1400), and late evening (1800-2400).
- **Insight**: This could indicate that crimes occur throughout the day but have specific peaks, possibly correlating with times when people are more active or when there is less law enforcement presence.

#### Distribution of Area_ID
- **Observation**: The distribution is fairly uniform across different area IDs, with a few areas showing slightly higher crime counts.
- **Insight**: This suggests that crimes are spread out across various geographic areas, with certain areas experiencing higher crime rates.

#### Distribution of Reporting_District_no
- **Observation**: The distribution shows a somewhat uniform spread with some fluctuations.
- **Insight**: This indicates that the reporting of crimes is distributed across different districts, with no single district overwhelmingly reporting more crimes.

#### Distribution of Victim_Age
- **Observation**: The distribution is right-skewed, with a large number of victims being very young (age 0) and another peak around age 20-30.
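- **Insight**: This could suggest that a significant number of victims are very young (possibly indicating child-related crimes). However, in the sample rows shown earlier an age of 0 appears together with an unknown ("X") or missing victim sex and descent, so it more likely marks an unrecorded age; this should be verified before drawing conclusions. The other large group of victims are young adults.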

#### Distribution of Premise_Code
- **Observation**: There are several distinct peaks in the distribution, indicating the presence of certain types of premises where crimes are more common.
- **Insight**: Certain premise types are more frequently associated with crimes. This information can be used to identify high-risk locations.

#### Distribution of Weapon_Used_Code
- **Observation**: The distribution shows distinct peaks at specific weapon codes.
- **Insight**: Certain types of weapons are more commonly used in crimes. This information is crucial for law enforcement agencies as it highlights the types of weapons that are more prevalent in criminal activities. Focused efforts on controlling these specific types of weapons could potentially reduce crime rates. Additionally, this data could inform public safety campaigns and policy decisions regarding weapon regulation.


In [None]:
'''
# Count Plots For Categorical Features
perform_eda3(train_df, 'Crime_Category')
'''

"\n# Count Plots For Categorical Features \nperform_eda3(train_df, 'Crime_Category')\n"

### Count Plots for Categorical Columns

#### 1. Count of Location
- **Observation**: The plot is highly dense and difficult to read due to the large number of unique locations.
- **Insight**: The data contains a vast number of unique locations where crimes have occurred. This indicates that crimes are widely distributed across many different locations, which could make it challenging to target specific areas for crime prevention.

#### 2. Count of Cross_Street
- **Observation**: Similar to the "Location" column, the "Cross_Street" column also has a large number of unique values.
- **Insight**: This suggests that crimes are spread across various cross streets, further indicating the widespread nature of criminal activities across the city.

#### 3. Count of Area_Name
- **Observation**: Some areas, like "N Hollywood" and "Newton," have higher crime counts compared to others.
- **Insight**: Certain areas experience more crimes than others, which could help in focusing law enforcement efforts on high-crime areas to improve safety and allocate resources more efficiently.

#### 4. Count of Part1-2
- **Observation**: The counts for Part 1 and Part 2 crimes show that Part 1 crimes are more frequent.
- **Insight**: Part 1 crimes (likely more serious offenses) are more common than Part 2 crimes. This distinction can help in understanding the severity and type of crimes occurring in the city.

#### 5. Count of Modus_Operandi
- **Observation**: The plot is very dense and difficult to interpret due to the large number of unique values.
- **Insight**: There is a wide variety of methods used in committing crimes, indicating the diverse nature of criminal activities. This could be useful for profiling and understanding different criminal behaviors.

#### 6. Count of Victim_Sex
- **Observation**: Males and females are the most common victims, with a small number of unknown or unspecified sexes.
- **Insight**: Both males and females are victims of crimes, and the distribution between them is fairly balanced. This suggests that crime prevention efforts need to address the safety of all genders.

#### 7. Count of Victim_Descent
- **Observation**: Certain descent groups, like "W" (White) and "H" (Hispanic), have higher counts compared to others.
- **Insight**: Specific demographic groups are more frequently victims of crimes. This information can help in tailoring community outreach and crime prevention programs to protect vulnerable populations.

#### 8. Count of Premise_Description
- **Observation**: The plot is highly dense with a large number of unique premises.
- **Insight**: Crimes occur in a wide variety of premises, indicating that no single type of location is immune to criminal activities. This diversity necessitates a broad approach to crime prevention across different types of premises.

#### 9. Count of Weapon_Description
- **Observation**: Some weapons, like "Strong-Arm" are more commonly used.
- **Insight**: Certain weapons are frequently used in crimes. Focusing on controlling these weapons could help reduce crime rates.

#### 10. Count of Status
- **Observation**: Most cases are marked as "IC" (Investigation Continuing).
- **Insight**: A large number of cases remain under investigation, indicating a potential backlog in case processing or challenges in solving crimes.

#### 11. Count of Status_Description
- **Observation**: Similar to "Status," most cases fall under "Invest Cont" (Investigation Continuing).
- **Insight**: There is a significant proportion of cases that are still being investigated, which may point to resource constraints or complexities in solving crimes.


In [None]:
'''
#Correlation matrix for Numerical Features
perform_eda4(train_df, 'Crime_Category')
'''

"\n#Correlation matrix for Numerical Features \nperform_eda4(train_df, 'Crime_Category')\n"

### Correlation Matrix for Numerical Features

- **Latitude and Longitude**:
  - **Observation**: There is a strong negative correlation between latitude and longitude (-1.00).
  - **Insight**: This is expected due to the geographic layout, as changes in latitude and longitude are inherently linked.

- **Time_Occurred**:
  - **Observation**: There are very low correlations between `Time_Occurred` and all other features.
  - **Insight**: The time at which a crime occurs does not seem to have a strong relationship with other numerical features.

- **Area_ID and Reporting_District_no**:
  - **Observation**: These features show a perfect correlation (1.00) with each other.
  - **Insight**: This indicates that `Area_ID` and `Reporting_District_no` might be representing the same information. One of these columns could be redundant.

- **Victim_Age**:
  - **Observation**: `Victim_Age` shows a low correlation with all other features, the highest being with `Premise_Code` (0.19).
  - **Insight**: The age of the victim does not strongly correlate with other numerical features, suggesting that it might provide unique information.

- **Premise_Code and Weapon_Used_Code**:
  - **Observation**: There is a weak positive correlation (0.20) between `Premise_Code` and `Weapon_Used_Code`.
  - **Insight**: Certain premises might be associated with specific types of weapons used in crimes.

- **Area_ID, Reporting_District_no, and Premise_Code**:
  - **Observation**: `Area_ID` and `Reporting_District_no` have very low correlations with `Premise_Code` and other features.
  - **Insight**: The area and district in which crimes occur do not strongly correlate with other numerical features, suggesting a more complex interplay of factors influencing crime locations.

- **Weapon_Used_Code**:
  - **Observation**: `Weapon_Used_Code` shows a weak positive correlation with `Premise_Code` (0.20).
  - **Insight**: The type of weapon used might be somewhat related to the premise where the crime occurs, indicating that certain environments may be more prone to specific types of weapon usage.




In [None]:
'''
#Scatter Plots for Numerical Features against Crime_Category
perform_eda5(train_df, 'Crime_Category')
'''

"\n#Scatter Plots for Numerical Features against Crime_Category\nperform_eda5(train_df, 'Crime_Category')\n"

### Scatter Plots of Numerical Columns Against Crime_Category

#### Latitude vs Crime_Category
- **Observation**: Crimes are fairly evenly distributed across the range of latitude values.
- **Insight**: There is no strong relationship between latitude and the type of crime, indicating that crime categories are spread out across different latitudes.

#### Longitude vs Crime_Category
- **Observation**: Similar to latitude, crimes are evenly distributed across the range of longitude values.
- **Insight**: There is no strong relationship between longitude and the type of crime, suggesting that crime categories are spread out across different longitudes.

#### Time_Occurred vs Crime_Category
- **Observation**: Crimes occur at various times of the day, with no clear pattern linking specific times to crime categories.
- **Insight**: Time of occurrence does not strongly differentiate between crime categories, indicating that crimes of all types can happen at any time.

#### Area_ID vs Crime_Category
- **Observation**: Crimes are evenly distributed across different Area_IDs.
- **Insight**: There is no strong relationship between Area_ID and the type of crime, suggesting that crime categories are spread across various geographic areas.

#### Reporting_District_no vs Crime_Category
- **Observation**: Crimes are distributed across different reporting districts, with no clear pattern linking specific districts to crime categories.
- **Insight**: Reporting district numbers do not show a strong relationship with crime categories, indicating that crimes of all types occur across different districts.

#### Victim_Age vs Crime_Category
- **Observation**: Victim age does not show a strong relationship with crime categories, though some categories like "Crimes against Persons" have younger victims.
- **Insight**: Most crime categories have a wide range of victim ages, with no particular age group being exclusively targeted for specific crime types.

#### Premise_Code vs Crime_Category
- **Observation**: Different crime categories are distributed across various premises, with no clear pattern.
- **Insight**: The type of premise does not strongly differentiate between crime categories, indicating that crimes of all types occur in various locations.

#### Weapon_Used_Code vs Crime_Category
- **Observation**: Different types of crimes involve a variety of weapons, with no clear pattern linking specific weapons to crime categories.
- **Insight**: The type of weapon used does not strongly differentiate between crime categories, suggesting that crimes of all types involve various weapons.



In [None]:
# Disabled cell: the call below is wrapped in a triple-quoted string, so nothing
# is plotted and the cell only evaluates to (and displays) that string.
# NOTE(review): perform_eda6 is defined outside this excerpt; going by the comment
# inside the string it draws count plots of the categorical columns against
# Crime_Category -- confirm against its definition before re-enabling.
'''
#Count Plots of Categorical Columns against Crime_Category
perform_eda6(train_df, 'Crime_Category')
'''

"\n#Count Plots of Categorical Columns against Crime_Category\nperform_eda6(train_df, 'Crime_Category')\n"

### Count Plots of Categorical Columns Against Crime_Category

#### Location vs Crime_Category
- **Observation**: The distribution of crimes across different locations is very dense and appears uniform.
- **Insight**: This suggests that crimes of all types are spread across many locations, making it difficult to identify specific locations associated with particular crime categories.

#### Cross_Street vs Crime_Category
- **Observation**: Similar to the Location feature, crimes are spread across many cross streets with no clear pattern.
- **Insight**: Crimes are not concentrated on specific cross streets, indicating that the cross street information might not be a strong differentiator for crime categories.

#### Area_Name vs Crime_Category
- **Observation**: Different area names show some variation in the count of crimes, with some areas having higher crime counts.
- **Insight**: Certain areas, such as "N Hollywood" and "Newton", have higher counts of certain crime categories, which could indicate areas with higher crime rates.

#### Part1-2 vs Crime_Category
- **Observation**: Part 1 crimes are more frequent than Part 2 crimes across all crime categories.
- **Insight**: This indicates that Part 1 crimes, which are generally more severe, are more prevalent in the dataset.

#### Modus_Operandi vs Crime_Category
- **Observation**: The modus operandi feature shows a very dense distribution with no clear pattern.
- **Insight**: Modus operandi does not provide a clear differentiation between crime categories, suggesting that it might not be a strong predictor.

#### Victim_Sex vs Crime_Category
- **Observation**: Males (M) are more frequently victims across most crime categories, followed by females (F).
- **Insight**: This suggests that males are more often victims of crime, which could inform targeted prevention efforts.

#### Victim_Descent vs Crime_Category
- **Observation**: Certain descents, such as White (W) and Hispanic (H), show higher counts across various crime categories.
- **Insight**: This indicates that crimes are more frequently reported among certain descent groups, which could reflect demographic patterns or reporting biases.

#### Premise_Description vs Crime_Category
- **Observation**: Crimes occur across a wide variety of premises, with some premises showing higher counts for certain crime categories.
- **Insight**: Certain premises, like "STREET" and "SINGLE FAMILY DWELLING", have higher crime counts, suggesting that these locations are more prone to certain types of crimes.

#### Weapon_Description vs Crime_Category
- **Observation**: Certain weapons, like "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)", are more commonly associated with specific crime categories.
- **Insight**: This highlights the prevalence of certain weapons in specific crimes, which could inform law enforcement and public safety strategies.

#### Status vs Crime_Category
- **Observation**: The majority of cases are "Invest Cont", indicating that most crimes are under investigation.
- **Insight**: The status of cases provides information on the progress and resolution of crime investigations.

#### Status_Description vs Crime_Category
- **Observation**: Similar to Status, most crimes are marked as "Invest Cont" across all categories.
- **Insight**: This suggests that most cases remain under investigation, indicating a need for resources to resolve these cases.



<h2 style="color:purple;">Data Splitting</h2>

In [None]:

# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it separates the Crime_Category target from the features and
# makes an 80/20 train/validation split with a fixed random_state of 42.
# NOTE(review): no `stratify=` argument is passed, so class proportions in the
# validation set are left to chance -- consider stratify=y if classes are imbalanced.
'''
# Separate the features (X) from the target variable (y)
X = train_df.drop(columns=['Crime_Category'])
y = train_df['Crime_Category']
# Split the data into training and validation sets
# 80% of the data is used for training, 20% is used for validation
# random_state is set to 42 to ensure reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
'''





"\n# Separate the features (X) from the target variable (y)\nX = train_df.drop(columns=['Crime_Category'])\ny = train_df['Crime_Category']\n# Split the data into training and validation sets\n# 80% of the data is used for training, 20% is used for validation\n# random_state is set to 42 to ensure reproducibility\nX_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n"

<h2 style="color:purple;">Data Preprocessing</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it fills missing numerical values with the column mean. The
# imputer is fitted on X_train only and then applied unchanged to X_val and
# test_df, so no validation/test statistics leak into the fit.
# NOTE(review): numerical_cols is defined outside this excerpt. The assignments
# write into X_train / X_val / test_df in place, so re-running the cell operates
# on already-imputed data.
'''
# Impute missing values for numerical columns
num_imputer = SimpleImputer(strategy='mean')
X_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = num_imputer.transform(X_val[numerical_cols])
test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])
'''


"\n# Impute missing values for numerical columns\nnum_imputer = SimpleImputer(strategy='mean')\nX_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])\nX_val[numerical_cols] = num_imputer.transform(X_val[numerical_cols])\ntest_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])\n"

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it fills missing categorical values with the most frequent
# value of each column. The imputer is fitted on X_train only and then applied
# to X_val and test_df.
# NOTE(review): categorical_cols is defined outside this excerpt.
'''
# Impute missing values for categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])
X_val[categorical_cols] = cat_imputer.transform(X_val[categorical_cols])
test_df[categorical_cols] = cat_imputer.transform(test_df[categorical_cols])
'''


"\n# Impute missing values for categorical columns\ncat_imputer = SimpleImputer(strategy='most_frequent')\nX_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])\nX_val[categorical_cols] = cat_imputer.transform(X_val[categorical_cols])\ntest_df[categorical_cols] = cat_imputer.transform(test_df[categorical_cols])\n"

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it performs the categorical encoding in four steps:
#   1. frequency_encoding(column, df, df_val, df_test) adds a '<column>_freq'
#      feature holding each value's relative frequency in `df` (the training
#      frame) and applies the same mapping to the validation and test frames.
#      It mutates the frames it is given and also returns them.
#   2. The five high-cardinality columns are frequency-encoded; the remaining six
#      categorical columns are ordinal-encoded, with unseen categories mapped to -1.
#   3. The original high-cardinality columns are dropped.
#   4. A final most-frequent imputer (fitted on X_train) fills remaining gaps.
# NOTE(review): Series.map yields NaN for values that never occur in the training
# frame, so unseen categories in X_val / test_df are filled in step 4 with the
# most frequent training frequency rather than with 0 -- confirm this is intended.
# NOTE(review): step 4 rebuilds each frame with pd.DataFrame(array, columns=...),
# which replaces the original row index with a default 0..n-1 index while
# y_train / y_val keep theirs.
'''
def frequency_encoding(column, df, df_val=None, df_test=None):
    freq_encoding = df[column].value_counts() / len(df)
    df[column + '_freq'] = df[column].map(freq_encoding)
    if df_val is not None:
        df_val[column + '_freq'] = df_val[column].map(freq_encoding)
    if df_test is not None:
        df_test[column + '_freq'] = df_test[column].map(freq_encoding)
    return df, df_val, df_test
high_cardinality_cols = ['Location', 'Cross_Street', 'Modus_Operandi', 'Premise_Description', 'Weapon_Description']
ordinal_encoder_cols = ['Area_Name', 'Part1-2', 'Victim_Sex', 'Victim_Descent', 'Status', 'Status_Description']
# Apply Frequency Encoding for high cardinality columns
for col in high_cardinality_cols:
    X_train, X_val, test_df = frequency_encoding(col, X_train, X_val, test_df)

# Apply Ordinal Encoder for other categorical columns
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[ordinal_encoder_cols] = oe.fit_transform(X_train[ordinal_encoder_cols])
X_val[ordinal_encoder_cols] = oe.transform(X_val[ordinal_encoder_cols])
test_df[ordinal_encoder_cols] = oe.transform(test_df[ordinal_encoder_cols])

# Drop the original high cardinality columns as we have their frequency encoded versions
X_train = X_train.drop(columns=high_cardinality_cols)
X_val = X_val.drop(columns=high_cardinality_cols)
test_df = test_df.drop(columns=high_cardinality_cols)

# Use SimpleImputer to fill any remaining missing values
imputer = SimpleImputer(strategy='most_frequent')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)
'''


"\ndef frequency_encoding(column, df, df_val=None, df_test=None):\n    freq_encoding = df[column].value_counts() / len(df)\n    df[column + '_freq'] = df[column].map(freq_encoding)\n    if df_val is not None:\n        df_val[column + '_freq'] = df_val[column].map(freq_encoding)\n    if df_test is not None:\n        df_test[column + '_freq'] = df_test[column].map(freq_encoding)\n    return df, df_val, df_test\nhigh_cardinality_cols = ['Location', 'Cross_Street', 'Modus_Operandi', 'Premise_Description', 'Weapon_Description']\nordinal_encoder_cols = ['Area_Name', 'Part1-2', 'Victim_Sex', 'Victim_Descent', 'Status', 'Status_Description']\n# Apply Frequency Encoding for high cardinality columns\nfor col in high_cardinality_cols:\n    X_train, X_val, test_df = frequency_encoding(col, X_train, X_val, test_df)\n\n# Apply Ordinal Encoder for other categorical columns\noe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)\nX_train[ordinal_encoder_cols] = oe.fit_transform(X_tr

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it (a) standardises the numerical columns with a StandardScaler
# fitted on X_train and applied to X_val and test_df, and (b) label-encodes the
# target into integers (y_train_encoded / y_val_encoded) with an encoder fitted
# on y_train.
# NOTE(review): only numerical_cols are scaled; the ordinal- and
# frequency-encoded columns produced earlier keep their raw ranges.
'''
# Scale numerical data
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

#Encode the target column
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
'''


'\n# Scale numerical data\nscaler = StandardScaler()\nX_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])\nX_val[numerical_cols] = scaler.transform(X_val[numerical_cols])\ntest_df[numerical_cols] = scaler.transform(test_df[numerical_cols])\n\n#Encode the target column\nlabel_encoder = LabelEncoder() \ny_train_encoded = label_encoder.fit_transform(y_train) \ny_val_encoded = label_encoder.transform(y_val)\n'

<h2 style="color:purple;">Correlation of Features With Target Variable</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it appends the label-encoded target to a copy of X_train,
# computes the pairwise correlation matrix, and prints each feature's
# correlation with the target sorted from highest to lowest.
# NOTE(review): Crime_Category_Code is an integer code assigned by LabelEncoder
# to a categorical label, so these coefficients depend on the code ordering and
# should be read as a rough screen only.
'''
# Create a copy of X_train and add the encoded target column
X_train_copy = X_train.copy()
X_train_copy['Crime_Category_Code'] = y_train_encoded
# Calculate the correlation matrix including the encoded target column
correlation_matrix = X_train_copy.corr()
# Extract correlations with the encoded target column
correlation_with_target = correlation_matrix['Crime_Category_Code'].sort_values(ascending=False)
print("Correlation of all columns with the target column:")
print(correlation_with_target)
'''

'\n# Create a copy of X_train and add the encoded target column\nX_train_copy = X_train.copy()\nX_train_copy[\'Crime_Category_Code\'] = y_train_encoded\n# Calculate the correlation matrix including the encoded target column\ncorrelation_matrix = X_train_copy.corr()\n# Extract correlations with the encoded target column\ncorrelation_with_target = correlation_matrix[\'Crime_Category_Code\'].sort_values(ascending=False)\nprint("Correlation of all columns with the target column:")\nprint(correlation_with_target)\n'

<h3 style="color:purple;">Insights from the Correlation Analysis</h3>

* Highly Correlated Features:

 None of the features have a very high correlation with the target column Crime_Category_Code. The highest positive correlation is with Cross_Street (0.075), which is relatively low.

* Moderately Correlated Features:

 Cross_Street (0.075), Location (0.063), and Time_Occurred (0.041) show the highest positive correlations with Crime_Category_Code, although all three are weak in absolute terms. Premise_Code (-0.180) and Part1-2 (-0.264) have the largest correlations in magnitude, showing moderate negative correlations with the target variable.

* Weakly Correlated Features:

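 Most features show very weak correlations (close to 0) with the target column. This suggests that these features may not be strong linear predictors of the crime category. Note, however, that Crime_Category_Code is an integer encoding of a categorical label, so the coefficients depend on how the categories happen to be numbered; a weak correlation does not rule out a feature being useful to non-linear models such as the tree-based classifiers used later in this notebook.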


<h2 style="color:purple;">Baseline Model</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it fits a DummyClassifier that always predicts the most frequent
# training class and prints its accuracy on the validation set. This is the
# floor every real model below has to beat; the author recorded 0.57575.
'''
# Initialize the DummyClassifier with the strategy to always predict the most frequent class
dummy_clf = DummyClassifier(strategy="most_frequent")

# Fit the DummyClassifier on the training data
dummy_clf.fit(X_train, y_train)
# Make predictions on the validation set using the fitted DummyClassifier
y_pred = dummy_clf.predict(X_val)
# Calculate the accuracy of the DummyClassifier on the validation set
accuracy = accuracy_score(y_val, y_pred)

# Print the accuracy of the DummyClassifier
print(f"DummyClassifier Accuracy: {accuracy}")
# The accuracy_score on Validation set is 0.57575
'''

'\n# Initialize the DummyClassifier with the strategy to always predict the most frequent class\ndummy_clf = DummyClassifier(strategy="most_frequent")\n\n# Fit the DummyClassifier on the training data\ndummy_clf.fit(X_train, y_train)\n# Make predictions on the validation set using the fitted DummyClassifier\ny_pred = dummy_clf.predict(X_val)\n# Calculate the accuracy of the DummyClassifier on the validation set\naccuracy = accuracy_score(y_val, y_pred)\n\n# Print the accuracy of the DummyClassifier\nprint(f"DummyClassifier Accuracy: {accuracy}")\n# The accuracy_score on Validation set is 0.57575\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it predicts the test set with the baseline DummyClassifier and
# writes the result to 'submission.csv' with columns ID and Crime_Category.
# NOTE(review): sample.head() is not the last expression of the cell, so its
# result would not be displayed; the sample file is otherwise unused here.
# NOTE(review): IDs are generated as 1..n in row order rather than read from the
# sample/test file -- confirm this matches the expected submission IDs.
'''
# Make predictions on the test set using the fitted DummyClassifier
test_predictions = dummy_clf.predict(test_df)
# load the sample submission file to check the format of submission
sample = pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv")
sample.head()
# create the submission dataframe of desired format
submission = pd.DataFrame(columns=['ID','Crime_Category'])
# fill in the predictions made in the submission file
submission['ID'] = [i+1 for i in range(len(test_predictions))]
submission['Crime_Category'] = test_predictions
# create a csv file for submission
# first submission,base model,dummy classifier
submission.to_csv('submission.csv',index=False)
'''

'\n# Make predictions on the test set using the fitted DummyClassifier\ntest_predictions = dummy_clf.predict(test_df)\n# load the sample submission file to check the format of submission\nsample = pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv")\nsample.head()\n# create the submission dataframe of desired format\nsubmission = pd.DataFrame(columns=[\'ID\',\'Crime_Category\'])\n# fill in the predictions made in the submission file \nsubmission[\'ID\'] = [i+1 for i in range(len(test_predictions))]\nsubmission[\'Crime_Category\'] = test_predictions\n# create a csv file for submission\n# first submission,base model,dummy classifier \nsubmission.to_csv(\'submission.csv\',index=False)\n'

<h2 style="color:purple;">Logistic Regression Classifier</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it tunes a liblinear LogisticRegression with 5-fold
# cross-validated grid search over C in {0.1, 1, 10, 100} and penalty in
# {l1, l2} (8 candidates), scored on accuracy, and prints the best combination.
# The author recorded the winner as C=100 with an l2 penalty.
'''
#Initialise the LogisticRegression model.
model = LogisticRegression(max_iter=10000, solver='liblinear')
# Set up hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Norm used in penalization
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
# Fit the model with cross-validation
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)
#Best parameters for LogisticRegression are: {'C': 100, 'penalty': 'l2'}
'''

'\n#Initialise the LogisticRegression model.\nmodel = LogisticRegression(max_iter=10000, solver=\'liblinear\')\n# Set up hyperparameter tuning with GridSearchCV\nparam_grid = {\n    \'C\': [0.1, 1, 10, 100],  # Regularization strength\n    \'penalty\': [\'l1\', \'l2\'],  # Norm used in penalization\n}\ngrid_search = GridSearchCV(model, param_grid, cv=5, scoring=\'accuracy\')\n# Fit the model with cross-validation\ngrid_search.fit(X_train, y_train)\nbest_params = grid_search.best_params_\nprint("Best hyperparameters:", best_params)\n#Best parameters for LogisticRegression are: {\'C\': 100, \'penalty\': \'l2\'}\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it refits LogisticRegression with hard-coded hyperparameters
# (C=100, l2 -- the values recorded from the grid search cell, not read from
# grid_search.best_params_) and prints validation accuracy plus weighted
# precision, recall and F1. Recorded results: accuracy 0.65325,
# precision 0.6034, recall 0.6532, F1 0.5891.
'''
best_model_logr = LogisticRegression(max_iter=10000, solver='liblinear',C=100,penalty='l2')
# Fit the best model on the training data
best_model_logr.fit(X_train, y_train)
# Make predictions on the validation set
y_pred_logr_val = best_model_logr.predict(X_val)
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_logr_val)
print(f"Accuracy: {accuracy}")
# The accuracy_score on validation set is 0.65325

#other metrics to understand performance
# Precision
precision_logr = precision_score(y_val, y_pred_logr_val, average='weighted')
print(f"Precision: {precision_logr:.4f}")
# Recall
recall_logr = recall_score(y_val, y_pred_logr_val, average='weighted')
print(f"Recall: {recall_logr:.4f}")
# F1 Score
f1_logr = f1_score(y_val, y_pred_logr_val, average='weighted')
print(f"F1 Score: {f1_logr:.4f}")
#Precision: 0.6034 Recall: 0.6532 F1 Score: 0.5891
'''

'\nbest_model_logr = LogisticRegression(max_iter=10000, solver=\'liblinear\',C=100,penalty=\'l2\')\n# Fit the best model on the training data\nbest_model_logr.fit(X_train, y_train)\n# Make predictions on the validation set\ny_pred_logr_val = best_model_logr.predict(X_val)\n# Calculate accuracy\naccuracy = accuracy_score(y_val, y_pred_logr_val)\nprint(f"Accuracy: {accuracy}")\n# The accuracy_score on validation set is 0.65325\n\n#other metrics to understand performance\n# Precision\nprecision_logr = precision_score(y_val, y_pred_logr_val, average=\'weighted\')\nprint(f"Precision: {precision_logr:.4f}")\n# Recall\nrecall_logr = recall_score(y_val, y_pred_logr_val, average=\'weighted\')\nprint(f"Recall: {recall_logr:.4f}")\n# F1 Score\nf1_logr = f1_score(y_val, y_pred_logr_val, average=\'weighted\')\nprint(f"F1 Score: {f1_logr:.4f}")\n#Precision: 0.6034 Recall: 0.6532 F1 Score: 0.5891\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it predicts the test set with the fitted logistic regression
# model and writes 'submission_log.csv' (columns ID and Crime_Category), with
# IDs generated as 1..n in row order.
'''
# Make predictions on the test set using the fitted Logistic regression
test_predictions_log = best_model_logr.predict(test_df)
# create the submission dataframe of desired format
submission_log = pd.DataFrame(columns=['ID','Crime_Category'])
# fill in the predictions made in the submission file
submission_log['ID'] = [i+1 for i in range(len(test_predictions_log))]
submission_log['Crime_Category'] = test_predictions_log
# create a csv file for submission
# second submission,logistic regression classifier
submission_log.to_csv('submission_log.csv',index=False)
'''

"\n# Make predictions on the test set using the fitted Logistic regression\ntest_predictions_log = best_model_logr.predict(test_df)\n# create the submission dataframe of desired format\nsubmission_log = pd.DataFrame(columns=['ID','Crime_Category'])\n# fill in the predictions made in the submission file \nsubmission_log['ID'] = [i+1 for i in range(len(test_predictions_log))]\nsubmission_log['Crime_Category'] = test_predictions_log\n# create a csv file for submission\n# second submission,logistic regression classifier \nsubmission_log.to_csv('submission_log.csv',index=False)\n"

<h2 style="color:purple;">Logistic Regression With Polynomial Features</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it expands the feature matrices with degree-2 interaction terms.
# With interaction_only=True and include_bias=False the output holds the original
# features plus pairwise products of distinct features (no squares, no constant
# column). The transformer is fitted on X_train and reused for X_val and test_df.
'''
# Initialize polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# Fit and transform the polynomial features for the training data
X_train_poly = poly.fit_transform(X_train)

# Transform the polynomial features for the validation and test data
X_val_poly = poly.transform(X_val)
test_df_poly = poly.transform(test_df)
'''


'\n# Initialize polynomial features\npoly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n# Fit and transform the polynomial features for the training data\nX_train_poly = poly.fit_transform(X_train)\n\n# Transform the polynomial features for the validation and test data\nX_val_poly = poly.transform(X_val)\ntest_df_poly = poly.transform(test_df)\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it repeats the logistic regression grid search (same 8-candidate
# grid, 5-fold CV, accuracy scoring) on the interaction-expanded features, this
# time in parallel (n_jobs=-1) with verbose progress output. The author recorded
# the winner as C=1 with an l1 penalty.
# NOTE(review): `model` and `param_grid` reuse the names from the earlier
# logistic regression cell and overwrite them.
'''
model = LogisticRegression(max_iter=10000, solver='liblinear')
# Set up hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Norm used in penalization
}
grid_search_poly = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
# Fit the model with cross-validation on polynomial features
grid_search_poly.fit(X_train_poly, y_train)
# Best parameters found by GridSearchCV
best_params_poly = grid_search_poly.best_params_
print("Best hyperparameters:", best_params_poly)
#Best Hyperparameters are: {'C': 1, 'penalty': 'l1'}
'''

'\nmodel = LogisticRegression(max_iter=10000, solver=\'liblinear\')\n# Set up hyperparameter tuning with GridSearchCV\nparam_grid = {\n    \'C\': [0.1, 1, 10, 100],  # Regularization strength\n    \'penalty\': [\'l1\', \'l2\'],  # Norm used in penalization\n}\ngrid_search_poly = GridSearchCV(model, param_grid, cv=5, scoring=\'accuracy\', verbose=3, n_jobs=-1)\n# Fit the model with cross-validation on polynomial features\ngrid_search_poly.fit(X_train_poly, y_train)\n# Best parameters found by GridSearchCV\nbest_params_poly = grid_search_poly.best_params_\nprint("Best hyperparameters:", best_params_poly)\n#Best Hyperparameters are: {\'C\': 1, \'penalty\': \'l1\'}\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it refits LogisticRegression on the interaction-expanded
# features with hard-coded hyperparameters (C=1, l1) and prints validation
# accuracy plus weighted precision, recall and F1. Recorded results:
# accuracy 0.74775, precision 0.7393, recall 0.7482, F1 0.7265.
# NOTE(review): the generic names y_pred_val and accuracy_val are assigned again
# by the XGBoost evaluation cell further down, so their values depend on which
# cell ran last.
'''
best_model = LogisticRegression(C=1, max_iter=10000, penalty='l1', solver='liblinear')
# Fit the best model on the training data with polynomial features
best_model.fit(X_train_poly, y_train)
# Make predictions on the validation set with polynomial features
y_pred_val = best_model.predict(X_val_poly)
accuracy_val = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val}")
#The acuuracy_score on validation set is 0.74775
#other metrics to understand performance
# Precision
precision_logr_poly = precision_score(y_val, y_pred_val, average='weighted')
print(f"Precision: {precision_logr_poly:.4f}")

# Recall
recall_logr_poly = recall_score(y_val, y_pred_val, average='weighted')
print(f"Recall: {recall_logr_poly:.4f}")

# F1 Score
f1_logr_poly = f1_score(y_val, y_pred_val, average='weighted')
print(f"F1 Score: {f1_logr_poly:.4f}")
#Precision: 0.7393 Recall: 0.7482 F1 Score: 0.7265
'''

'\nbest_model = LogisticRegression(C=1, max_iter=10000, penalty=\'l1\', solver=\'liblinear\')\n# Fit the best model on the training data with polynomial features\nbest_model.fit(X_train_poly, y_train)\n# Make predictions on the validation set with polynomial features\ny_pred_val = best_model.predict(X_val_poly)\naccuracy_val = accuracy_score(y_val, y_pred_val)\nprint(f"Validation Accuracy: {accuracy_val}")\n#The acuuracy_score on validation set is 0.74775\n#other metrics to understand performance\n# Precision\nprecision_logr_poly = precision_score(y_val, y_pred_val, average=\'weighted\')\nprint(f"Precision: {precision_logr_poly:.4f}")\n\n# Recall\nrecall_logr_poly = recall_score(y_val, y_pred_val, average=\'weighted\')\nprint(f"Recall: {recall_logr_poly:.4f}")\n\n# F1 Score\nf1_logr_poly = f1_score(y_val, y_pred_val, average=\'weighted\')\nprint(f"F1 Score: {f1_logr_poly:.4f}")\n#Precision: 0.7393 Recall: 0.7482 F1 Score: 0.7265\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it predicts the interaction-expanded test set with the
# polynomial logistic regression model and writes 'submission_log_p.csv'
# (columns ID and Crime_Category), with IDs generated as 1..n in row order.
'''
test_predictions_log_p = best_model.predict(test_df_poly)
# create the submission dataframe of desired format
submission_log_p = pd.DataFrame(columns=['ID','Crime_Category'])
# fill in the predictions made in the submission file
submission_log_p['ID'] = [i+1 for i in range(len(test_predictions_log_p))]
submission_log_p['Crime_Category'] = test_predictions_log_p
# create a csv file for submission
#Logistic regression classiefier with polynomial degrees
submission_log_p.to_csv('submission_log_p.csv',index=False)
'''

"\ntest_predictions_log_p = best_model.predict(test_df_poly)\n# create the submission dataframe of desired format\nsubmission_log_p = pd.DataFrame(columns=['ID','Crime_Category'])\n# fill in the predictions made in the submission file \nsubmission_log_p['ID'] = [i+1 for i in range(len(test_predictions_log_p))]\nsubmission_log_p['Crime_Category'] = test_predictions_log_p\n# create a csv file for submission\n#Logistic regression classiefier with polynomial degrees\nsubmission_log_p.to_csv('submission_log_p.csv',index=False)\n"

<h2 style="color:purple;">Random Forest Classifier</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it tunes a RandomForestClassifier (random_state=42) with 5-fold
# cross-validated grid search scored on accuracy. The grid has
# 3 x 4 x 3 x 3 = 108 candidates, i.e. 540 fits, run in parallel (n_jobs=-1).
# The author recorded the winner as max_depth=None, min_samples_leaf=1,
# min_samples_split=5, n_estimators=200.
'''
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
# Set up hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}
grid_search_rf = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
# Fit the model with cross-validation
grid_search_rf.fit(X_train, y_train)
# Best parameters found by GridSearchCV
best_params_rf = grid_search_rf.best_params_
print("Best hyperparameters:", best_params_rf)
#Best Hyperparameters are : {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
'''

'\n# Initialize the Random Forest model\nrf_model = RandomForestClassifier(random_state=42)\n# Set up hyperparameter tuning with GridSearchCV\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],  # Number of trees in the forest\n    \'max_depth\': [None, 10, 20, 30],  # Maximum depth of the tree\n    \'min_samples_split\': [2, 5, 10],  # Minimum number of samples required to split an internal node\n    \'min_samples_leaf\': [1, 2, 4]  # Minimum number of samples required to be at a leaf node\n} \ngrid_search_rf = GridSearchCV(rf_model, param_grid, cv=5, scoring=\'accuracy\', verbose=3, n_jobs=-1)\n# Fit the model with cross-validation\ngrid_search_rf.fit(X_train, y_train)\n# Best parameters found by GridSearchCV\nbest_params_rf = grid_search_rf.best_params_\nprint("Best hyperparameters:", best_params_rf)\n#Best Hyperparameters are : {\'max_depth\': None, \'min_samples_leaf\': 1, \'min_samples_split\': 5, \'n_estimators\': 200}\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it refits a RandomForestClassifier with the hard-coded
# hyperparameters recorded from the grid search cell and prints validation
# accuracy plus weighted precision, recall and F1. Recorded results:
# accuracy 0.875, precision 0.8687, recall 0.8750, F1 0.8676.
'''
best_rf_model = RandomForestClassifier(
    random_state=42,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200
)
# Fit the best model on the training data
best_rf_model.fit(X_train, y_train)
# Make predictions on the validation set
y_pred_val_rf = best_rf_model.predict(X_val)
# Calculate accuracy on the validation set
accuracy_val_rf = accuracy_score(y_val, y_pred_val_rf)
print(f"Validation Accuracy: {accuracy_val_rf}")
#Accuracy Score on validation set is : 0.875
#other metrics to understand performance
# Precision
precision_rf = precision_score(y_val, y_pred_val_rf, average='weighted')
print(f"Precision: {precision_rf:.4f}")

# Recall
recall_rf = recall_score(y_val, y_pred_val_rf, average='weighted')
print(f"Recall: {recall_rf:.4f}")

# F1 Score
f1_rf = f1_score(y_val, y_pred_val_rf, average='weighted')
print(f"F1 Score: {f1_rf:.4f}")
#Precision: 0.8687 Recall: 0.8750 F1 Score: 0.8676
'''

'\nbest_rf_model = RandomForestClassifier(\n    random_state=42,\n    max_depth=None,\n    min_samples_leaf=1,\n    min_samples_split=5,\n    n_estimators=200\n)\n# Fit the best model on the training data\nbest_rf_model.fit(X_train, y_train)\n# Make predictions on the validation set\ny_pred_val_rf = best_rf_model.predict(X_val)\n# Calculate accuracy on the validation set\naccuracy_val_rf = accuracy_score(y_val, y_pred_val_rf)\nprint(f"Validation Accuracy: {accuracy_val_rf}")\n#Accuracy Score on validation set is : 0.875\n#other metrics to understand performance\n# Precision\nprecision_rf = precision_score(y_val, y_pred_val_rf, average=\'weighted\')\nprint(f"Precision: {precision_rf:.4f}")\n\n# Recall\nrecall_rf = recall_score(y_val, y_pred_val_rf, average=\'weighted\')\nprint(f"Recall: {recall_rf:.4f}")\n\n# F1 Score\nf1_rf = f1_score(y_val, y_pred_val_rf, average=\'weighted\')\nprint(f"F1 Score: {f1_rf:.4f}")\n#Precision: 0.8687 Recall: 0.8750 F1 Score: 0.8676\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it predicts the test set with the fitted random forest and
# writes 'submission_rf_new.csv' (columns ID and Crime_Category), with IDs
# generated as 1..n in row order.
'''
# Make predictions on the test set
y_pred_test_rf = best_rf_model.predict(test_df)
#create the submission dataframe of desired format
submission_rf_new = pd.DataFrame(columns=['ID','Crime_Category'])
# fill in the predictions made in the submission file
submission_rf_new['ID'] = [i+1 for i in range(len(y_pred_test_rf))]
submission_rf_new['Crime_Category'] = y_pred_test_rf
# create a csv file for submission
#Random Forest Classifier
submission_rf_new.to_csv('submission_rf_new.csv',index=False)
'''

"\n# Make predictions on the test set\ny_pred_test_rf = best_rf_model.predict(test_df)\n#create the submission dataframe of desired format\nsubmission_rf_new = pd.DataFrame(columns=['ID','Crime_Category'])\n# fill in the predictions made in the submission file \nsubmission_rf_new['ID'] = [i+1 for i in range(len(y_pred_test_rf))]\nsubmission_rf_new['Crime_Category'] = y_pred_test_rf\n# create a csv file for submission\n#Random Forest Classifier\nsubmission_rf_new.to_csv('submission_rf_new.csv',index=False)\n"

<h2 style="color:purple;">XGBoost Model</h2>

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it tunes an XGBClassifier (random_state=42) with 5-fold
# cross-validated grid search scored on accuracy. The grid has 3^5 = 243
# candidates, i.e. 1215 fits, run in parallel (n_jobs=-1). The model is fitted
# on the integer-encoded target (y_train_encoded), unlike the scikit-learn
# models above, which are fitted on y_train.
# NOTE(review): `best_params` reuses the name from the logistic regression grid
# search cell and overwrites it.
'''
# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42)
# Set up hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}
grid_search_xgb = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
# Fit the model with cross-validation
grid_search_xgb.fit(X_train, y_train_encoded)
# Best parameters found by GridSearchCV
best_params = grid_search_xgb.best_params_
print("Best hyperparameters:", best_params)
#Best Hyperparameters : {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.9}
'''

'\n# Initialize the XGBoost classifier\nxgb_model = xgb.XGBClassifier(random_state=42)\n# Set up hyperparameter tuning with GridSearchCV\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [3, 5, 7],\n    \'learning_rate\': [0.01, 0.1, 0.2],\n    \'subsample\': [0.7, 0.8, 0.9],\n    \'colsample_bytree\': [0.7, 0.8, 0.9]\n}\ngrid_search_xgb = GridSearchCV(xgb_model, param_grid, cv=5, scoring=\'accuracy\', verbose=3, n_jobs=-1)\n# Fit the model with cross-validation\ngrid_search_xgb.fit(X_train, y_train_encoded)\n# Best parameters found by GridSearchCV\nbest_params = grid_search_xgb.best_params_\nprint("Best hyperparameters:", best_params)\n#Best Hyperparameters : {\'colsample_bytree\': 0.7, \'learning_rate\': 0.1, \'max_depth\': 7, \'n_estimators\': 100, \'subsample\': 0.9}\n'

In [None]:
# Disabled cell: the code is kept inside a string literal and does not run.
# If re-enabled it refits an XGBClassifier with the hard-coded hyperparameters
# recorded from the grid search cell, predicts the validation set, and decodes
# the integer predictions back to the original labels with label_encoder so
# they can be scored against y_val. It prints accuracy plus weighted precision,
# recall and F1. Recorded results: accuracy 0.86125, precision 0.8774,
# recall 0.8612, F1 0.8600.
# NOTE(review): y_pred_val and accuracy_val overwrite the variables of the same
# name set by the polynomial logistic regression evaluation cell.
'''
best_xgb_model = xgb.XGBClassifier(
    random_state=42,
    colsample_bytree=0.7,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=100,
    subsample=0.9
)
# Fit the best model on the training data
best_xgb_model.fit(X_train, y_train_encoded)
# Make predictions on the validation set
y_pred_val = best_xgb_model.predict(X_val)
# Decode the predictions back to original labels
y_pred_val_decoded = label_encoder.inverse_transform(y_pred_val)
# Calculate accuracy on the validation set
accuracy_val = accuracy_score(y_val, y_pred_val_decoded)
print(f"Validation Accuracy: {accuracy_val}")
#Accuracy Score on the validation set is : 0.86125
#other metrics to understand performance
# Precision
precision_xgb = precision_score(y_val, y_pred_val_decoded, average='weighted')
print(f"Precision: {precision_xgb:.4f}")

# Recall
recall_xgb = recall_score(y_val, y_pred_val_decoded, average='weighted')
print(f"Recall: {recall_xgb:.4f}")

# F1 Score
f1_xgb = f1_score(y_val, y_pred_val_decoded, average='weighted')
print(f"F1 Score: {f1_xgb:.4f}")
#Precision: 0.8774 Recall: 0.8612 F1 Score: 0.8600
'''


'\nbest_xgb_model = xgb.XGBClassifier(\n    random_state=42,\n    colsample_bytree=0.7,\n    learning_rate=0.1,\n    max_depth=7,\n    n_estimators=100,\n    subsample=0.9\n)\n# Fit the best model on the training data\nbest_xgb_model.fit(X_train, y_train_encoded)\n# Make predictions on the validation set\ny_pred_val = best_xgb_model.predict(X_val)\n# Decode the predictions back to original labels\ny_pred_val_decoded = label_encoder.inverse_transform(y_pred_val)\n# Calculate accuracy on the validation set\naccuracy_val = accuracy_score(y_val, y_pred_val_decoded)\nprint(f"Validation Accuracy: {accuracy_val}")\n#Accuracy Score on the validation set is : 0.86125\n#other metrics to understand performance\n# Precision\nprecision_xgb = precision_score(y_val, y_pred_val_decoded, average=\'weighted\')\nprint(f"Precision: {precision_xgb:.4f}")\n\n# Recall\nrecall_xgb = recall_score(y_val, y_pred_val_decoded, average=\'weighted\')\nprint(f"Recall: {recall_xgb:.4f}")\n\n# F1 Score\nf1_xgb = f1_sco

In [None]:

'''
# Make predictions on the test set
y_pred_test = best_xgb_model.predict(test_df)
# Decode the test predictions back to original labels
y_pred_test_decoded = label_encoder.inverse_transform(y_pred_test)
# create the submission dataframe of desired format
submission_xgb_new = pd.DataFrame(columns=['ID','Crime_Category'])
# fill in the predictions made in the submission file
submission_xgb_new['ID'] = [i+1 for i in range(len(y_pred_test_decoded))]
submission_xgb_new['Crime_Category'] = y_pred_test_decoded
# create a csv file for submission
submission_xgb_new.to_csv('submission_xgb_new.csv',index=False)
'''


"\n# Make predictions on the test set\ny_pred_test = best_xgb_model.predict(test_df)\n# Decode the test predictions back to original labels\ny_pred_test_decoded = label_encoder.inverse_transform(y_pred_test)\n# create the submission dataframe of desired format\nsubmission_xgb_new = pd.DataFrame(columns=['ID','Crime_Category'])\n# fill in the predictions made in the submission file \nsubmission_xgb_new['ID'] = [i+1 for i in range(len(y_pred_test_decoded))]\nsubmission_xgb_new['Crime_Category'] = y_pred_test_decoded\n# create a csv file for submission\nsubmission_xgb_new.to_csv('submission_xgb_new.csv',index=False)\n"

<h2 style="color:purple;">KNeighborsClassifier</h2>

In [None]:
'''
# Initialize the KNN classifier
knn = KNeighborsClassifier()
# Set up hyperparameter tuning with GridSearchCV
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)
# Fit the model with cross-validation
grid_search_knn.fit(X_train, y_train_encoded)
# Best parameters found by GridSearchCV
best_params_knn = grid_search_knn.best_params_
print("Best hyperparameters for KNN:", best_params_knn)
#Best hyperparameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
'''

'\n# Initialize the KNN classifier\nknn = KNeighborsClassifier()\n# Set up hyperparameter tuning with GridSearchCV\nparam_grid_knn = {\n    \'n_neighbors\': [3, 5, 7, 9],\n    \'weights\': [\'uniform\', \'distance\'],\n    \'metric\': [\'euclidean\', \'manhattan\', \'minkowski\']\n}\ngrid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring=\'accuracy\', verbose=3, n_jobs=-1)\n# Fit the model with cross-validation\ngrid_search_knn.fit(X_train, y_train_encoded)\n# Best parameters found by GridSearchCV\nbest_params_knn = grid_search_knn.best_params_\nprint("Best hyperparameters for KNN:", best_params_knn)\n#Best hyperparameters for KNN: {\'metric\': \'manhattan\', \'n_neighbors\': 9, \'weights\': \'distance\'}\n'

In [None]:
'''
best_knn_model = KNeighborsClassifier(metric="manhattan",n_neighbors = 9,weights = "distance")
best_knn_model.fit(X_train,y_train_encoded)
# Make predictions on the validation set
y_pred_val_knn = best_knn_model.predict(X_val)
# Decode the predictions back to original labels
y_pred_val_knn_decoded = label_encoder.inverse_transform(y_pred_val_knn)
# Calculate accuracy on the validation set
accuracy_val_knn = accuracy_score(y_val, y_pred_val_knn_decoded)
print(f"KNN Validation Accuracy: {accuracy_val_knn}")
#Accuracy Score on Validation set is : 0.49725
#other metrics to understand performance
# Precision
precision_knn = precision_score(y_val, y_pred_val_knn_decoded, average='weighted')
print(f"Precision: {precision_knn:.4f}")

# Recall
recall_knn = recall_score(y_val, y_pred_val_knn_decoded, average='weighted')
print(f"Recall: {recall_knn:.4f}")

# F1 Score
f1_knn = f1_score(y_val, y_pred_val_knn_decoded, average='weighted')
print(f"F1 Score: {f1_knn:.4f}")
#Precision: 0.7602 Recall: 0.4973 F1 Score: 0.5480
'''

'\nbest_knn_model = KNeighborsClassifier(metric="manhattan",n_neighbors = 9,weights = "distance")\nbest_knn_model.fit(X_train,y_train_encoded)\n# Make predictions on the validation set\ny_pred_val_knn = best_knn_model.predict(X_val)\n# Decode the predictions back to original labels\ny_pred_val_knn_decoded = label_encoder.inverse_transform(y_pred_val_knn)\n# Calculate accuracy on the validation set\naccuracy_val_knn = accuracy_score(y_val, y_pred_val_knn_decoded)\nprint(f"KNN Validation Accuracy: {accuracy_val_knn}")\n#Accuracy Score on Validation set is : 0.49725\n#other metrics to understand performance\n# Precision\nprecision_knn = precision_score(y_val, y_pred_val_knn_decoded, average=\'weighted\')\nprint(f"Precision: {precision_knn:.4f}")\n\n# Recall\nrecall_knn = recall_score(y_val, y_pred_val_knn_decoded, average=\'weighted\')\nprint(f"Recall: {recall_knn:.4f}")\n\n# F1 Score\nf1_knn = f1_score(y_val, y_pred_val_knn_decoded, average=\'weighted\')\nprint(f"F1 Score: {f1_knn:.4f}")

<h2 style="color:purple;">LightGBM</h2>

In [None]:
'''
# Initialize the LightGBM classifier
lgb_model = lgb.LGBMClassifier(random_state=42)
# Set up hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'num_leaves': randint(2**3, 2**10),
    'min_data_in_leaf': randint(10, 100),  # Ensuring there are enough data points in a leaf
    'min_gain_to_split': uniform(0.0, 0.2)  # Requiring a minimum gain for splits
}
random_search_lgb = RandomizedSearchCV(lgb_model, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', verbose=3, n_jobs=-1, random_state=42)
random_search_lgb.fit(X_train,y_train_encoded)
best_params_lgb = random_search_lgb.best_params_
#Best hyperparameters for LightGBM: {'colsample_bytree': 0.7110049608671793, 'learning_rate': 0.06048738886880416, 'max_depth': 9, 'min_data_in_leaf': 12, 'min_gain_to_split': 0.17904136753743988, 'n_estimators': 337, 'num_leaves': 370, 'subsample': 0.8596340455795947}
'''

"\n# Initialize the LightGBM classifier\nlgb_model = lgb.LGBMClassifier(random_state=42)\n# Set up hyperparameter tuning with RandomizedSearchCV\nparam_dist = {\n    'n_estimators': randint(100, 500),\n    'max_depth': randint(3, 10),\n    'learning_rate': uniform(0.01, 0.2),\n    'subsample': uniform(0.7, 0.3),\n    'colsample_bytree': uniform(0.7, 0.3),\n    'num_leaves': randint(2**3, 2**10),\n    'min_data_in_leaf': randint(10, 100),  # Ensuring there are enough data points in a leaf\n    'min_gain_to_split': uniform(0.0, 0.2)  # Requiring a minimum gain for splits\n}\nrandom_search_lgb = RandomizedSearchCV(lgb_model, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', verbose=3, n_jobs=-1, random_state=42)\nrandom_search_lgb.fit(X_train,y_train_encoded)\nbest_params_lgb = random_search_lgb.best_params_\n#Best hyperparameters for LightGBM: {'colsample_bytree': 0.7110049608671793, 'learning_rate': 0.06048738886880416, 'max_depth': 9, 'min_data_in_leaf': 12, 'min_ga

In [None]:
'''
# Initialize the LightGBM classifier with best parameters
best_params_lgb = {
    'colsample_bytree': 0.7110049608671793,
    'learning_rate': 0.06048738886880416,
    'max_depth': 9,
    'min_data_in_leaf': 12,
    'min_gain_to_split': 0.17904136753743988,
    'n_estimators': 337,
    'num_leaves': 370,
    'subsample': 0.8596340455795947
}
best_lgb_model = lgb.LGBMClassifier(random_state=42, **best_params_lgb,verbose=-1)
# Fit the best model on the training data
best_lgb_model.fit(X_train, y_train_encoded)
# Make predictions on the validation set
y_pred_val_lgb = best_lgb_model.predict(X_val)
# Decode the predictions back to original labels
y_pred_val_lgb_decoded = label_encoder.inverse_transform(y_pred_val_lgb)
# Calculate accuracy on the validation set
accuracy_val_lgb = accuracy_score(y_val, y_pred_val_lgb_decoded)
print(f"LightGBM Validation Accuracy: {accuracy_val_lgb}")
#Accuracy Score on the validation set is : 0.85825
# Precision
precision_lgbm = precision_score(y_val, y_pred_val_lgb_decoded, average='weighted')
print(f"Precision: {precision_lgbm:.4f}")

# Recall
recall_lgbm = recall_score(y_val, y_pred_val_lgb_decoded, average='weighted')
print(f"Recall: {recall_lgbm:.4f}")

# F1 Score
f1_lgbm = f1_score(y_val, y_pred_val_lgb_decoded, average='weighted')
print(f"F1 Score: {f1_lgbm:.4f}")
#Precision: 0.8759 Recall: 0.8582 F1 Score: 0.8566
'''


'\n# Initialize the LightGBM classifier with best parameters\nbest_params_lgb = {\n    \'colsample_bytree\': 0.7110049608671793,\n    \'learning_rate\': 0.06048738886880416,\n    \'max_depth\': 9,\n    \'min_data_in_leaf\': 12,\n    \'min_gain_to_split\': 0.17904136753743988,\n    \'n_estimators\': 337,\n    \'num_leaves\': 370,\n    \'subsample\': 0.8596340455795947\n}\nbest_lgb_model = lgb.LGBMClassifier(random_state=42, **best_params_lgb,verbose=-1)\n# Fit the best model on the training data\nbest_lgb_model.fit(X_train, y_train_encoded)\n# Make predictions on the validation set\ny_pred_val_lgb = best_lgb_model.predict(X_val)\n# Decode the predictions back to original labels\ny_pred_val_lgb_decoded = label_encoder.inverse_transform(y_pred_val_lgb)\n# Calculate accuracy on the validation set\naccuracy_val_lgb = accuracy_score(y_val, y_pred_val_lgb_decoded)\nprint(f"LightGBM Validation Accuracy: {accuracy_val_lgb}")\n#Accuracy Score on the validation set is : 0.85825\n# Precision\nprec

In [None]:
'''
y_pred_test_lgb = best_lgb_model.predict(test_df)
#Decode the test predictions back to original labels
y_pred_test_lgb_decoded = label_encoder.inverse_transform(y_pred_test_lgb)
#create the submission dataframe of desired format
submission_lgb_new = pd.DataFrame(columns=['ID','Crime_Category'])
#fill in the predictions made in the submission file
submission_lgb_new['ID'] = [i+1 for i in range(len(y_pred_test_lgb_decoded))]
submission_lgb_new['Crime_Category'] = y_pred_test_lgb_decoded
#create a csv file for submission
submission_lgb_new.to_csv('submission_lgb_new.csv',index=False)
'''


"\ny_pred_test_lgb = best_lgb_model.predict(test_df)\n#Decode the test predictions back to original labels\ny_pred_test_lgb_decoded = label_encoder.inverse_transform(y_pred_test_lgb)\n#create the submission dataframe of desired format\nsubmission_lgb_new = pd.DataFrame(columns=['ID','Crime_Category'])\n#fill in the predictions made in the submission file\nsubmission_lgb_new['ID'] = [i+1 for i in range(len(y_pred_test_lgb_decoded))] \nsubmission_lgb_new['Crime_Category'] = y_pred_test_lgb_decoded\n#create a csv file for submission\nsubmission_lgb_new.to_csv('submission_lgb_new.csv',index=False)\n"

<h2 style="color:purple;">Stacking Classifier</h2>

In [None]:
'''
# Initialize base models with the best parameters
xgb_model = xgb.XGBClassifier( random_state=42, colsample_bytree=0.7, learning_rate=0.1, max_depth=7, n_estimators=100, subsample=0.9 )
lgb_model = lgb.LGBMClassifier(random_state=42, colsample_bytree=0.7110049608671793, learning_rate=0.06048738886880416, max_depth=9, min_data_in_leaf=12, min_gain_to_split=0.17904136753743988, n_estimators=337, num_leaves=370, subsample=0.8596340455795947)
rf_model = RandomForestClassifier( random_state=42, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200 )

# Initialize Stacking Classifier with Logistic Regression as the meta-learner
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model)
    ],
    final_estimator=LogisticRegression()
)
# Fit the stacking model
stacking_model.fit(X_train, y_train_encoded)
# Make predictions on the validation set
y_pred_val_stacking = stacking_model.predict(X_val)
# Decode the predictions back to original labels
y_pred_val_stacking_decoded = label_encoder.inverse_transform(y_pred_val_stacking)
# Calculate accuracy on the validation set
accuracy_val_stacking = accuracy_score(y_val, y_pred_val_stacking_decoded)
print(f"Stacking Model Validation Accuracy: {accuracy_val_stacking}")
#Accuracy Score on Validation Set is : 0.85775

#other metrics to understand performance
# Precision
precision_stack = precision_score(y_val, y_pred_val_stacking_decoded, average='weighted')
print(f"Precision: {precision_stack:.4f}")

# Recall
recall_stack = recall_score(y_val, y_pred_val_stacking_decoded, average='weighted')
print(f"Recall: {recall_stack:.4f}")

# F1 Score
f1_stack = f1_score(y_val, y_pred_val_stacking_decoded, average='weighted')
print(f"F1 Score: {f1_stack:.4f}")
#Precision: 0.8722 Recall: 0.8602 F1 Score: 0.8578
'''

'\n# Initialize base models with the best parameters\nxgb_model = xgb.XGBClassifier( random_state=42, colsample_bytree=0.7, learning_rate=0.1, max_depth=7, n_estimators=100, subsample=0.9 )\nlgb_model = lgb.LGBMClassifier(random_state=42, colsample_bytree=0.7110049608671793, learning_rate=0.06048738886880416, max_depth=9, min_data_in_leaf=12, min_gain_to_split=0.17904136753743988, n_estimators=337, num_leaves=370, subsample=0.8596340455795947)\nrf_model = RandomForestClassifier( random_state=42, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200 )\n\n# Initialize Stacking Classifier with Logistic Regression as the meta-learner\nstacking_model = StackingClassifier(\n    estimators=[\n        (\'xgb\', xgb_model),\n        (\'lgb\', lgb_model),\n        (\'rf\', rf_model)\n    ],\n    final_estimator=LogisticRegression()\n)\n# Fit the stacking model\nstacking_model.fit(X_train, y_train_encoded)\n# Make predictions on the validation set\ny_pred_val_stacking = stacki

In [None]:
'''
#Make predictions on the test set
y_pred_test_stacking = stacking_model.predict(test_df)
#Decode the test predictions back to original labels
y_pred_test_decoded_stacking = label_encoder.inverse_transform(y_pred_test_stacking)
#create the submission dataframe of desired format
submission_stacking = pd.DataFrame(columns=['ID','Crime_Category'])
#fill in the predictions made in the submission file
submission_stacking['ID'] = [i+1 for i in range(len(y_pred_test_decoded_stacking))]
submission_stacking['Crime_Category'] = y_pred_test_decoded_stacking
#create a csv file for submission
submission_stacking.to_csv('submission_stacking.csv',index=False)
'''

"\n#Make predictions on the test set\ny_pred_test_stacking = stacking_model.predict(test_df)\n#Decode the test predictions back to original labels\ny_pred_test_decoded_stacking = label_encoder.inverse_transform(y_pred_test_stacking)\n#create the submission dataframe of desired format\nsubmission_stacking = pd.DataFrame(columns=['ID','Crime_Category'])\n#fill in the predictions made in the submission file\nsubmission_stacking['ID'] = [i+1 for i in range(len(y_pred_test_decoded_stacking))] \nsubmission_stacking['Crime_Category'] = y_pred_test_decoded_stacking\n#create a csv file for submission\nsubmission_stacking.to_csv('submission_stacking.csv',index=False)\n"

<h2 style="color:purple;">LGBM With OverSampling Using SMOTE</h2>

In [None]:
'''
#best_params_lgb = { 'colsample_bytree': 0.7110049608671793, 'learning_rate': 0.06048738886880416, 'max_depth': 9, 'min_data_in_leaf': 12, 'min_gain_to_split': 0.17904136753743988, 'n_estimators': 337, 'num_leaves': 370, 'subsample': 0.8596340455795947 }
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced',
                                colsample_bytree=0.7110049608671793,
                                learning_rate=0.06048738886880416,
                                max_depth=9,
                                min_data_in_leaf=12,
                                min_gain_to_split=0.17904136753743988,
                                n_estimators=337,
                                num_leaves=370,
                                subsample=0.8596340455795947)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_encoded)
lgb_model.fit(X_train_resampled, y_train_resampled)
y_pred_val_lgb = lgb_model.predict(X_val)
y_pred_val_lgb_decoded = label_encoder.inverse_transform(y_pred_val_lgb)
accuracy_val_lgb = accuracy_score(y_val, y_pred_val_lgb_decoded)
print(f"LightGBM Validation Accuracy with Class Weight Adjustment: {accuracy_val_lgb}")
#Accuracy Score on Validation Set is : 0.86

#other metrics to understand performance
# Precision
precision_smote = precision_score(y_val, y_pred_val_lgb_decoded, average='weighted')
print(f"Precision: {precision_smote:.4f}")

# Recall
recall_smote = recall_score(y_val, y_pred_val_lgb_decoded, average='weighted')
print(f"Recall: {recall_smote:.4f}")

# F1 Score
f1_smote = f1_score(y_val, y_pred_val_lgb_decoded, average='weighted')
print(f"F1 Score: {f1_smote:.4f}")
#Precision: 0.8803 Recall: 0.8600 F1 Score: 0.8586
'''

'\n#best_params_lgb = { \'colsample_bytree\': 0.7110049608671793, \'learning_rate\': 0.06048738886880416, \'max_depth\': 9, \'min_data_in_leaf\': 12, \'min_gain_to_split\': 0.17904136753743988, \'n_estimators\': 337, \'num_leaves\': 370, \'subsample\': 0.8596340455795947 }\nlgb_model = lgb.LGBMClassifier(random_state=42, class_weight=\'balanced\', \n                                colsample_bytree=0.7110049608671793, \n                                learning_rate=0.06048738886880416, \n                                max_depth=9, \n                                min_data_in_leaf=12, \n                                min_gain_to_split=0.17904136753743988, \n                                n_estimators=337, \n                                num_leaves=370, \n                                subsample=0.8596340455795947)\n\n# Apply SMOTE to the training data\nsmote = SMOTE(random_state=42)\nX_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_encoded)\nlgb_model.fit

In [None]:
'''
y_pred_test = lgb_model.predict(test_df)
y_pred_test_lgb_decoded = label_encoder.inverse_transform(y_pred_test)
submission_lgb_smote = pd.DataFrame(columns=['ID','Crime_Category'])
submission_lgb_smote['ID'] = [i+1 for i in range(len(y_pred_test_lgb_decoded))]
submission_lgb_smote['Crime_Category'] = y_pred_test_lgb_decoded
submission_lgb_smote.to_csv('submission_lgb_smote.csv',index=False)
'''

"\ny_pred_test = lgb_model.predict(test_df)\ny_pred_test_lgb_decoded = label_encoder.inverse_transform(y_pred_test)\nsubmission_lgb_smote = pd.DataFrame(columns=['ID','Crime_Category'])\nsubmission_lgb_smote['ID'] = [i+1 for i in range(len(y_pred_test_lgb_decoded))] \nsubmission_lgb_smote['Crime_Category'] = y_pred_test_lgb_decoded\nsubmission_lgb_smote.to_csv('submission_lgb_smote.csv',index=False)\n"

<h2 style="color:purple;">Support Vector Machine</h2>

In [None]:
'''
# Define SVM with specified parameters
svm_model_tuned = SVC(kernel='rbf', C=1, gamma='scale', random_state=42, class_weight='balanced')

# Fit the model on the original training data
svm_model_tuned.fit(X_train, y_train_encoded)

# Make predictions on the validation set
y_pred_val_svm_tuned = svm_model_tuned.predict(X_val)

# Decode the predictions back to original labels
y_pred_val_svm_tuned_decoded = label_encoder.inverse_transform(y_pred_val_svm_tuned)

# Calculate accuracy
accuracy_val_svm_tuned = accuracy_score(y_val, y_pred_val_svm_tuned_decoded)
print(f"SVM Validation Accuracy (Tuned): {accuracy_val_svm_tuned}")

#Accuracy Score on Validation set is : 0.5285

#other metrics to understand performance
# Precision
precision_svm = precision_score(y_val, y_pred_val_svm_tuned_decoded, average='weighted')
print(f"Precision: {precision_svm:.4f}")

# Recall
recall_svm = recall_score(y_val, y_pred_val_svm_tuned_decoded, average='weighted')
print(f"Recall: {recall_svm:.4f}")

# F1 Score
f1_svm = f1_score(y_val, y_pred_val_svm_tuned_decoded, average='weighted')
print(f"F1 Score: {f1_svm:.4f}")

Precision: 0.5359 Recall: 0.5285 F1 Score: 0.4458
'''




'\n# Define SVM with specified parameters\nsvm_model_tuned = SVC(kernel=\'rbf\', C=1, gamma=\'scale\', random_state=42, class_weight=\'balanced\')\n\n# Fit the model on the original training data\nsvm_model_tuned.fit(X_train, y_train_encoded)\n\n# Make predictions on the validation set\ny_pred_val_svm_tuned = svm_model_tuned.predict(X_val)\n\n# Decode the predictions back to original labels\ny_pred_val_svm_tuned_decoded = label_encoder.inverse_transform(y_pred_val_svm_tuned)\n\n# Calculate accuracy\naccuracy_val_svm_tuned = accuracy_score(y_val, y_pred_val_svm_tuned_decoded)\nprint(f"SVM Validation Accuracy (Tuned): {accuracy_val_svm_tuned}")\n\n#Accuracy Score on Validation set is : 0.5285\n\n#other metrics to understand performance\n# Precision\nprecision_svm = precision_score(y_val, y_pred_val_svm_tuned_decoded, average=\'weighted\')\nprint(f"Precision: {precision_svm:.4f}")\n\n# Recall\nrecall_svm = recall_score(y_val, y_pred_val_svm_tuned_decoded, average=\'weighted\')\nprint(f"Re

<h2 style="color:purple;">Models Comparison</h2>

| **Model Name**                          | **Validation Accuracy** | **Test Accuracy** | **Precision** | **Recall** | **F1 Score** |
|-----------------------------------------|-------------------------|-------------------|---------------|------------|--------------|
| Dummy Classifier                        | 0.57575                 | 0.5866            | N/A           | N/A        | N/A          |
| Logistic Regression Classifier          | 0.65325                 | 0.6728            | 0.6034        | 0.6532     | 0.5891       |
| Logistic Regression With Polynomial Features | 0.74775                 | 0.6756            | 0.7393        | 0.7482     | 0.7265       |
| KNN Classifier                          | 0.49725                 | 0.6728            | 0.7602        | 0.4973     | 0.5480       |
| SVM                                     | 0.5285                  | 0.6728            | 0.5359        | 0.5285     | 0.4458       |
| **Random Forest Classifier**            | **0.84825**             | **0.8462**        | **0.8514**    | **0.8482** | **0.8368**   |
| **LGBM Classifier**                     | **0.85825**             | **0.8654**        | **0.8759**    | **0.8582** | **0.8566**   |
| **XGBoost Classifier**                  | **0.86125**             | **0.8668**        | **0.8774**    | **0.8612** | **0.8600**   |
| **Stacking Classifier**                 | **0.85775**             | **0.8562**        | **0.8722**    | **0.8602** | **0.8578**   |
| **LGBM with SMOTE (Oversampling)**      | **0.86**                | **0.862**         | **0.8803**    | **0.8600** | **0.8586**   |


### Insights into Model Performances

1. **Logistic Regression Classifier**:
   - **Accuracy**: Achieved a validation accuracy of 0.65325 and test accuracy of 0.6728.
   - **Precision**: 0.6034
   - **Recall**: 0.6532
   - **F1 Score**: 0.5891
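   - **Insight**: This model provides a reasonable baseline but falls short in precision and F1 score, suggesting that it struggles to classify several of the crime categories correctly. (All metrics here are weighted averages over the classes, so there is no single "positive" class.)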

2. **Logistic Regression with Polynomial Features**:
   - **Accuracy**: Improved validation accuracy to 0.74775, but test accuracy is 0.6756.
   - **Precision**: 0.7393
   - **Recall**: 0.7482
   - **F1 Score**: 0.7265
   - **Insight**: Polynomial features increased the complexity and performance on the validation set, but the test set performance did not improve significantly, suggesting potential overfitting.

3. **KNN Classifier**:
   - **Accuracy**: Validation accuracy of 0.49725 and test accuracy of 0.6728.
   - **Precision**: 0.7602
   - **Recall**: 0.4973
   - **F1 Score**: 0.5480
   - **Insight**: The KNN classifier shows high weighted precision but low recall: when it predicts a crime category it is often correct, but it fails to recover a large share of the true instances of each category.

4. **SVM**:
   - **Accuracy**: Validation accuracy of 0.5285 and test accuracy of 0.6728.
   - **Precision**: 0.5359
   - **Recall**: 0.5285
   - **F1 Score**: 0.4458
   - **Insight**: The SVM model struggles with this dataset, reflected by its lower precision, recall, and F1 score.

5. **Random Forest Classifier**:
   - **Accuracy**: Validation accuracy of 0.84825 and test accuracy of 0.8462.
   - **Precision**: 0.8514
   - **Recall**: 0.8482
   - **F1 Score**: 0.8368
   - **Insight**: The Random Forest model performs well with high accuracy, precision, recall, and F1 score, making it a strong candidate for this classification task.

6. **LGBM Classifier**:
   - **Accuracy**: Validation accuracy of 0.85825 and test accuracy of 0.8654.
   - **Precision**: 0.8759
   - **Recall**: 0.8582
   - **F1 Score**: 0.8566
   - **Insight**: LightGBM shows excellent performance across all metrics, particularly high precision and F1 score, indicating robust model performance.

7. **XGBoost Classifier**:
   - **Accuracy**: Validation accuracy of 0.86125 and test accuracy of 0.8668.
   - **Precision**: 0.8774
   - **Recall**: 0.8612
   - **F1 Score**: 0.8600
   - **Insight**: XGBoost provides the best test accuracy, recall, and F1 score, and its precision (0.8774) is second only to LGBM with SMOTE (0.8803), making it the top-performing model for this dataset.

8. **Stacking Classifier**:
   - **Accuracy**: Validation accuracy of 0.85775 and test accuracy of 0.8562.
   - **Precision**: 0.8722
   - **Recall**: 0.8602
   - **F1 Score**: 0.8578
   - **Insight**: The stacking classifier performs similarly to XGBoost and LightGBM, indicating that an ensemble of models can achieve competitive performance.

9. **LGBM with SMOTE (Oversampling)**:
   - **Accuracy**: Validation accuracy of 0.86 and test accuracy of 0.862.
   - **Precision**: 0.8803
   - **Recall**: 0.8600
   - **F1 Score**: 0.8586
   - **Insight**: Using SMOTE with LightGBM improves the precision slightly, though overall performance remains similar to other top models. This suggests handling class imbalance can marginally benefit the model.

### Conclusion

The XGBoost Classifier emerges as the best-performing model based on test accuracy and overall balanced performance across precision, recall, and F1 score. Models like LightGBM and Random Forest also perform very well, showing the effectiveness of ensemble methods for this classification task. Using techniques like SMOTE for handling class imbalance further refines the model's performance.


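**As concluded, the XGBoost Classifier is the best performer, with the LightGBM Classifier a close second. The cell below repeats all the preprocessing steps on the full training data and fits the tuned LightGBM model to produce the final submission.**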

In [None]:
# Numerical features: learn each column's mean from the training rows only,
# then fill the gaps in both train and test with those same means.
num_imputer = SimpleImputer(strategy='mean').fit(train_df[numerical_cols])
train_df[numerical_cols] = num_imputer.transform(train_df[numerical_cols])
test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])

# Categorical features: same pattern, using the most frequent training value
# of each column as the fill value.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train_df[categorical_cols])
train_df[categorical_cols] = cat_imputer.transform(train_df[categorical_cols])
test_df[categorical_cols] = cat_imputer.transform(test_df[categorical_cols])

# Function for frequency encoding
def frequency_encoding_final(column, df, df_test=None):
    """Add a frequency-encoded copy of ``column`` to ``df`` (and ``df_test``).

    The new column is named ``column + '_freq'`` and holds, for every row, the
    share of rows in ``df`` that carry the same value in ``column``. Both
    frames are modified in place and also returned.

    Parameters
    ----------
    column : str
        Name of the column to encode; it must exist in ``df`` (and in
        ``df_test`` when one is given).
    df : pandas.DataFrame
        Frame the frequencies are computed from.
    df_test : pandas.DataFrame, optional
        Second frame encoded with the frequencies of ``df``. Values that do
        not occur in ``df`` are encoded as 0.

    Returns
    -------
    tuple
        ``(df, df_test)``; ``df_test`` is returned as passed in (possibly
        ``None``).
    """
    encoded_name = column + '_freq'

    # Occurrences of each value divided by the total number of rows in df.
    category_share = df[column].value_counts().div(len(df))
    df[encoded_name] = df[column].map(category_share)

    if df_test is None:
        return df, df_test

    # Values never seen in df have no entry in the mapping; encode them as 0.
    df_test[encoded_name] = df_test[column].map(category_share).fillna(0)
    return df, df_test

# Two groups of categorical columns, each encoded differently:
#   - high-cardinality columns are replaced by a frequency feature,
#   - the remaining categoricals are integer-coded.
high_cardinality_cols = [
    'Location',
    'Cross_Street',
    'Modus_Operandi',
    'Premise_Description',
    'Weapon_Description',
]
ordinal_encoder_cols = [
    'Area_Name',
    'Part1-2',
    'Victim_Sex',
    'Victim_Descent',
    'Status',
    'Status_Description',
]

# Frequency features: adds a '<col>_freq' column to both frames, using
# frequencies computed on the training frame.
for col in high_cardinality_cols:
    train_df, test_df = frequency_encoding_final(col, train_df, test_df)

# Integer codes learned on the training frame; categories that appear only in
# the test frame are encoded as -1 instead of raising.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
oe.fit(train_df[ordinal_encoder_cols])
train_df[ordinal_encoder_cols] = oe.transform(train_df[ordinal_encoder_cols])
test_df[ordinal_encoder_cols] = oe.transform(test_df[ordinal_encoder_cols])

# The raw high-cardinality columns are no longer needed once their
# '<col>_freq' counterparts exist.
train_df = train_df.drop(high_cardinality_cols, axis=1)
test_df = test_df.drop(high_cardinality_cols, axis=1)

# Feature matrix: every column of the training frame except the target
X_train_full = train_df.drop(columns='Crime_Category')

# Catch-all imputation for anything still missing: fill values are the most
# frequent training value per column. Both frames are rebuilt from the
# imputer's array output, keeping their column names.
imputer = SimpleImputer(strategy='most_frequent').fit(X_train_full)
X_train_full = pd.DataFrame(imputer.transform(X_train_full), columns=X_train_full.columns)
test_df = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)

# Standardise the numerical features with statistics from the training set
scaler = StandardScaler().fit(X_train_full[numerical_cols])
X_train_full[numerical_cols] = scaler.transform(X_train_full[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# Target: crime category names converted to integer class ids; the fitted
# encoder is kept so predictions can be decoded back to names later.
label_encoder = LabelEncoder()
y_train_full = label_encoder.fit_transform(train_df['Crime_Category'])

# Tuned LightGBM hyperparameters (values recorded from the randomized search
# earlier in this notebook).
# NOTE(review): LightGBM documents row subsampling (bagging) as active only
# when `subsample_freq` > 0; it is not set here, so `subsample` may have no
# effect - confirm against the LightGBM parameter reference.
best_params_lgb_final = dict(
    colsample_bytree=0.7110049608671793,
    learning_rate=0.06048738886880416,
    max_depth=9,
    min_data_in_leaf=12,
    min_gain_to_split=0.17904136753743988,
    n_estimators=337,
    num_leaves=370,
    subsample=0.8596340455795947,
)

# Fit on the full training set (no validation split is held out here)
best_lgb_model_final = lgb.LGBMClassifier(random_state=42, verbose=-1, **best_params_lgb_final)
best_lgb_model_final.fit(X_train_full, y_train_full)

# Predict integer class ids for the test set, then map them back to the
# original crime category names.
y_pred_test_final = best_lgb_model_final.predict(test_df)
y_pred_test_decoded_final = label_encoder.inverse_transform(y_pred_test_final)







In [None]:

# Submission file: one row per test prediction, with IDs numbered from 1 in
# prediction order.
submission_lgb_final = pd.DataFrame({
    'ID': range(1, len(y_pred_test_decoded_final) + 1),
    'Crime_Category': y_pred_test_decoded_final,
})
submission_lgb_final.to_csv('submission_lgb_final.csv', index=False)


