In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,  StandardScaler, MinMaxScaler,MultiLabelBinarizer,LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# data preprocessing
train = pd.read_csv("train.csv")
train=train.drop_duplicates()
train.drop(columns=['Cross_Street'],inplace=True)
train['Weapon_Used_Code']=train['Weapon_Used_Code'].fillna(train['Weapon_Used_Code'].max()+ 1)
mode_victim_sex_value = train['Victim_Sex'].mode().iloc[0]  # Find the most frequent value
train['Victim_Sex']=train['Victim_Sex'].fillna(mode_victim_sex_value)
mode_victim_descent_value = train['Victim_Descent'].mode().iloc[0]  # Find the most frequent value
train['Victim_Descent']=train['Victim_Descent'].fillna(mode_victim_sex_value)
mode_modus_operandi_value = train['Modus_Operandi'].mode().iloc[0]  # Find the most frequent value
train['Modus_Operandi']=train['Modus_Operandi'].fillna(mode_modus_operandi_value)
train['Modus_Operandi']=train['Modus_Operandi'].apply(lambda x:str(x).split(" ") if x else [])
train['Date_Reported'] = pd.to_datetime(train['Date_Reported'],format='%m/%d/%Y %I:%M:%S %p')
train['Date_Occurred'] = pd.to_datetime(train['Date_Occurred'],format='%m/%d/%Y %I:%M:%S %p')
train['Year_Reported'] = train['Date_Reported'].dt.year
train['Month_Reported'] = train['Date_Reported'].dt.month
train['Day_Reported'] = train['Date_Reported'].dt.day
train['Year_Occurred'] = train['Date_Occurred'].dt.year
train['Month_Occurred'] = train['Date_Occurred'].dt.month
train['Day_Occurred'] = train['Date_Occurred'].dt.day
train['Time_Occurred'] = train['Time_Occurred'].apply(lambda x: int(x // 100))
label_encoder = LabelEncoder()
train['Crime_Category'] = label_encoder.fit_transform(train['Crime_Category'])
train.drop(columns=['Year_Occurred'],inplace=True)
corr_df = train.corr(numeric_only=True)
columns_to_convert = ['Latitude', 'Longitude', 'Area_ID','Reporting_District_no', 'Part 1-2', 'Victim_Age','Premise_Code','Weapon_Used_Code']
train[columns_to_convert] = train[columns_to_convert].astype(int)
included_columns=['Location', 'Latitude', 'Longitude','Area_ID','Reporting_District_no','Part 1-2','Modus_Operandi', 'Victim_Age','Victim_Sex', 'Victim_Descent', 'Premise_Code','Weapon_Used_Code','Status','Crime_Category', 'Year_Reported',
       'Month_Reported', 'Day_Reported', 'Month_Occurred', 'Day_Occurred']
new_train=train[included_columns]
X = new_train.drop('Crime_Category', axis=1)
y = new_train['Crime_Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Custom Transformer for MultiLabelBinarizer for the feature Modus Operandi, which is multilabeled
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ml_binarizers = {}

    def fit(self, X, y=None):
        """
        Fit a MultiLabelBinarizer for each column in the input DataFrame.
        """
        for column in X.columns:
            mlb = MultiLabelBinarizer()
            mlb.fit(X[column])
            self.ml_binarizers[column] = mlb
        return self

    def transform(self, X):
        """
        Transform the input DataFrame by applying the corresponding MultiLabelBinarizer
        for each column and concatenating the binary matrices.
        """
        X_transformed = []
        for column in X.columns:
            mlb = self.ml_binarizers[column]
            transformed_data = mlb.transform(X[column])
            # Create a DataFrame with meaningful column names
            transformed_df = pd.DataFrame(
                transformed_data,
                columns=[f"{column}_{cls}" for cls in mlb.classes_]
            )
            X_transformed.append(transformed_df)
        # Concatenate all transformed columns
        return pd.concat(X_transformed, axis=1)

In [4]:
# Define ColumnTransformer with merged transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'),['Area_ID','Reporting_District_no','Part 1-2','Victim_Sex','Victim_Descent','Premise_Code','Weapon_Used_Code','Status','Victim_Sex','Location']),
        ('minmax_scaler', MinMaxScaler(),['Latitude', 'Longitude','Victim_Age'])
        , ('modus_operandi', MultiLabelBinarizerTransformer(),['Modus_Operandi'])
        ],
        remainder='passthrough'  # Pass through any remaining columns that are not specified in transformers
        )
pipeline = make_pipeline(
    preprocessor,
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(random_state=42)    
)

In [None]:
# hyperparameters tuning for onehot encoding
grid = dict(
    {
        'decisiontreeclassifier__splitter': ['best', 'random']
    }
)
pipe_cv = GridSearchCV(pipeline,grid,cv=6,verbose=1,n_jobs=-1)
if pipe_cv is not None:
    pipe_cv.fit(X_train, y_train)
y_pred = pipe_cv.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 93.95%