In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [23]:
# data preprocessing with OneHot encoding

X = pd.read_csv("train.csv")
y = X['Crime_Category']
X = X.drop('Crime_Category', axis=1)

# remove column with 80% null values
X.drop('Cross_Street', axis=1, inplace=True)

# handle missing data
X['Victim_Sex'] = X['Victim_Sex'].replace(['H', 'X'], 'Unknown')
X['Victim_Descent'] = X['Victim_Descent'].fillna('Unknown')
X['Weapon_Description'] = X['Weapon_Description'].fillna('No Weapon')
X['Weapon_Used_Code'] = X['Weapon_Used_Code'].fillna(0) # Weapon_Used_Code is in the range [1,3990], 0 is for missing code
X['Modus_Operandi'] = X['Modus_Operandi'].fillna('Unknown')

# data handling
X['Date_Reported'] = pd.to_datetime(X['Date_Reported'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
X['Date_Occurred'] = pd.to_datetime(X['Date_Occurred'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
X['Year_Reported'] = X.Date_Reported.dt.year
X['Year_Occurred'] = X.Date_Occurred.dt.year
X['Month_Reported'] = X.Date_Reported.dt.month
X['Month_Occurred'] = X.Date_Occurred.dt.month
X['Day_Reported'] = X.Date_Reported.dt.day
X['Day_Occurred'] = X.Date_Occurred.dt.day
X.drop(['Date_Reported', 'Date_Occurred'], axis=1, inplace=True)

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
categorical_columns = [col for col in categorical_columns if col != 'Modus_Operandi']

numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
    )

modus_operandi_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    CountVectorizer(preprocessor=lambda x:x[0])
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns),
    ('modus_operandi', modus_operandi_pipeline, ['Modus_Operandi'])
  ])


In [17]:
# data preprocessing with Label encoding

X = pd.read_csv("train.csv")
y = X['Crime_Category']
X = X.drop('Crime_Category', axis=1)

# remove column with 80% null values
X.drop('Cross_Street', axis=1, inplace=True)

# handle missing data
X['Victim_Sex'] = X['Victim_Sex'].replace(['H', 'X'], 'Unknown')
X['Victim_Descent'] = X['Victim_Descent'].fillna('Unknown')
X['Weapon_Description'] = X['Weapon_Description'].fillna('No Weapon')
X['Weapon_Used_Code'] = X['Weapon_Used_Code'].fillna(0) # Weapon_Used_Code is in the range [1,3990], 0 is for missing code
X['Modus_Operandi'] = X['Modus_Operandi'].fillna('Unknown')

# data handling
X['Date_Reported'] = pd.to_datetime(X['Date_Reported'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
X['Date_Occurred'] = pd.to_datetime(X['Date_Occurred'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
X['Year_Reported'] = X.Date_Reported.dt.year
X['Year_Occurred'] = X.Date_Occurred.dt.year
X['Month_Reported'] = X.Date_Reported.dt.month
X['Month_Occurred'] = X.Date_Occurred.dt.month
X['Day_Reported'] = X.Date_Reported.dt.day
X['Day_Occurred'] = X.Date_Occurred.dt.day
X.drop(['Date_Reported', 'Date_Occurred'], axis=1, inplace=True)

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

def label_encoding_column(df, column):
    lab_encoder = LabelEncoder()
    df[column] = lab_encoder.fit_transform(df[column])
    return df

for col in categorical_columns:
  if col == 'Modus_Operandi':
    continue
  X = label_encoding_column(X, col)

modus_operandi_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    CountVectorizer(preprocessor=lambda x:x[0])
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('modus_operandi', modus_operandi_pipeline, ['Modus_Operandi'])
  ])


Accuracy with OneHot encoding before hyper parameters tuning: 93.55%

Accuracy with Label encoding before hyper parameters tuning: 92.9%

In [24]:
# full pipeline
pipe = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=42)
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# encode target labels
y_encoder = LabelEncoder()
y_train_encoded = y_encoder.fit_transform(y_train.values.ravel())
y_test_encoded = y_encoder.transform(y_test.values.ravel())

# evaluation before tuning
#pipe.fit(X_train, y_train_encoded)
#y_pred = pipe.predict(X_test)
#accuracy = accuracy_score(y_test_encoded, y_pred)
#class_report = classification_report(y_test_encoded, y_pred)
#print("Decision Tree Performance before tuning:")
#print(class_report)
#print("Accuracy before tuning:", accuracy)


In [20]:
# hyper parameters visualizations
hparams = pipe.get_params()
for hp, val in hparams.items():
    if type(val) not in [int, float, str]:
        continue
    print(f"{hp}: {val}")

columntransformer__remainder: drop
columntransformer__sparse_threshold: 0.3
columntransformer__num__simpleimputer__missing_values: nan
columntransformer__num__simpleimputer__strategy: median
columntransformer__modus_operandi__simpleimputer__missing_values: nan
columntransformer__modus_operandi__simpleimputer__strategy: most_frequent
columntransformer__modus_operandi__countvectorizer__analyzer: word
columntransformer__modus_operandi__countvectorizer__decode_error: strict
columntransformer__modus_operandi__countvectorizer__encoding: utf-8
columntransformer__modus_operandi__countvectorizer__input: content
columntransformer__modus_operandi__countvectorizer__max_df: 1.0
columntransformer__modus_operandi__countvectorizer__min_df: 1
columntransformer__modus_operandi__countvectorizer__token_pattern: (?u)\b\w\w+\b
decisiontreeclassifier__ccp_alpha: 0.0
decisiontreeclassifier__criterion: gini
decisiontreeclassifier__min_impurity_decrease: 0.0
decisiontreeclassifier__min_samples_leaf: 1
decisiont

In [None]:
grid = dict(
    {
        'decisiontreeclassifier__ccp_alpha': [0.0, 0.01],
        'decisiontreeclassifier__criterion': ['gini', 'entropy'],
        'decisiontreeclassifier__min_samples_split': [2, 5],
        'decisiontreeclassifier__min_samples_leaf': [2, 4],
        'decisiontreeclassifier__splitter': ['best', 'random']
    }
)

In [None]:
# hyperparameters tuning for onehot encoding
grid = dict(
    {
        'decisiontreeclassifier__splitter': ['best', 'random']
    }
)

pipe_cv = GridSearchCV(pipe, grid, cv=3, verbose=1, n_jobs=-1)
pipe_cv.fit(X_train, y_train_encoded)
print(f"Best score: {pipe_cv.best_score_}")
for hp, val in pipe_cv.best_params_.items():
    print(f"{hp}: {val}")

# evaluation after tuning
y_pred = pipe_cv.predict(X_test)
print(classification_report(y_test_encoded, y_pred))
accuracy = accuracy_score(y_test_encoded, y_pred)
print(accuracy)

Fitting 3 folds for each of 2 candidates, totalling 6 fits




Best score: 0.938375026860626
decisiontreeclassifier__ccp_alpha: 0.0
              precision    recall  f1-score   support

           0       0.47      0.59      0.53        32
           1       0.80      0.77      0.78       374
           2       0.91      0.90      0.91       267
           3       0.31      0.26      0.28        35
           4       0.98      0.98      0.98      2303
           5       0.93      0.93      0.93       989

    accuracy                           0.94      4000
   macro avg       0.73      0.74      0.73      4000
weighted avg       0.93      0.94      0.94      4000

0.9355




In [None]:
# hyperparameters tuning for label encoding
grid = dict(
    {
        'decisiontreeclassifier__min_samples_leaf': [3, 4],
        'decisiontreeclassifier__splitter': ['best', 'random']
    }
)

pipe_cv = GridSearchCV(pipe, grid, cv=3, verbose=1, n_jobs=-1)
pipe_cv.fit(X_train, y_train_encoded)
print(f"Best score: {pipe_cv.best_score_}")
for hp, val in pipe_cv.best_params_.items():
    print(f"{hp}: {val}")

# evaluation after tuning
y_pred = pipe_cv.predict(X_test)
print(classification_report(y_test_encoded, y_pred))
accuracy = accuracy_score(y_test_encoded, y_pred)
print(accuracy)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best score: 0.9355628197890978
decisiontreeclassifier__min_samples_leaf: 4
decisiontreeclassifier__splitter: random
              precision    recall  f1-score   support

           0       0.46      0.59      0.52        32
           1       0.80      0.80      0.80       374
           2       0.91      0.87      0.89       267
           3       0.29      0.23      0.25        35
           4       0.98      0.99      0.98      2303
           5       0.92      0.93      0.92       989

    accuracy                           0.94      4000
   macro avg       0.73      0.74      0.73      4000
weighted avg       0.94      0.94      0.94      4000

0.93675


Accuracy with OneHot encoding after hyper parameters tuning: 93.55%

Accuracy with Label encoding after hyper parameters tuning: 93.675%