**Table of contents**<a id='toc0_'></a>    
- [Libraries and Data](#toc1_)    
- [Data Preparation](#toc2_)    
  - [Data Cleaning](#toc2_1_)    
  - [Feature Engineering](#toc2_2_)    
  - [X and y](#toc2_3_)    
  - [Encoding and scaling](#toc2_4_)    
- [Dummy Model](#toc3_)    
- [Logistic Regression](#toc4_)    
- [Support Vector Machine](#toc5_)    
- [K-Nearest Neighbours](#toc6_)    
- [Baseline Model  - Logistic Regression](#toc7_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Libraries and Data](#toc0_)

In [2]:
import pandas as pd
import os
import sys
import numpy as np

# Data cleaning
#from library import la_functions as la

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

# Modelling
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Export model
import pickle

# Path to the folder containing the pesonalized functions
folder_path = os.path.abspath(os.path.join('..', 'library'))
sys.path.insert(0, folder_path)

# Now you can import your module or functions
import la_functions as la

In [3]:
# fetch all the data from the raw_data folder
df = pd.read_csv('../raw_data/data.csv')

# <a id='toc2_'></a>[Data Preparation](#toc0_)

## <a id='toc2_1_'></a>[Data Cleaning](#toc0_)

In [4]:
# Remove victim_sex rows with missing data
col_remove_sex = ['X', '-']
df = df[~df['victim_sex'].isin(col_remove_sex)]
df = df[df['victim_sex'].notnull()]
len(df)


664923

In [5]:
# Remove victim_descent rows with missing data
df = df[df['victim_descent'].notnull()]
col_remove_descent = ['X', '-']
df = df[~df['victim_descent'].isin(col_remove_descent)]
len(df)

656473

In [6]:
# Remove victim age = 0
df=df[df['victim_age']>=0]
len(df)

656450

In [24]:
# lower case
df.crime_description = df.crime_description.str.lower()

In [7]:
# Group weapons into type and prepare for encoding
df['weapon_description'].fillna('None', inplace=True)
weapon_types = {
    'STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)': 'Physical Force',
    'UNKNOWN WEAPON/OTHER WEAPON': 'Unknown',
    'VERBAL THREAT': 'Verbal Threat',
    'HAND GUN': 'Firearm',
    'SEMI-AUTOMATIC PISTOL': 'Firearm',
    'KNIFE WITH BLADE 6INCHES OR LESS': 'Knife',
    'UNKNOWN FIREARM': 'Firearm',
    'OTHER KNIFE': 'Knife',
    'MACE/PEPPER SPRAY': 'Chemical',
    'VEHICLE': 'Vehicle',
    'ROCK/THROWN OBJECT': 'Thrown Object',
    'PIPE/METAL PIPE': 'Blunt Object',
    'BOTTLE': 'Blunt Object',
    'STICK': 'Blunt Object',
    'FOLDING KNIFE': 'Knife',
    'CLUB/BAT': 'Blunt Object',
    'KITCHEN KNIFE': 'Knife',
    'AIR PISTOL/REVOLVER/RIFLE/BB GUN': 'Firearm',
    'KNIFE WITH BLADE OVER 6 INCHES IN LENGTH': 'Knife',
    'BLUNT INSTRUMENT': 'Blunt Object',
    'HAMMER': 'Blunt Object',
    'SIMULATED GUN': 'Firearm',
    'REVOLVER': 'Firearm',
    'MACHETE': 'Knife',
    'OTHER FIREARM': 'Firearm',
    'OTHER CUTTING INSTRUMENT': 'Knife',
    'PHYSICAL PRESENCE': 'Physical Force',
    'UNKNOWN TYPE CUTTING INSTRUMENT': 'Knife',
    'SCREWDRIVER': 'Sharp Object',
    'CONCRETE BLOCK/BRICK': 'Blunt Object',
    'FIRE': 'Fire',
    'BELT FLAILING INSTRUMENT/CHAIN': 'Blunt Object',
    'SCISSORS': 'Sharp Object',
    'RIFLE': 'Firearm',
    'FIXED OBJECT': 'Blunt Object',
    'STUN GUN': 'Electric Weapon',
    'GLASS': 'Sharp Object',
    'AXE': 'Sharp Object',
    'BOARD': 'Blunt Object',
    'SHOTGUN': 'Firearm',
    'CAUSTIC CHEMICAL/POISON': 'Chemical',
    'SWITCH BLADE': 'Knife',
    'BRASS KNUCKLES': 'Blunt Object',
    'BOMB THREAT': 'Explosive',
    'TOY GUN': 'Firearm',
    'TIRE IRON': 'Blunt Object',
    'SCALDING LIQUID': 'Chemical',
    'SWORD': 'Sharp Object',
    'RAZOR BLADE': 'Sharp Object',
    'HECKLER & KOCH 93 SEMIAUTOMATIC ASSAULT RIFLE': 'Firearm',
    'DIRK/DAGGER': 'Knife',
    'EXPLOXIVE DEVICE': 'Explosive',
    'ASSAULT WEAPON/UZI/AK47/ETC': 'Firearm',
    'DEMAND NOTE': 'Threat',
    'ICE PICK': 'Sharp Object',
    'RAZOR': 'Sharp Object',
    'LIQUOR/DRUGS': 'Chemical',
    'SEMI-AUTOMATIC RIFLE': 'Firearm',
    'DOG/ANIMAL (SIC ANIMAL ON)': 'Animal',
    'ROPE/LIGATURE': 'Strangling',
    'STARTER PISTOL/REVOLVER': 'Firearm',
    'CLEAVER': 'Knife',
    'BOWIE KNIFE': 'Knife',
    'SAWED OFF RIFLE/SHOTGUN': 'Firearm',
    'AUTOMATIC WEAPON/SUB-MACHINE GUN': 'Firearm',
    'BOW AND ARROW': 'Projectile',
    'SYRINGE': 'Sharp Object',
    'STRAIGHT RAZOR': 'Sharp Object',
    'MARTIAL ARTS WEAPONS': 'Physical Force',
    'UNK TYPE SEMIAUTOMATIC ASSAULT RIFLE': 'Firearm',
    'BLACKJACK': 'Blunt Object',
    'RELIC FIREARM': 'Firearm',
    'ANTIQUE FIREARM': 'Firearm',
    'UZI SEMIAUTOMATIC ASSAULT RIFLE': 'Firearm',
    'MAC-11 SEMIAUTOMATIC ASSAULT WEAPON': 'Firearm',
    'MAC-10 SEMIAUTOMATIC ASSAULT WEAPON': 'Firearm',
    'HECKLER & KOCH 91 SEMIAUTOMATIC ASSAULT RIFLE': 'Firearm',
    'M1-1 SEMIAUTOMATIC ASSAULT RIFLE': 'Firearm',
    'M-14 SEMIAUTOMATIC ASSAULT RIFLE': 'Firearm',
    'None': 'None'
}

# Creating a new column 'weapon_type' based on the mapping
df['weapon_type'] = df['weapon_description'].map(weapon_types)

# Additional engineering could include severity
df.weapon_type.value_counts()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weapon_description'].fillna('None', inplace=True)


weapon_type
None               379385
Physical Force     153385
Firearm             32343
Unknown             27900
Verbal Threat       20521
Knife               18939
Blunt Object        12444
Chemical             3556
Vehicle              2766
Sharp Object         2138
Thrown Object        2075
Fire                  428
Electric Weapon       317
Explosive             123
Strangling             44
Animal                 43
Threat                 25
Projectile             18
Name: count, dtype: int64

In [8]:
# Drop null values in Premise code
df = df.dropna(subset=['premise_code'])

## <a id='toc2_2_'></a>[Feature Engineering](#toc0_)

In [9]:
# Split 'date_occurred' into year, month,  day, hour and drop date occurred
df['date_occurred'] = pd.to_datetime(df['date_occurred'])
df['year_occurred'] = df['date_occurred'].dt.year
df['month_occurred'] = df['date_occurred'].dt.month
df['day_occurred'] = df['date_occurred'].dt.day
df['hour_occurred'] = df['date_occurred'].dt.hour

In [10]:
# Create new column 'time_of_day' based on time and drop hour occurred
def categorize_time(hour):
    if 6 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    elif 18 <= hour < 24:
        return 'evening'
    else:
        return 'night'

df['time_of_day'] = df['hour_occurred'].apply(categorize_time)

In [11]:
# Join dataset with neighborhoods shp file
current_dir = os.getcwd()
shp_file_path = os.path.join(current_dir, '..', 'data','geo_data','cfbcc20d-2c5d-4c30-9dfa-627d46ec1a742020328-1-9ulknm.pzqsm.shp')

neighborhoods = gpd.read_file(shp_file_path)

crime_data_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
crime_data_gdf.set_crs(neighborhoods.crs, inplace=True).head(1)

joined_gdf = gpd.sjoin(crime_data_gdf, neighborhoods, how='left', op='within')

  if await self.run_code(code, result, async_=asy):


In [52]:
joined_gdf.head()

Unnamed: 0,division_number,date_reported,date_occurred,area,area_name,reporting_district,part,crime_code,crime_description,modus_operandi,...,year_occurred,month_occurred,day_occurred,hour_occurred,time_of_day,crime_gravity,geometry,index_right,OBJECTID,name
0,10304468,2020-01-08,2020-01-08 22:30:00,3,Southwest,377,2,624,battery - simple assault,0444 0913,...,2020,1,8,22,evening,1,POINT (-118.29780 34.01410),31.0,32.0,Exposition Park
1,190101086,2020-01-02,2020-01-01 03:30:00,1,Central,163,2,624,battery - simple assault,0416 1822 1414,...,2020,1,1,3,night,1,POINT (-118.25450 34.04590),23.0,24.0,Downtown
3,191501505,2020-01-01,2020-01-01 17:30:00,15,N Hollywood,1543,2,745,vandalism - misdeameanor ($399 or under),0329 1402,...,2020,1,1,17,afternoon,1,POINT (-118.40190 34.16850),95.0,96.0,Valley Village
5,200100501,2020-01-02,2020-01-01 00:30:00,1,Central,163,1,121,"rape, forcible",0413 1822 1262 1415,...,2020,1,1,0,night,3,POINT (-118.25340 34.04520),23.0,24.0,Downtown
6,200100502,2020-01-02,2020-01-02 13:15:00,1,Central,161,1,442,shoplifting - petty theft ($950 & under),1402 2004 0344 0387,...,2020,1,2,13,afternoon,1,POINT (-118.26310 34.04830),23.0,24.0,Downtown


## <a id='toc2_3_'></a>[X and y](#toc0_)

In [12]:
df.area.head()

0     3
1     1
3    15
5     1
6     1
Name: area, dtype: int64

In [52]:
# Features that lack relevance
irrelevant_cols = df[['division_number', 'date_reported', 'reporting_district', 'part', 'modus_operandi', 'status', 'status_description', 'crime_code_1',
       'crime_code_2', 'crime_code_3', 'crime_code_4','cross_street']]

# Features that are duplicates of numerical features (e.g. description vs code))
cat_duplicates = df[['area_name', 'crime_description', 'premise_description', 'weapon_description', 'location', 'latitude', 'longitude']]
num_duplicates = df[['weapon_code', 'date_occurred', 'hour_occurred']]

# Features not realistic to include
unrealistic_cols = df[['crime_code', 'premise_code', 'weapon_type']]

# Features requiring more work to integrate into API and front-end
more_work = df[['victim_descent','time_of_day']]

X = df[['area', 'victim_age', 'victim_sex', 'year_occurred',
       'month_occurred', 'day_occurred']]


- ***Is including crime code data leakage?***

In [40]:
def assign_gravity(crime_description):
    if any(word in crime_description for word in ['petty theft', 'vandalism', 'minor fraud', 'trespass','stole']):
        return 1  # Low Gravity
    elif any(word in crime_description for word in ['burglary', 'serious fraud', 'aggravated assault', 'robbery']):
        return 2  # Medium Gravity
    elif any(word in crime_description for word in ['homicide', 'rape', 'kidnapping', 'arson','dead','penetration','penis','child pornography']):
        return 3  # High Gravity
    else:
        return 1  # Default to Low Gravity if not clearly fitting other categories


df['crime_gravity'] = df.crime_description.apply(assign_gravity)
y = df['crime_gravity']

In [26]:
y.value_counts()

crime_gravity
1    464609
2    180867
3     10973
Name: count, dtype: int64

## <a id='toc2_4_'></a>[Encoding and scaling](#toc0_)

In [53]:
num_values = X.select_dtypes(include=['number'])
cat_values = X.select_dtypes(include=['object'])

In [54]:
X_cat_encoded = pd.get_dummies(X, columns=cat_values.columns)
X_cat_encoded = X_cat_encoded.replace({True: 1, False: 0})
X_cat_encoded.head()

  X_cat_encoded = X_cat_encoded.replace({True: 1, False: 0})


Unnamed: 0,area,victim_age,year_occurred,month_occurred,day_occurred,victim_sex_F,victim_sex_H,victim_sex_M
0,3,36,2020,1,8,1,0,0
1,1,25,2020,1,1,0,0,1
3,15,76,2020,1,1,1,0,0
5,1,25,2020,1,1,1,0,0
6,1,23,2020,1,2,0,0,1


In [55]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_cat_encoded)
X_scaled_df = pd.DataFrame(X_scaled, columns = X_cat_encoded.columns)


# <a id='toc3_'></a>[Dummy Model](#toc0_)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [19]:
dummy_model = DummyClassifier(strategy='uniform')
dummy_model.fit(X_train, y_train)
y_pred_dummy = dummy_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred_dummy))

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00    131290

    accuracy                           1.00    131290
   macro avg       1.00      1.00      1.00    131290
weighted avg       1.00      1.00      1.00    131290



In [None]:
X_scaled.shape, y.shape

((656449, 52), (656449,))

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [38]:
#undersampler =  RandomUnderSampler(sampling_strategy='auto', random_state=42)
oversampler =  SMOTE(sampling_strategy='auto', random_state=42)

#X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)


In [39]:
y_train.value_counts()

crime_gravity
1    371558
2    144849
3      8752
Name: count, dtype: int64

In [40]:
y_train_resampled.value_counts()

crime_gravity
2    371558
1    371558
3    371558
Name: count, dtype: int64

# <a id='toc4_'></a>[Logistic Regression](#toc0_)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

In [57]:
lr_model = LogisticRegression(max_iter=5000)

In [73]:
cv_results = cross_validate(lr_model, X_scaled, y, cv=5)

accuracy = cv_results['test_score'].mean()
accuracy

KeyboardInterrupt: 

In [58]:
lr_model.fit(X_train, y_train)
lr_model.score(X_train, y_train)

0.7075152477630584

In [47]:
y_pred_lr = lr_model.predict(X_test)

In [48]:
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.71      1.00      0.83     93051
           2       0.00      0.00      0.00     36018
           3       0.00      0.00      0.00      2221

    accuracy                           0.71    131290
   macro avg       0.24      0.33      0.28    131290
weighted avg       0.50      0.71      0.59    131290

Confusion Matrix:
[[93051     0     0]
 [36018     0     0]
 [ 2221     0     0]]


In [50]:
X_train.columns

Index(['area', 'victim_age', 'year_occurred', 'month_occurred', 'day_occurred',
       'hour_occurred', 'victim_sex_F', 'victim_sex_H', 'victim_sex_M'],
      dtype='object')

In [65]:
with open("../models/lr_model_2.pkl", "wb") as file:
    pickle.dump(lr_model, file)

### Mapping dictionaries
- To be used to map values back to simple features from OHE columns 

In [85]:
unique_victim_sex_values = df['victim_sex'].unique()
victim_sex_mapping = {}

for value in unique_victim_sex_values:
    one_hot_encoded = [0] * len(unique_victim_sex_values)
    one_hot_encoded[list(unique_victim_sex_values).index(value)] = 1
    victim_sex_mapping[value] = one_hot_encoded
victim_sex_mapping

{'F': [1, 0, 0], 'M': [0, 1, 0], 'H': [0, 0, 1]}

### Grid Search

In [28]:
# Grid Search for best parameters
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'max_iter': [1000, 5000, 10000]
}

scoring = 'accuracy'

lr_model = LogisticRegression()

grid_search = GridSearchCV(lr_model, param_grid, scoring=scoring, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

best_logreg = LogisticRegression(**best_params)
best_logreg.fit(X_train, y_train)

# Evaluate performance on test set
test_accuracy = best_logreg.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters: {'C': 10.0, 'max_iter': 100}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Accuracy: 0.9213116002742021


**Observations**
- Accuracy is high and model correctly predicts majority of instances across classes
- Precision, recall and F1 scores high for class 1 but begin to drop for class 2. Recall and f1 very low for class 3.
- The model struggles to predict the most serious crimes, which is borne out in the high number of both false pos and negs in the confusion matrix. This is most likely because of considerable dataset imbalance

# <a id='toc5_'></a>[Support Vector Machine](#toc0_)


- More robust to overfitting. Have used C=1 for this example
- Works better with imbalanced datasets
- More effective with smaller datasets
- Model training on 10,000 took 1h+, reduced n iterations to '2000' however model DID/ DID NOT converge

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
undersampler =  RandomUnderSampler(sampling_strategy='auto', random_state=42)
oversampler =  SMOTE(sampling_strategy='auto', random_state=42)

X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_resampled, y_train_resampled)

In [35]:
svm_model = SVC(kernel='linear', max_iter=3000)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)



In [36]:
if (svm_model.n_iter_ < 3000).any():
    print("Optimization converged after", svm_model.n_iter_, "iterations.")
else:
    print("Optimization did not converge within the specified maximum number of iterations.")

Optimization did not converge within the specified maximum number of iterations.


In [37]:
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

Classification Report:
              precision    recall  f1-score   support

           1       0.70      0.38      0.49     93051
           2       0.28      0.39      0.33     36018
           3       0.01      0.18      0.02      2221

    accuracy                           0.37    131290
   macro avg       0.33      0.31      0.28    131290
weighted avg       0.57      0.37      0.44    131290

Confusion Matrix:
[[34901 33933 24217]
 [14197 13891  7930]
 [  694  1134   393]]


# <a id='toc6_'></a>[K-Nearest Neighbours](#toc0_)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

In [64]:
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))

Classification Report:


              precision    recall  f1-score   support

           1       0.71      0.94      0.81     93051
           2       0.37      0.08      0.14     36018
           3       0.44      0.01      0.01      2221

    accuracy                           0.69    131290
   macro avg       0.51      0.35      0.32    131290
weighted avg       0.61      0.69      0.61    131290

Confusion Matrix:
[[87855  5186    10]
 [32959  3051     8]
 [ 2104   103    14]]


# <a id='toc7_'></a>[Baseline Model  - Logistic Regression](#toc0_)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# DATA CLEANING PIPES
def isolate_age(X):
    return X.apply(lambda age: age if 1 <= age <= 99 else None)

def dropna(X):
    return X.dropna()

# Remove null values from Age
dropna_pipe = FunctionTransformer(dropna)

# select valid age range
age_range_pipe = FunctionTransformer(isolate_age)


In [37]:
# Preprocess numerical data
norm_scaler = MinMaxScaler()
preprocessor = ColumnTransformer(
    transformers=[
        #('dropna_pipe', dropna_pipe, ['victim_age']),
        #('age_range_pipe', age_range_pipe, ['victim_age']),
        ('num', norm_scaler, ['victim_age', 'latitude', 'longitude', 'day_occurred', 'month_occurred', 'year_occurred'])
    ])

In [38]:
# Logistic Regression model
lr_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])