## Model Training

##### Importing Libraries

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Feature Transformation
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

# Modelling
from sklearn.metrics import accuracy_score, f1_score
from catboost import CatBoostClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import warnings

In [2]:
df = pd.read_csv('crime.csv', encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


##### Preparing X and Y variables

In [4]:
df = df.drop(columns=['INCIDENT_NUMBER'], axis=1)

In [5]:
df['OCCURRED_ON_DATE'] = pd.to_datetime(df['OCCURRED_ON_DATE'])
print(df['OCCURRED_ON_DATE'].dtype)

datetime64[ns]


In [6]:
# Extract date features
df['YEAR'] = df['OCCURRED_ON_DATE'].dt.year
df['MONTH'] = df['OCCURRED_ON_DATE'].dt.month
df['DAY'] = df['OCCURRED_ON_DATE'].dt.day
df['HOUR'] = df['OCCURRED_ON_DATE'].dt.hour

In [7]:
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location,DAY
0,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)",2
1,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)",21
2,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)",3
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)",3
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)",3


In [8]:
# Transform location data into latitude and longitude features
df['LATITUDE'] = df['Location'].str.extract(r'\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)')[0].astype(float)
df['LONGITUDE'] = df['Location'].str.extract(r'\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)')[1].astype(float)

In [9]:
df.drop(columns=['Location', 'Lat', 'Long'], axis=1, inplace=True)
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE
0,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371
1,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603
2,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361


In [10]:
# Split the REPORTING_AREA feature into numeric and non-numeric components
df['REPORTING_AREA'] = df['REPORTING_AREA'].str.extract(r'(\d+)')[0].astype(float)
df['REPORTING_AREA_STR'] = df['REPORTING_AREA'].astype(str)
df.drop('REPORTING_AREA', axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,Larceny,LARCENY ALL OTHERS,D14,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,Vandalism,VANDALISM,C11,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,Towed,TOWED MOTOR VEHICLE,D4,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [12]:
df.drop(columns=['OFFENSE_DESCRIPTION', 'OCCURRED_ON_DATE'], axis=1,inplace=True)
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,Larceny,D14,,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,Vandalism,C11,,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,Towed,D4,,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,Investigate Property,D4,,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,Investigate Property,B3,,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [13]:
df.isna().sum()

OFFENSE_CODE               0
OFFENSE_CODE_GROUP         0
DISTRICT                1765
SHOOTING              318054
YEAR                       0
MONTH                      0
DAY_OF_WEEK                0
HOUR                       0
UCR_PART                  90
STREET                 10871
DAY                        0
LATITUDE                   0
LONGITUDE                  0
REPORTING_AREA_STR         0
dtype: int64

In [14]:
df['REPORTING_AREA_STR'].dtype

dtype('O')

In [15]:
len(df.columns)

14

In [16]:
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,Larceny,D14,,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,Vandalism,C11,,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,Towed,D4,,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,Investigate Property,D4,,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,Investigate Property,B3,,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [17]:
# Define categorical and numeric features
categorical_columns = ['DAY_OF_WEEK', 'DISTRICT', 'UCR_PART', 'STREET','DAY_OF_WEEK','REPORTING_AREA_STR']
numerical_columns = ['LATITUDE', 'LONGITUDE', 'YEAR', 'MONTH', 'DAY', 'HOUR']

# Define preprocessing steps for categorical features
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal_encoder", OrdinalEncoder()),
        ("scaler", StandardScaler(with_mean=False))
    ]
)

# Define preprocessing steps for numeric features
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('standard_scaler', StandardScaler())
    ]
)

# Define column transformer to apply preprocessing steps to each feature type
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_transformer, numerical_columns),
    ('cat_pipeline', cat_transformer, categorical_columns)
])

# Extract target variable
target_variable = df['OFFENSE_CODE_GROUP']
# Drop target variable from the DataFrame
df = df.drop(columns=['OFFENSE_CODE_GROUP'])

# Apply preprocessing steps to remaining features
X = preprocessor.fit_transform(df)

# Your preprocessed data is now in the variable X and your target variable is in target_variable


In [18]:
label_encoder = LabelEncoder()
target_variable = label_encoder.fit_transform(target_variable)
target_variable

array([34, 63, 62, ..., 66, 66, 66])

In [19]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,target_variable,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((255258, 12), (63815, 12))

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
def evaluate_models(X_train, y_train, X_test, y_test, models, param, scoring_function):
    scoring = make_scorer(scoring_function, average='weighted')
    report = {}

    for name, model in models.items():
        gs = GridSearchCV(model, param[name], cv=3, scoring=scoring, n_jobs=-1, verbose=3, return_train_score=True, refit=True, error_score='raise', pre_dispatch='2*n_jobs')
        gs.fit(X_train, y_train)
        best_model = gs.best_estimator_
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        score = scoring_function(y_test, y_pred)
        report[name] = score

    return report
    

In [22]:
models = {
                "Random Forest": RandomForestClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(),
                "Logistic Regression": LogisticRegression(),
                "XGBClassifier": XGBClassifier(),
                "CatBoosting Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }

params={
    "Decision Tree": {
        'criterion':['gini', 'entropy']
    },
    "Random Forest":{
        'n_estimators': [8,16,32,64,128,256]
    },
    "Gradient Boosting":{
        'learning_rate':[.1,.01,.05,.001]
    },
    "Logistic Regression":{
        'C': [0.01, 0.1, 1, 10]
    },
    "XGBClassifier":{
        'learning_rate':[.1,.01,.05,.001],
    },
    "CatBoosting Classifier":{
        'depth': [6,8,10]
    },
    "AdaBoost Classifier":{
        'learning_rate':[.1,.01,0.5,.001]
    }   
}

model_report:dict=evaluate_models(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test,
    models=models,
    param=params,
    scoring_function=f1_score
    )
            
## To get best model score from dict
best_model_score = max(sorted(model_report.values()))

## To get best model name from dict
best_model_name = list(model_report.keys())[
    list(model_report.values()).index(best_model_score)
]
best_model = models[best_model_name]

print(f"Best model is {best_model} and the score is {best_model_score} ")

Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV 1/3] END n_estimators=8;, score=(train=0.952, test=0.268) total time=   6.0s
[CV 3/3] END n_estimators=8;, score=(train=0.953, test=0.265) total time=   6.2s
[CV 2/3] END n_estimators=8;, score=(train=0.953, test=0.266) total time=   6.0s
[CV 3/3] END n_estimators=16;, score=(train=0.972, test=0.280) total time=  12.2s
[CV 1/3] END n_estimators=16;, score=(train=0.971, test=0.284) total time=  12.5s
[CV 2/3] END n_estimators=16;, score=(train=0.971, test=0.281) total time=  12.4s
[CV 1/3] END n_estimators=32;, score=(train=0.973, test=0.291) total time=  24.0s
[CV 2/3] END n_estimators=32;, score=(train=0.973, test=0.289) total time=  24.7s
[CV 3/3] END n_estimators=32;, score=(train=0.974, test=0.290) total time=  25.3s
