## Import libraries

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

## Load dataset

In [2]:
# Read the train and test splits from the shared data directory.
train, test = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

In [3]:
# Report (rows, columns) of each split, one per line.
print(f'Train: {train.shape}', f'Test {test.shape}', sep='\n')

Train: (20000, 22)
Test (5000, 21)


## Preprocess

In [4]:
from sklearn.preprocessing import LabelEncoder

columns_to_encode = ['Crime_Category', 'Victim_Sex', 'Victim_Descent', 'Status']
le = LabelEncoder()

# One encoder instance is refit for every column, so after the loop `le`
# only holds the mapping of the last column in `columns_to_encode`.
encoded_data = {}
for col in columns_to_encode:
    encoded_data[col] = le.fit_transform(train[col])

# Encoded copy of train: the original categorical columns are dropped and
# their integer codes appended. `train` itself is left untouched.
# NOTE(review): `e_train` is not referenced by any later cell visible here.
e_train = train.drop(columns=columns_to_encode).assign(**encoded_data)

In [5]:
# Normalise None / empty-string cells to NaN, then fill every NaN with 0
# (applies to all columns, numeric and categorical alike).
missing_markers = {None: np.nan, "": np.nan}
train, test = (
    frame.replace(missing_markers).fillna(value=0)
    for frame in (train, test)
)

## Clean and engineer features

In [6]:
# Missing values were filled with 0 above; in these two categorical columns
# relabel that 0 placeholder as an explicit "Unknown" category.
for frame in (train, test):
    for col in ('Victim_Sex', 'Victim_Descent'):
        frame[col] = frame[col].apply(lambda x: "Unknown" if x == 0 else x)

In [7]:
# Columns removed from both splits before feature engineering.
unused_columns = ['Location', 'Area_Name', 'Premise_Description', 'Weapon_Description', 'Status_Description']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [8]:
from sklearn.pipeline import Pipeline, FunctionTransformer

# Timestamp layout used by both date columns (12-hour clock with AM/PM).
date_format = "%m/%d/%Y %I:%M:%S %p"

def convert_date(x):
  """Parse the 'Date_Reported' and 'Date_Occurred' columns into datetimes.

  The frame is modified in place and the same object is returned, so the
  function can be used as a FunctionTransformer step.

  Parameters
  ----------
  x : pandas.DataFrame
      Must contain both date columns as strings matching ``date_format``.

  Returns
  -------
  pandas.DataFrame
      The same ``x`` with both columns converted via ``pd.to_datetime``.
  """
  for column in ('Date_Reported', 'Date_Occurred'):
    x[column] = pd.to_datetime(x[column], format=date_format)
  return x

def create_report_delay(x):
  x['Report_delay'] = (x['Date_Reported'] - x['Date_Occurred']).dt.days
  return x

# Both steps are plain column functions wrapped as transformers; the
# Pipeline only fixes the order (parse dates first, then compute the delay).
pipeline = Pipeline([
  ('convert_date', FunctionTransformer(convert_date, validate=False)),
  ('create_report_delay', FunctionTransformer(create_report_delay, validate=False))
])

# Fit on the training split only. The test split is transformed with the
# already-fitted pipeline and never used for fitting.
train = pipeline.fit_transform(train)
test = pipeline.transform(test)

In [9]:
# Output-column prefix -> pandas ``.dt`` accessor attribute.
# Note that 'Week' maps to ``weekday`` (day of the week, Monday=0), not to a
# week number.
date_features = {
    'Day': 'day',
    'Month': 'month',
    'Year': 'year',
    'DOY': 'day_of_year',
    'Week': 'weekday',
}

def extract_date_features(df, date_columns):
    """Expand each datetime column into calendar-part columns, in place.

    For every column name in ``date_columns`` and every entry of
    ``date_features``, a column ``"<prefix>_<suffix>"`` is written to ``df``,
    where ``<suffix>`` is the second ``_``-separated token of the source
    column name (e.g. 'Date_Occurred' -> 'Day_Occurred').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the datetime columns; modified in place.
    date_columns : iterable of str
        Names of datetime columns; each must contain an underscore.

    Returns
    -------
    None
    """
    for date_col in date_columns:
        suffix = date_col.split('_')[1]
        for feature, attr in date_features.items():
            df[f"{feature}_{suffix}"] = getattr(df[date_col].dt, attr)

# Add the calendar-part columns for both date columns to each split.
for frame in (train, test):
    extract_date_features(frame, ['Date_Occurred', 'Date_Reported'])


In [10]:
# The raw datetime columns are dropped once their parts have been extracted.
for frame in (train, test):
    frame.drop(columns=['Date_Reported', 'Date_Occurred'], inplace=True)

In [11]:
def hour(time):
  """Return the hour part of an integer HHMM time as a 2-character string.

  ``time`` is zero-padded to four digits and the first two characters are
  returned, e.g. ``930 -> "09"`` and ``5 -> "00"``.
  """
  return f"{time:04d}"[:2]

def minute(time):
  """Return the minute part of an integer HHMM time as a string.

  ``time`` is zero-padded to four digits and everything after the first
  two characters is returned, e.g. ``930 -> "30"`` and ``5 -> "05"``.
  """
  return f"{time:04d}"[2:]

In [12]:
def process_time_column(df, time_col):
    """Split an HHMM-encoded time column into hour and minute columns.

    ``df[time_col]`` is cast to int, then 'Hour_Occurred' and
    'Minute_Occurred' are written to ``df`` in place using vectorised integer
    arithmetic (``// 100`` and ``% 100``) rather than formatting each row as
    a string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    time_col : str
        Name of the column holding times as HHMM numbers (e.g. 930 for
        09:30). Values are assumed to lie in 0..2359.

    Returns
    -------
    None
    """
    df[time_col] = df[time_col].astype(int)
    df['Hour_Occurred'] = df[time_col] // 100
    df['Minute_Occurred'] = df[time_col] % 100

# Derive the hour/minute columns on both splits.
for frame in (train, test):
    process_time_column(frame, 'Time_Occurred')

In [13]:
# The raw HHMM column is dropped now that hour and minute exist separately.
for frame in (train, test):
    frame.drop(columns=['Time_Occurred'], inplace=True)

In [14]:
# Negative ages are floored at 0; non-negative values pass through unchanged.
for frame in (train, test):
    frame['Victim_Age'] = frame['Victim_Age'].clip(lower=0)

In [15]:
def convert_cs_col(df):
    """Turn 'Cross_Street' into a 0/1 integer flag, in place.

    Any value that is not equal to 0 becomes 1; values equal to 0 stay 0.
    Returns None.
    """
    has_cross_street = df['Cross_Street'] != 0
    df['Cross_Street'] = has_cross_street.astype(int)

# Binarise the cross-street column on both splits.
for frame in (train, test):
    convert_cs_col(frame)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numeric features: rescaled to [0, 1] using the min/max seen at fit time.
num_cols = ['Cross_Street', 'Latitude', 'Longitude', 'Area_ID', 'Reporting_District_no','Victim_Age', 'Premise_Code',
            'Weapon_Used_Code','Report_delay','Day_Occurred','Month_Occurred', 'Year_Occurred',
            'DOY_Occurred', 'Week_Occurred','Day_Reported', 'Month_Reported', 'Year_Reported',
            'DOY_Reported','Week_Reported', 'Hour_Occurred', 'Minute_Occurred']

# Categorical features: one-hot encoded.
cat_cols = ['Part 1-2', 'Victim_Sex', 'Victim_Descent', 'Status']

col_trans = ColumnTransformer(
    [
        ('scalar', MinMaxScaler(), num_cols),
        # 'ignore' encodes a category unseen at fit time as an all-zero row
        # instead of raising when the fitted encoder is applied to new data.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    # The result is wrapped in a pd.DataFrame later, so force a dense array
    # rather than leaving sparse/dense to the density heuristic.
    sparse_threshold=0,
)

In [17]:
# Fit the scaler and encoder on the training split and transform it.
train_transformed = col_trans.fit_transform(train)
col_names_train = col_trans.get_feature_names_out()

# Strip the "<transformer>__" prefix that ColumnTransformer adds to names.
ccn_train = [
    name.split("__")[1] if "__" in name else name
    for name in col_names_train
]

# Carry train's index over explicitly: concat(axis=1) aligns on index, so a
# fresh RangeIndex would misalign rows if train's index were not 0..n-1.
train_transformed = pd.DataFrame(train_transformed, columns=ccn_train, index=train.index)
train = pd.concat([train.drop(columns=num_cols+cat_cols), train_transformed], axis=1)

In [18]:
# Apply the transformer fitted on train. Re-fitting on test would scale it
# with test-set min/max and build a test-specific one-hot vocabulary, making
# its features inconsistent with what the model is trained on.
# Categories that only occur in test are encoded as all zeros instead of
# raising; handle_unknown is read when transforming, so setting it on the
# fitted encoder takes effect here.
col_trans.named_transformers_['ohe'].handle_unknown = 'ignore'
test_transformed = col_trans.transform(test)
col_names_test = col_trans.get_feature_names_out()

# Strip the "<transformer>__" prefix that ColumnTransformer adds to names.
ccn_test = [
    name.split("__")[1] if "__" in name else name
    for name in col_names_test
]

# Carry test's index over explicitly so concat(axis=1) lines rows up.
test_transformed = pd.DataFrame(test_transformed, columns=ccn_test, index=test.index)
test = pd.concat([test.drop(columns=num_cols+cat_cols), test_transformed], axis=1)

In [19]:
# Cast to str, then split the whitespace-separated codes into a list per row.
# A filled-in 0 becomes the single token "0".
for frame in (train, test):
    frame['Modus_Operandi'] = frame['Modus_Operandi'].astype(str).str.split()

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# Learn the code vocabulary from train only; one 0/1 column per code.
modus_encoded = mlb.fit_transform(train['Modus_Operandi'])
modus_encoded_train = pd.DataFrame(modus_encoded, columns=mlb.classes_, index=train.index)
train = pd.concat([train, modus_encoded_train], axis=1)

# Encode test with the vocabulary fitted on train so both splits get the
# same indicator columns. Codes that occur only in test are ignored
# (scikit-learn emits a warning) rather than creating test-only features.
me = mlb.transform(test['Modus_Operandi'])
modus_encoded_test = pd.DataFrame(me, columns=mlb.classes_, index=test.index)
test = pd.concat([test, modus_encoded_test], axis=1)

In [21]:
# The list-valued source column is dropped once its indicator columns exist.
for frame in (train, test):
    frame.drop(columns=['Modus_Operandi'], inplace=True)

In [25]:
# Union of both column sets in a deterministic order (train's columns first,
# then any test-only columns). A set union would make the column order
# depend on string hashing and therefore vary between kernel sessions.
all_columns = list(dict.fromkeys([*train.columns, *test.columns]))

# Give both splits the same columns; columns missing from a split are added
# and filled with 0 (this includes 'Crime_Category' on test).
train = train.reindex(columns=all_columns, fill_value=0)
test = test.reindex(columns=all_columns, fill_value=0)

In [26]:
# Order the test columns exactly like train's.
cols = train.columns
f_test = test.loc[:, cols]

In [27]:
# Remove the placeholder target column from the test features. Rebinding
# instead of inplace=True avoids mutating a frame derived by selection from
# `test`, which can trigger pandas' SettingWithCopyWarning.
f_test = f_test.drop(columns=['Crime_Category'])

## Model building

In [28]:
# Separate the target from the features.
target_column = 'Crime_Category'
y_train = train[target_column]
x_train = train.drop(columns=[target_column])

In [None]:
from sklearn.linear_model import LogisticRegression

# The recorded run stopped at the iteration limit (ConvergenceWarning) with
# max_iter=1000, so the cap is raised.
# NOTE(review): if the warning persists, raise it further or try another solver.
lr = LogisticRegression(max_iter=5000, random_state=0)
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Make submission

In [34]:
# Predict a label for every row of the aligned test features.
y_pred = lr.predict(X=f_test)

In [35]:
# IDs run 1..n, with n taken from the number of predictions, so the frame
# stays consistent if the test set is not exactly 5000 rows.
submission = pd.DataFrame({
    "ID": np.arange(1, len(y_pred) + 1),
    "Crime_Category": y_pred,
})

submission.to_csv('../data/submission.csv', index=False)