# Importing Libraries

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import regex as re
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder

from sklearn.dummy import DummyRegressor,DummyClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.model_selection import train_test_split,RandomizedSearchCV

from xgboost import XGBClassifier

import joblib

In [30]:
# Load the competition's train and test splits, then preview the training rows.
train_data = pd.read_csv(
    "/kaggle/input/crime-cast-forecasting-crime-categories/train.csv"
)
test_data = pd.read_csv(
    "/kaggle/input/crime-cast-forecasting-crime-categories/test.csv"
)
train_data.head()

Unnamed: 0,Location,Cross_Street,Latitude,Longitude,Date_Reported,Date_Occurred,Time_Occurred,Area_ID,Area_Name,Reporting_District_no,...,Victim_Age,Victim_Sex,Victim_Descent,Premise_Code,Premise_Description,Weapon_Used_Code,Weapon_Description,Status,Status_Description,Crime_Category
0,4500 CARPENTER AV,,34.1522,-118.391,03/09/2020 12:00:00 AM,03/06/2020 12:00:00 AM,1800.0,15.0,N Hollywood,1563.0,...,75.0,M,W,101.0,STREET,,,IC,Invest Cont,Property Crimes
1,45TH ST,ALAMEDA ST,34.0028,-118.2391,02/27/2020 12:00:00 AM,02/27/2020 12:00:00 AM,1345.0,13.0,Newton,1367.0,...,41.0,M,H,216.0,SWAP MEET,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,Property Crimes
2,600 E MARTIN LUTHER KING JR BL,,34.0111,-118.2653,08/21/2020 12:00:00 AM,08/21/2020 12:00:00 AM,605.0,13.0,Newton,1343.0,...,67.0,M,B,501.0,SINGLE FAMILY DWELLING,,,IC,Invest Cont,Property Crimes
3,14900 ORO GRANDE ST,,34.2953,-118.459,11/08/2020 12:00:00 AM,11/06/2020 12:00:00 AM,1800.0,19.0,Mission,1924.0,...,61.0,M,H,101.0,STREET,,,IC,Invest Cont,Property Crimes
4,7100 S VERMONT AV,,33.9787,-118.2918,02/25/2020 12:00:00 AM,02/25/2020 12:00:00 AM,1130.0,12.0,77th Street,1245.0,...,0.0,X,X,401.0,MINI-MART,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,Property Crimes


In [None]:
# Summarise column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# (rows, columns) of the training frame.
train_data.shape

# Visualization of Data

In [None]:
# Distinct target labels present in the training data.
train_data['Crime_Category'].unique()

In [None]:
# `df` is the working handle used by the exploratory cells below;
# `crime_counts` holds the number of records per target category.
df = pd.DataFrame(data=train_data)
category_column = df["Crime_Category"]
crime_counts = category_column.value_counts()
print(crime_counts)

In [None]:
# Bar chart of record counts per crime category, ordered as in crime_counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Crime_Category', order=crime_counts.index, ax=ax)
ax.set_title('Distribution of Crime Categories')
ax.set_xlabel('Crime Category')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=15)
plt.show()

#### The Crime Category distribution is **right-skewed** (imbalanced), with Property Crimes occurring most frequently

In [None]:
# Histograms (30 bins each) of selected numeric columns on a 3x2 grid.
numerical_cols = ['Latitude', 'Longitude', 'Time_Occurred', 'Victim_Age', 'Premise_Code']
df.hist(column=numerical_cols, bins=30, layout=(3, 2), figsize=(15, 10))
plt.show()

In [None]:
# Count of records per recorded victim sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Victim_Sex', ax=ax)
ax.set_title('Gender Distribution of Victims')
ax.set_xlabel('Victim Sex')
ax.set_ylabel('Count')
plt.show()


#### Male (M) victims appear in the most crime records, while victims coded H appear in very few

In [None]:
# Record counts per crime category, split by victim sex.
fig, ax = plt.subplots(figsize=(14, 8))
sns.countplot(data=df, x='Crime_Category', hue='Victim_Sex', ax=ax)
ax.set_title('Crime Category by Gender')
ax.set_xlabel('Crime Category')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Victim Sex')
plt.show()


In [None]:
# Horizontal bar chart of record counts per area, ordered by frequency.
area_order = df['Area_Name'].value_counts().index
fig, ax = plt.subplots(figsize=(14, 8))
sns.countplot(data=df, y='Area_Name', order=area_order, ax=ax)
ax.set_title('Number of Crimes by Area Name')
ax.set_xlabel('Count')
ax.set_ylabel('Area Name')
plt.show()


In [None]:
# Histogram (30 bins) of victim age with a KDE overlay.
age_ax = sns.histplot(train_data['Victim_Age'], bins=30, kde=True, color='green')
age_ax.set_title('Distribution of Victim Age')
age_ax.set_xlabel('Victim Age')
age_ax.set_ylabel('Count')


#### Victim Age is roughly normally distributed, with most victims in their middle age

# Data Preprocessing

In [None]:
# Re-check dtypes and non-null counts before preprocessing.
train_data.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Total number of missing values across the whole frame.
df.isna().sum().sum()

In [None]:
# Split column names by dtype: float64 columns are treated as numerical,
# object columns as categorical.
numerical = df.select_dtypes(include='float64').columns
categorical = df.select_dtypes(include='object').columns
print("Numerical Columns: ",numerical,"\n Categrocal Columns: ",categorical)

In [None]:
# Hard-coded names of the columns treated as containing nulls
# (presumably read off the null-count output above; not recomputed here).
catNull = [
    'Cross_Street',
    'Modus_Operandi',
    'Victim_Descent',
    'Victim_Sex',
    'Weapon_Description',
]
numNull = ['Weapon_Used_Code']

In [None]:
# Print the number of distinct values for each categorical column listed below.
# Series.unique() keeps NaN as a value, so a missing marker counts as one entry.
print("Numbe of unqiue values in each null categorical columns:")
for column_name in ('Cross_Street', 'Modus_Operandi', 'Victim_Descent',
                    'Victim_Sex', 'Weapon_Description'):
    print(column_name, len(df[column_name].unique()))

In [None]:
# Preprocessing pipelines.
# Numeric columns: fill gaps with the column median, then standardise.
# ('medain' was a typo; SimpleImputer rejects unknown strategies when fitted.)
numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

# Categorical columns: KNNImputer only works on numeric input, so it cannot be
# applied to raw string columns. Fill gaps with an explicit 'missing' label
# instead (same approach as the later Logistic Regression pipeline), then
# one-hot encode, ignoring categories not seen during fitting.
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route numerical and categorical column groups to their own pipelines.
column_pipelines = [
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

### BaseLine Model

In [None]:
# Work on an independent copy of the training data for the baseline.
# pd.DataFrame(train_data) does not copy and, depending on the pandas version,
# can share storage with train_data, so the in-place encodings applied to `df`
# in the following cell could leak into the frame the later models rely on.
df = train_data.copy()



In [None]:
# Encode categorical features (simplified): replace each listed column with
# its integer category codes.
for column_name in ('Victim_Sex', 'Victim_Descent', 'Status'):
    df[column_name] = df[column_name].astype('category').cat.codes



In [None]:
# Target is Crime_Category; every other column is a feature.
y = df['Crime_Category']
X = df.drop(columns='Crime_Category')

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline that always predicts the most frequent class.
dummy_clf = DummyClassifier(strategy="most_frequent")

In [None]:
# # Train the classifier
# dummy_clf.fit(X_train, y_train)

# # Make predictions
# y_pred = dummy_clf.predict(X_test)
# y_pred

In [None]:
# test_data.info()

In [None]:
# X_train = train_data.drop(columns=['Crime_Category'])
# y_train = train_data['Crime_Category']

# # Define features and target variable for testing
# X_test = test_data

In [None]:
# dummy_clf.fit(X_train, y_train)

In [None]:
# # Make predictions
# y_pred = dummy_clf.predict(X_test)

# y_pred.shape

In [None]:
# submission = pd.DataFrame({"ID": np.arange(1,5001), 
#                            "Crime_Category": y_pred,
#                           }
#                          ) 

# submission.to_csv('submission.csv',index=False)

### Logistic Regression

In [None]:
# Separate the feature columns from the target column.
y = train_data['Crime_Category']
X = train_data.drop(columns='Crime_Category')

In [None]:
# Dtypes and non-null counts of the feature frame.
X.info()

In [None]:
# Missing values per feature column.
X.isnull().sum()

In [None]:
# Scratch copy of the features for the encoding/imputation experiment below,
# plus a single LabelEncoder instance that the following cells reuse.
X_copy=X.copy()
label = LabelEncoder()

In [None]:
# Hold out 25% of the rows as a validation set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Split feature names by dtype: float64 -> numerical, object -> categorical.
numerical = X.select_dtypes(include='float64').columns
categorical = X.select_dtypes(include='object').columns

In [None]:
# Show which columns ended up in each group.
print("Numerical Columns: ",numerical,"\n Categrocal Columns: ",categorical)

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Integer-encode the two street columns. astype(str) turns missing values into
# the literal string 'nan', so they are encoded as an ordinary class.
# NOTE: the same `label` encoder is re-fitted by the second call, so after this
# cell it only holds the classes of 'Location', not of 'Cross_Street'.
X_copy['Cross_Street_Encoded'] = label.fit_transform(X_copy['Cross_Street'].astype(str))
X_copy['Location_Encoded'] = label.fit_transform(X_copy['Location'].astype(str))

In [None]:

# KNN-impute any gaps in the encoded street columns and the coordinates, then
# decode the street codes back to their original text.
knn_columns = ['Location_Encoded', 'Cross_Street_Encoded', 'Latitude', 'Longitude']
imputed_df = X_copy[knn_columns]

imputer = KNNImputer(n_neighbors=3)
imputed_data = imputer.fit_transform(imputed_df)
# Keep X_copy's index so later column assignments align row-for-row.
imputed_df = pd.DataFrame(imputed_data, columns=knn_columns, index=X_copy.index)

# The shared `label` encoder was last fitted on Location, so using it to decode
# Cross_Street codes would map them to Location strings. Fit one encoder per
# column instead; LabelEncoder sorts its classes, so fitting on the same data
# reproduces exactly the codes stored in the *_Encoded columns.
cross_street_encoder = LabelEncoder().fit(X_copy['Cross_Street'].astype(str))
location_encoder = LabelEncoder().fit(X_copy['Location'].astype(str))

imputed_df['Cross_Street'] = cross_street_encoder.inverse_transform(
    imputed_df['Cross_Street_Encoded'].round().astype(int)
)
imputed_df['Location'] = location_encoder.inverse_transform(
    imputed_df['Location_Encoded'].round().astype(int)
)
imputed_df.drop(columns=['Location_Encoded', 'Cross_Street_Encoded'],inplace=True)

In [None]:
# Check dtypes and non-null counts of the imputed frame.
imputed_df.info()

In [None]:
# Overwrite Cross_Street in X with the decoded column from imputed_df.
# NOTE(review): X_train/X_test were split from X in an earlier cell, so this
# change presumably only affects the later full-data fit on X — confirm intent.
X['Cross_Street'] = imputed_df['Cross_Street'].copy()

In [None]:
# Preprocessing pipelines for the Logistic Regression model.

# Numeric columns: mean-impute, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the label 'missing', then one-hot
# encode, ignoring categories not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Apply each pipeline to its own group of columns.
column_pipelines = [
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Full model: preprocessing followed by logistic regression
# (iteration cap raised to 1000).
logistic_classifier = LogisticRegression(max_iter=1000)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logistic_classifier),
])

In [None]:
# Train the pipeline on the training split.
model.fit(X_train,y_train)

In [None]:
# Model evaluation: accuracy on the training split and on the held-out split.
train = model.score(X_train, y_train)
print(f"Training Accuracy: {train:.4f}")

test = model.score(X_test, y_test)
print(f"Testing Accuracy: {test:.4f}")

In [None]:
# Refit the same pipeline on all labelled rows (X, y) before predicting.
model.fit(X,y)

In [None]:
# Predict crime categories for the unlabelled test set.
predictions = model.predict(test_data)  # 0.830 — presumably the score obtained for this submission; TODO confirm

In [None]:
# submission = pd.DataFrame({
#     'ID': range(1, len(predictions) + 1),  # Using index as ID
#     'Crime_Category': predictions
# })

# submission.to_csv('submission.csv',index=False)

### KNN 

# XGBoost

In [32]:
# Parse the date columns and derive calendar/hour features, applying the same
# steps to both train.csv and test.csv.

# Month/Day/Year Hour:Minute:Second AM/PM
date_format = '%m/%d/%Y %I:%M:%S %p'

for frame in (train_data, test_data):
    # convert date columns to datetime with the specified format
    for date_column in ('Date_Reported', 'Date_Occurred'):
        frame[date_column] = pd.to_datetime(frame[date_column], format=date_format)

    # extract year, month and day of the report date
    reported = frame['Date_Reported'].dt
    frame['Year_Reported'] = reported.year
    frame['Month_Reported'] = reported.month
    frame['Day_Reported'] = reported.day

    # Hour_Occurred is Time_Occurred floor-divided by 100
    frame['Hour_Occurred'] = frame['Time_Occurred'].apply(lambda x: int(x // 100))

In [33]:
# Fill missing text values with the placeholder 'Unknown' in both frames.
text_columns_with_gaps = [
    'Cross_Street',
    'Modus_Operandi',
    'Victim_Sex',
    'Victim_Descent',
    'Premise_Description',
    'Weapon_Description',
]
for frame in (train_data, test_data):
    for column_name in text_columns_with_gaps:
        frame[column_name] = frame[column_name].fillna('Unknown')

# Fill missing weapon codes in both frames with the training-set median.
weapon_code_median = train_data['Weapon_Used_Code'].median()
train_data['Weapon_Used_Code'] = train_data['Weapon_Used_Code'].fillna(weapon_code_median)
test_data['Weapon_Used_Code'] = test_data['Weapon_Used_Code'].fillna(weapon_code_median)

In [34]:
# Verify that no NaN values remain in either frame after the fills above.
print(f"Count of NaN values in 'train_csv': {train_data.isna().sum().sum()} values")
print(f"Count of NaN values in 'test_csv': {test_data.isna().sum().sum()} values")


Count of NaN values in 'train_csv': 0 values
Count of NaN values in 'test_csv': 0 values


In [35]:
# Encode categorical features as integers, fitting each encoder on the
# training column and reusing it for the test column.

label_encoders = {}
categorical = ['Location', 'Cross_Street', 'Area_Name', 'Modus_Operandi',
               'Victim_Sex', 'Victim_Descent', 'Premise_Description',
               'Weapon_Description', 'Status', 'Status_Description']


for col in categorical:
    label = LabelEncoder()
    train_data[col] = label.fit_transform(train_data[col])

    # Handle labels unseen during fitting: route them to a dedicated
    # '<unknown>' class. A vectorised isin() lookup replaces the per-row
    # `s not in label.classes_` scan, which searched the whole class array
    # once for every test row.
    is_known = test_data[col].isin(label.classes_)
    test_data[col] = test_data[col].where(is_known, '<unknown>')
    # Append the extra class after the fitted ones so existing codes keep
    # their values and '<unknown>' takes the next free integer.
    label.classes_ = np.append(label.classes_, '<unknown>')
    test_data[col] = label.transform(test_data[col])

    # store the label encoder
    label_encoders[col] = label

In [36]:
# Integer-encode the target column 'Crime_Category'.
label_crime_category = LabelEncoder()
encoded_target = label_crime_category.fit_transform(train_data['Crime_Category'])
train_data['Crime_Category'] = encoded_target

# Persist the fitted encoder so predictions can be decoded later.
joblib.dump(label_crime_category, 'label_crime_category.pkl')

['label_crime_category.pkl']

In [37]:
# Standardise the numerical features: statistics are learned from the training
# frame only and then applied to both frames.
numerical = [
    'Latitude', 'Longitude', 'Time_Occurred', 'Area_ID',
    'Reporting_District_no', 'Part 1-2', 'Victim_Age',
    'Premise_Code', 'Weapon_Used_Code',
]

scaler = StandardScaler().fit(train_data[numerical])
train_data[numerical] = scaler.transform(train_data[numerical])
test_data[numerical] = scaler.transform(test_data[numerical])

In [38]:
# Distinct encoded target values now stored in the training frame.
train_data['Crime_Category'].unique()

array([4, 5, 3, 1, 2, 0])

In [39]:
# The raw datetime columns are dropped from the feature matrices.
date_columns = ['Date_Reported', 'Date_Occurred']

# Features and target for the training data.
y_train = train_data['Crime_Category']
X_train = train_data.drop(columns=['Crime_Category'] + date_columns)

# Test features (the test frame is not given a target column here).
X_test = test_data.drop(columns=date_columns)

# Split the training data further into training (80%) and validation (20%) sets.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

print(X_train_split.shape, X_val_split.shape, y_train_split.shape, y_val_split.shape)

(16000, 23) (4000, 23) (16000,) (4000,)


In [40]:
# Helper for fitting a model and scoring it on a validation set

def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training data and score it on the validation data.

    The model is fitted in place via ``model.fit(X_train, y_train)``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        Tuple ``(accuracy, report, conf_matrix)`` holding the results of
        ``accuracy_score``, ``classification_report`` and ``confusion_matrix``
        computed from ``y_val`` and the validation predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)

    return (
        accuracy_score(y_val, predictions),
        classification_report(y_val, predictions),
        confusion_matrix(y_val, predictions),
    )

In [42]:
# Hyper-parameter candidates for the randomized search.
xgb_refined_params = dict(
    n_estimators=[90, 100, 110],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[6, 7, 8],
    min_child_weight=[4, 5, 6],
    subsample=[0.9, 1.0],
    colsample_bytree=[0.5, 0.6, 0.7],
)

# Randomized search: 100 sampled candidates, 3-fold CV, all cores, fixed seeds.
grid_search_xgb_refined = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=xgb_refined_params,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)
grid_search_xgb_refined.fit(X_train_split, y_train_split)
best_xgb_refined_model = grid_search_xgb_refined.best_estimator_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 3/3] END colsample_bytree=0.7, learning_rate=0.1, max_depth=8, min_child_weight=5, n_estimators=90, subsample=0.9;, score=0.919 total time=   3.7s
[CV 1/3] END colsample_bytree=0.7, learning_rate=0.1, max_depth=8, min_child_weight=6, n_estimators=110, subsample=1.0;, score=0.912 total time=   4.2s
[CV 3/3] END colsample_bytree=0.7, learning_rate=0.1, max_depth=8, min_child_weight=6, n_estimators=110, subsample=1.0;, score=0.919 total time=   4.3s
[CV 1/3] END colsample_bytree=0.7, learning_rate=0.15, max_depth=7, min_child_weight=6, n_estimators=100, subsample=1.0;, score=0.915 total time=   3.5s
[CV 2/3] END colsample_bytree=0.7, learning_rate=0.15, max_depth=7, min_child_weight=6, n_estimators=100, subsample=1.0;, score=0.925 total time=   3.5s
[CV 3/3] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=6, n_estimators=90, subsample=1.0;, score=0.918 total time=   3.8s
[CV 1/3] END colsample_b

In [44]:
# Display name -> estimator for every model to be evaluated below.
models = {
    'XGBoost Refined': best_xgb_refined_model,
}

In [45]:
# Evaluate all models: each one is refitted on the training split by
# evaluate_model and scored on the validation split.
for name, model in models.items():
    accuracy, report, conf_matrix = evaluate_model(model, X_train_split, y_train_split, X_val_split, y_val_split)
    
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"{name} Classification Report:\n{report}")
    print(f"{name} Confusion Matrix:\n{conf_matrix}\n")

XGBoost Refined Accuracy: 0.9200
XGBoost Refined Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.53      0.58        32
           1       0.79      0.76      0.78       374
           2       0.93      0.86      0.89       267
           3       0.50      0.09      0.15        35
           4       0.97      0.96      0.96      2303
           5       0.87      0.94      0.91       989

    accuracy                           0.92      4000
   macro avg       0.78      0.69      0.71      4000
weighted avg       0.92      0.92      0.92      4000

XGBoost Refined Confusion Matrix:
[[  17    3    0    0    0   12]
 [   3  286   12    0   19   54]
 [   0   22  230    0   15    0]
 [   0   19    4    3    9    0]
 [   0   15    1    3 2213   71]
 [   7   16    0    0   35  931]]



In [46]:
# Choose the best model and retrain it on the full training data.
# best_model is the same estimator object as best_xgb_refined_model, so this
# fit replaces the state left by the earlier fits on the training split.
best_model = best_xgb_refined_model 
best_model.fit(X_train, y_train)
# Accuracy on the data the model was just fitted on (not a held-out score).
y_train_pred = best_model.predict(X_train)
print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred)}")

Training Accuracy: 0.9766


In [47]:
# predict on test data
y_test_pred = best_model.predict(X_test)

# Decode the integer predictions back to category names with the encoder that
# was persisted to disk. Previously the loaded encoder was assigned but never
# used, and decoding silently relied on the in-memory `label_crime_category`.
labelE_crime_category = joblib.load('label_crime_category.pkl')
y_test_pred_decoded = labelE_crime_category.inverse_transform(y_test_pred)
y_test_pred_decoded

array(['Violent Crimes', 'Property Crimes', 'Crimes against Public Order',
       ..., 'Violent Crimes', 'Violent Crimes', 'Property Crimes'],
      dtype=object)

In [48]:
# Build the submission file: 1-based row IDs paired with the decoded predictions.
test_data.reset_index(drop=True, inplace=True)
submission_ids = test_data.index + 1
submission = pd.DataFrame(
    {'ID': submission_ids, 'Crime_Category': y_test_pred_decoded}
)
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,ID,Crime_Category
0,1,Violent Crimes
1,2,Property Crimes
2,3,Crimes against Public Order
3,4,Property Crimes
4,5,Crimes against Public Order
