In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Let pandas render up to 500 columns before it starts eliding them.
pd.options.display.max_columns = 500

In [None]:
# Shell escape: list the contents of the dataset folder under ../input.
!ls ../input/indian-crimes-dataset

# Problem Description
**This is a dataset of the types of crimes committed in India. It has several variables, such as the reported cases, the dates and times the cases occurred, demographic details like age and gender, and the location where the cases occurred. I have used this dataset to label cities in India as safe or unsafe using binary classification, and to detail some insights such as the most vulnerable age groups, the times of day that are most unsafe, the types of crimes that are the most violent, the days taken to close cases, and crimes committed by gender.**

**This is an effort on my part to use machine learning to solve social problems and enable more targeted public policy interventions. By applying data science for social good to a problem like this, the public, law enforcers and the government would have the shared knowledge needed to take corrective action at the right time, so that crime rates can be tackled early and significantly reduced.**


In [None]:
# Read the raw crime records from the dataset CSV and preview the first rows.
crime_csv_path = '../input/indian-crimes-dataset/crime_dataset_india.csv'
df = pd.read_csv(crime_csv_path)
df.head()

In [None]:
# Show the frame's shape next to the output of describe() as a single tuple.
frame_shape = df.shape
frame_summary = df.describe()
frame_shape, frame_summary

# Feature Engineering and Data Preparation

#### Feature Engineering Steps

1. Extracted the days taken to close a case from the date-case-closed and date-of-occurrence columns
2. Extracted the time of day when the crime was committed by taking the hour from the time-of-occurrence column
3. Extracted the days taken to report the crime from the date-reported and time-of-occurrence columns
4. Grouped Victim's Age Groups from their Ages

In [None]:
import datetime as dt
# --- Days taken to close a case ---------------------------------------------
# Parse the occurrence date first so it can stand in for missing closing dates.
df['Date of Occurrence'] = pd.to_datetime(df['Date of Occurrence'])

# Cases without a closing date inherit their occurrence date, which gives them
# a closing time of 0 days below.
closing_dates = df['Date Case Closed'].fillna(df['Date of Occurrence'])
df['Date Case Closed'] = pd.to_datetime(closing_dates)

# Whole days between occurrence and closure.
df['Days_to_close_cases'] = (df['Date Case Closed'] - df['Date of Occurrence']).dt.days

# The two date columns have served their purpose.
df = df.drop(columns=['Date Case Closed', 'Date of Occurrence'])

# --- Time of day ------------------------------------------------------------
# The occurrence timestamp is stored as day-month-year hour:minute.
df['Time of Occurrence'] = pd.to_datetime(df['Time of Occurrence'], format='%d-%m-%Y %H:%M')
occurrence_hour = df['Time of Occurrence'].dt.hour

# Hour buckets: [6, 12) Morning, [12, 16) Afternoon, [16, 18) Evening;
# every other hour (and any missing hour) is labelled Night.
hour_buckets = [
    occurrence_hour.ge(6) & occurrence_hour.lt(12),
    occurrence_hour.ge(12) & occurrence_hour.lt(16),
    occurrence_hour.ge(16) & occurrence_hour.lt(18),
]
hour_labels = ['Morning', 'Afternoon', 'Evening']
df['Time of Day'] = np.select(hour_buckets, hour_labels, default='Night')

# --- Days taken to report the crime -----------------------------------------
# Reporting delay = date reported - time of occurrence, in whole days.
df['Date Reported'] = pd.to_datetime(df['Date Reported'], format='%d-%m-%Y %H:%M')
df['Days_taken_to_report_cr'] = (df['Date Reported'] - df['Time of Occurrence']).dt.days

# Both timestamp columns are now redundant.
df = df.drop(columns=['Date Reported', 'Time of Occurrence'])

# --- Victim age groups ------------------------------------------------------
# Age bands: <=12 Child, 13-17 Adolescent, 18-24 Young Adult, 25-39 Adult,
# 40-60 Middle Age; anything else (including a missing age) is labelled Old.
victim_age = df['Victim Age']
age_buckets = [
    victim_age.le(12),
    victim_age.gt(12) & victim_age.lt(18),
    victim_age.ge(18) & victim_age.lt(25),
    victim_age.ge(25) & victim_age.lt(40),
    victim_age.ge(40) & victim_age.le(60),
]
age_labels = ['Child', 'Adolescent', 'Young Adult', 'Adult', 'Middle Age']
df['Victim_age_group'] = np.select(age_buckets, age_labels, default='Old')

# Drop the crime code and the raw age, then preview the engineered frame.
df = df.drop(columns=['Crime Code', 'Victim Age'])
df.head()

In [None]:
# Number of reports for each (crime domain, crime description) pair,
# shown with the most frequent pairs first.
crime_counts = (
    df.groupby(['Crime Domain', 'Crime Description'])[['Report Number']]
    .count()
    .reset_index()
)
crime_counts.sort_values(by='Report Number', ascending=False)

# From this we can clearly see that other crimes are also quite violent. So we will tag other crimes as violent

# Extracting the Target Variable

Before extracting the target variable, I first checked the crime description, the crime domain, and the number of crimes committed in each category.
I then derived a column called Crime_Type: if the crime domain was Violent Crime, or the crime description was Kidnapping, Homicide, or Drug Offense, the record was tagged as more violent; all other records were tagged as less violent.

You can do more research to decide which crimes should be labelled as serious and which ones shouldn't. This assumption can also be based on a rule set by a law enforcement agency or a government body.

1. I checked crime frequency by no. of cases reported and grouped crime zones into high, low and intermediate based on a certain threshold. 

2. For instance, crimes reported more than 1500 were tagged under high crime zones, between 700 to 1500 were tagged in intermediate crime zones and 
   those with lesser than 700 were tagged as safe zones

3. I finally added another column in the main dataset appending these zones as safety zones.
   
4. To extract the target variable, I put a condition: every record in a high crime zone, whether its crime type is serious violence or less violence, is tagged as unsafe,
   and records in the intermediate and low crime zones are tagged as safe.

In [None]:
# Crime_Type: a record is 'serious violence' when its domain is "Violent Crime"
# or its description is one of the listed serious crimes; every other record
# is 'less violence'.
serious_crimes = ['KIDNAPPING', 'HOMICIDE', 'DRUG OFFENSE']
is_serious = df['Crime Domain'].eq("Violent Crime") | df['Crime Description'].isin(serious_crimes)
df['Crime_Type'] = np.where(is_serious, 'serious violence', 'less violence')

# Reports per city, busiest city first.
safety_zones = (
    df.groupby(['City'])['Report Number']
    .count()
    .reset_index()
    .sort_values(by=['Report Number'], ascending=False)
)

# Split the cities into three bands by report count:
#   more than 1500 -> high, 700 to 1500 inclusive -> intermediate, below 700 -> low.
reports_per_city = safety_zones['Report Number']
highZone_crimes = safety_zones.loc[reports_per_city.gt(1500), 'City'].tolist()
intermediateZones_crimes = safety_zones.loc[reports_per_city.between(700, 1500), 'City'].tolist()
lowZone_crimes = safety_zones.loc[reports_per_city.lt(700), 'City'].tolist()

# Label each record with the band of its city; a city found in none of the
# three lists falls back to 'unknown zone'.
choices = ['high crime zone', 'intermediate crime zone', 'low crime zone']
conditions = [
    df['City'].isin(city_list)
    for city_list in (highZone_crimes, intermediateZones_crimes, lowZone_crimes)
]
df['safety_zones'] = np.select(conditions, choices, default='unknown zone')

# safety_tag: high-zone records are 'unsafe', low- and intermediate-zone records
# are 'safe', anything else is 'neutral'.
# NOTE(review): Crime_Type only ever holds the two values assigned above, and the
# rule covers both of them in every zone, so the tag is decided by the zone alone.
record_zone = df['safety_zones']
conditions = [
    record_zone.eq('high crime zone'),
    record_zone.isin(['low crime zone', 'intermediate crime zone']),
]
values = ['unsafe', 'safe']
df['safety_tag'] = np.select(conditions, values, default='neutral')

# Percentage share of each tag.
df['safety_tag'].value_counts(normalize=True) * 100

# Visualizing the Data

I have used   all the features that have been extracted to visually represent what story the data is telling.

**Data Insights**
1. Around half the cases reported occur at night and a quarter reported, occur sometime during the day.
2. Maximum no. of police deployed are in the high crime zone cities and lowest are in the low crime zone cities, so the availability of law enforcers does not seem to be the problem
3. Most cities have roughly the same number of open and closed cases. In some of the high crime zone areas like Delhi, Mumbai and Bangalore, open cases outnumber closed ones, whereas in other cities like Ahmedabad, Jaipur, Lucknow and Surat, the number of closed cases is marginally higher than the number of open cases.
4. The maximum number of crimes are committed against the middle and old age groups, and the fewest against children.
5. The top tier 1 cities in the country are typically unsafe going by the crime frequency and types of crimes committed. Tier 2 and 3 cities are more in the safer zone, meaning that these are also some of the safest most livable cities.
6. Needless to say, most of the crimes are committed against women, with other crimes taking centre stage followed by violent crimes.
7. On average, it takes about 42 days to close cases across all victim age groups.

### Correlation Matrix with Target Feature

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label encode the categorical features to numeric values
label_encoder = LabelEncoder()

# Work on a copy without the helper columns that were only used to build the
# target. 'City_Encoded' is created in a later cell of this notebook, so on a
# fresh kernel it does not exist yet; errors='ignore' drops it when present
# and skips it otherwise instead of raising KeyError.
df1 = df.drop(columns=['safety_zones', 'Crime_Type', 'City_Encoded'], errors='ignore')

# Encoding categorical columns
categorical_columns = ['City', 'Crime Description', 'Victim Gender', 'Weapon Used',
                       'Crime Domain', 'Case Closed', 'Time of Day', 'Victim_age_group', 'safety_tag']  # Add other categorical columns as needed

# The same encoder object is refitted for every column, so each column gets
# its own independent integer coding.
for col in categorical_columns:
    df1[col] = label_encoder.fit_transform(df1[col])

# Calculate the correlation matrix
corr_matrix = df1.corr()

# Correlation of every feature with 'safety_tag', strongest positive first
safety_tag_corr = corr_matrix['safety_tag'].sort_values(ascending=False)

# Plot the heatmap of the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of All Features with Safety Tag')
plt.show()

# Print correlation of all features with safety_tag
print(safety_tag_corr)

In [None]:
# Reports per (time of day, victim age group), largest groups first, with the
# count column renamed to 'Cases_reported'.
time_of_occ = (
    df.groupby(['Time of Day', 'Victim_age_group'])[['Report Number']]
    .count()
    .reset_index()
    .sort_values(by='Report Number', ascending=False)
    .rename(columns={"Report Number": "Cases_reported"})
)

# Collapse that table along each dimension to get totals per time of day and
# per victim age group.
time_of_day_cases = time_of_occ.groupby('Time of Day')['Cases_reported'].sum()
age_groups = time_of_occ.groupby('Victim_age_group')['Cases_reported'].sum()

# Pie chart of the share of cases in each part of the day.
slice_colors = ['#FFC898', '#A1FF8A', '#FFADA6', '#8ACBFF']
fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(time_of_day_cases, labels=time_of_day_cases.index, autopct='%1.1f%%', colors=slice_colors)
ax.set_title("Occurence of Crimes by Time")
plt.show()

In [None]:
# Total police deployed for each (city, safety zone) pair, smallest total first.
police_deployed = (
    df.groupby(['City', 'safety_zones'])['Police Deployed']
    .sum()
    .reset_index()
    .sort_values(by='Police Deployed')
)

# Horizontal bars: one bar per city, coloured by the city's safety zone.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=police_deployed,
    x='Police Deployed',
    y='City',
    hue='safety_zones',
    palette='coolwarm',
    ax=ax,
)

# Title, axis labels and legend.
ax.set_title("Total Police Deployed by City and Safety Zones", fontsize=16, fontweight='bold')
ax.set_xlabel("Total Police Deployed", fontsize=14)
ax.set_ylabel("City", fontsize=14)
ax.legend(title="Safety Zones", loc='upper right')

# Fit everything inside the figure and render it.
fig.tight_layout()
plt.show()

In [None]:
# Number of reports for each (city, case-closed status) pair, largest first.
case_closed = (
    df.groupby(['City', 'Case Closed'])['Report Number']
    .count()
    .reset_index()
    .sort_values(by='Report Number', ascending=False)
)

# Grouped bars: one group per city, one bar per case status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=case_closed, x='City', y='Report Number', hue='Case Closed', ax=ax)

# Title, axis labels and legend.
ax.set_title("Status of cases by City", fontsize=16, fontweight='bold')
ax.set_xlabel("City", fontsize=14)
ax.set_ylabel("Count of cases", fontsize=14)
ax.legend(title="status of cases", loc='upper right')

# Slant the city names so they do not overlap.
plt.xticks(rotation=45, ha='right')

# Fit everything inside the figure and render it.
fig.tight_layout()
plt.show()

In [None]:
# Number of reports per victim age group, returned as a two-column frame.
df.groupby('Victim_age_group', as_index=False)['Report Number'].count()

In [None]:
# Number of reports per victim age group.
age_groups = df.groupby(['Victim_age_group'])['Report Number'].count().reset_index()

# One distinct colour per age group. The feature-engineering step defines six
# age labels, and a fixed four-colour list is recycled by matplotlib once the
# slices outnumber the colours, so different groups would share a colour.
slice_colors = sns.color_palette('pastel', n_colors=len(age_groups))

# Plot a pie chart of each group's share of the reports
plt.figure(figsize=(4, 4))
plt.pie(
    age_groups['Report Number'],
    labels=age_groups['Victim_age_group'],
    autopct='%1.1f%%',
    colors=slice_colors,
)
plt.title("Total Number of Crimes Reported by Age Groups")
plt.show()

In [None]:
# Count of reports per crime domain, split by victim gender.
fig, ax = plt.subplots(figsize=(5, 5))

# Pastel palette for the gender bars.
sns.countplot(data=df, x='Crime Domain', hue='Victim Gender', palette='pastel', ax=ax)

# Title and axis labels.
ax.set_title('Crime Domain vs Victim Gender', fontsize=12, fontweight='bold')
ax.set_xlabel('Crime Domain', fontsize=10)
ax.set_ylabel('Count of Reports', fontsize=10)

# Light dashed gridlines to make bar heights easier to read.
ax.grid(True, linestyle='--', alpha=0.6)

# Slant the domain names so they stay legible.
plt.xticks(rotation=45, ha='right')

# Fit everything inside the figure and render it.
fig.tight_layout()
plt.show()

In [None]:
# Point plot of the days taken to close cases for each victim age group.
fig, ax = plt.subplots(figsize=(5, 5))

sns.pointplot(
    data=df,
    x='Victim_age_group',
    y='Days_to_close_cases',
    palette='viridis',
    markers='o',
    linestyles='-',
    dodge=True,
    ax=ax,
)

# Title and axis labels.
ax.set_title('Days to Close Cases by Victim Age Group (Pointplot)', fontsize=14, fontweight='bold')
ax.set_xlabel('Victim Age Group', fontsize=12)
ax.set_ylabel('Mean Days to Close Cases', fontsize=12)

# Fit everything inside the figure and render it.
fig.tight_layout()
plt.show()

# Data Preprocessing

In [None]:
# Keep only the non-object columns of df and leave out 'Report Number'.
df_num = df.select_dtypes(exclude='object').drop(columns=['Report Number'])

In [None]:
# Count the missing values in every column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Frequency of each value in the 'Weapon Used' column.
weapon_counts = df['Weapon Used'].value_counts()
weapon_counts

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the city names into a new 'City_Encoded' column.
label_encoder = LabelEncoder()
df['City_Encoded'] = label_encoder.fit_transform(df['City'])

# One-hot encode the other categorical columns, keeping every level
# (drop_first=False) and storing the indicators as integers.
dummy_source_columns = ['Victim Gender', 'Case Closed', 'Time of Day', 'Victim_age_group', 'Crime_Type']
catcol = df[dummy_source_columns]
df_categorical = pd.get_dummies(catcol, drop_first=False).astype(int)
df_categorical

In [None]:
# Attach the dummy columns to the engineered frame.
df_final = pd.concat([df, df_categorical], axis=1)

# Columns removed before modelling:
#  - the raw categorical columns that now exist as dummies (plus 'City', which
#    is kept as 'City_Encoded', and 'Weapon Used');
#  - both Crime_Type dummies, because Crime_Type was used to build the target;
#  - 'Crime Description' and 'Crime Domain': they hold raw strings (they are
#    compared against string labels when Crime_Type is built) and Crime_Type
#    was derived from them. Left in place, they end up in the feature matrix
#    and the scikit-learn selectors/estimators fail on non-numeric input.
columns_to_drop = [
    'City', 'Victim Gender', 'Weapon Used', 'Case Closed', 'Time of Day', 'Victim_age_group',
    'Crime_Type_serious violence', 'Crime_Type_less violence',
    'Crime Description', 'Crime Domain',
]
df_final = df_final.drop(columns=columns_to_drop)
df_final

In [None]:
# Crime_Type and safety_zones were the inputs used to build the target, so they
# are removed here; afterwards count the missing values left in each column.
df_final = df_final.drop(columns=['Crime_Type', 'safety_zones'])
df_final.isna().sum()


## Feature Selection

I have used techniques like Recursive Feature Elimination (RFE) and SelectKBest.

In [None]:
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
from sklearn import metrics

In [None]:
# Make the target numeric: safe -> 1, unsafe -> 0.
# Series.map is used rather than replace so the result does not depend on
# pandas silently downcasting an object column to integers, and so that any
# label without a code is detected instead of being left in place as a string.
tag_codes = {'safe': 1, 'unsafe': 0}
encoded_tag = df_final['safety_tag'].map(tag_codes)
if encoded_tag.isna().any():
    unexpected = sorted(df_final.loc[encoded_tag.isna(), 'safety_tag'].astype(str).unique())
    raise ValueError(f"safety_tag contains labels without a numeric code: {unexpected}")
df_final['safety_tag'] = encoded_tag.astype(int)

# 'Report Number' is not used as a feature.
df_final = df_final.drop(columns=['Report Number'])

In [None]:
# Split the modelling frame into the target vector y and the predictor matrix X.
target_column = 'safety_tag'
y = df_final[target_column]
X = df_final.drop(columns=[target_column])

In [None]:
#Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursively discard the weakest predictors, as ranked by a random forest,
# until ten remain. The forest is seeded so that a fresh run of the notebook
# selects the same features; unseeded, the selection can change between runs.
rfe = RFE(RandomForestClassifier(random_state=42), n_features_to_select=10)
rfe = rfe.fit(X, y)

# Names of the columns RFE kept.
imp_vars_RFE = list(X.columns[rfe.support_])
imp_vars_RFE

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Score every predictor against the target with f_classif and keep the ten
# highest-scoring ones.
SKB = SelectKBest(score_func=f_classif, k=10)
SKB.fit(X, y)

# Names of the columns SelectKBest kept.
imp_vars_SKB = X.columns[SKB.get_support()].tolist()
imp_vars_SKB

In [None]:
# After comparing both techniques above, the RFE selection is used as the final
# feature set. assign() builds a new frame that holds the selected columns plus
# the target, rather than writing a column into a slice of df_final (which
# triggers pandas' SettingWithCopyWarning).
final_features = df_final[imp_vars_RFE].assign(safety_tag=df_final['safety_tag'])
final_features

## Train test Split

Install wolta, which is used below to train several models and compare them to find the best one.

In [None]:
!pip install wolta

In [None]:
from wolta.data_tools import multi_split

In [None]:
from sklearn.model_selection import train_test_split
from wolta.data_tools import multi_split
# Split final_features into train and test parts with wolta's multi_split,
# naming 'safety_tag' as the label column. The label outputs (y_trains,
# y_tests) are indexed by label name further down the notebook.
# NOTE(review): 0.2 is presumably the test fraction and times=200 a
# wolta-specific setting; neither meaning is visible here, so confirm both
# against the wolta documentation.
X_train, X_test, y_trains, y_tests = multi_split(final_features, ['safety_tag'], 0.2, times=200)
# Earlier scikit-learn alternative, left disabled by the author:
#X_train, X_test, y_train, y_test = train_test_split(final_X, y, test_size = 0.2)
X_train.shape, X_test.shape

# Model Development

In [None]:
#Build the model
# Import compare_models from wolta, run the listed models on the same
# train/test split, and keep the returned results in `results`.
# NOTE(review): 'clf' presumably selects classification, the second list names
# the algorithms by wolta's short codes and the third list names the metrics to
# report; confirm these against the wolta documentation.
from wolta.model_tools import compare_models

results = compare_models(
    'clf',
    ['ada', 'cat', 'lbm', 'raf', 'dtr', 'ext', 'per', 'rdg'],
    ['acc', 'precision', 'f1'],
    X_train, y_trains['safety_tag'], X_test, y_tests['safety_tag'],
    get_result=True
)

In [None]:
## Fit the model chosen from the comparison above
from sklearn.ensemble import RandomForestClassifier
# confusion_matrix was never imported anywhere in the notebook, so calling it
# raised NameError; both metric helpers are imported here so the cell is
# self-contained.
from sklearn.metrics import classification_report, confusion_matrix

# Author's choice: Random Forest, since all other models seem to be overfitting.
# The forest is seeded so the reported scores are reproducible across runs.
raf_model = RandomForestClassifier(random_state=42)

# Fit the model on training data
raf_model.fit(X_train, y_trains['safety_tag'])

# Predict using the trained model
y_pred = raf_model.predict(X_test)

# Confusion matrix (plotted in the next cell) and per-class report
conf_matrix = confusion_matrix(y_tests['safety_tag'], y_pred)
print(classification_report(y_tests['safety_tag'], y_pred))

In [None]:
# Heatmap of the Random Forest confusion matrix, with the model's class labels
# on both axes.
class_labels = raf_model.classes_
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Random Forest')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()