In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE

In [8]:
import os
print("Current working directory:", os.getcwd())


Current working directory: C:\Users\nikhi\AIML


In [5]:
def map_object_of_search(crime):
    if crime in ['HOMICIDE', 'ROBBERY', 'ASSAULT', 'DOMESTIC VIOLENCE','KIDNAPPING','SEXUAL ASSAULT']:
        return 'Offensive weapons'   # These crimes often involve knives, blunt objects, etc.
    elif crime in ['FIREARM OFFENSE']:
        return 'Firearms'
    elif crime in ['BURGLARY', 'VEHICLE - STOLEN', 'SHOPLIFTING', 'FRAUD', 'IDENTITY THEFT','COUNTERFEITING']:
        return 'Stolen goods'
    elif crime in ['DRUG OFFENSE']:
        return 'Controlled drugs'
    else:
        return 'Other'

df['Object_of_Search'] = df['Crime Description'].apply(map_object_of_search)

In [6]:
weapon_mapping = {
    'Other': 0,
    'Stolen goods' : 1,
    'Offensive weapons' : 2,
    'Firearms' : 3,
    'Controlled drugs' : 4
}

df['Weapon_Danger_Level'] = df['Object_of_Search'].map(weapon_mapping)

In [7]:
domain_encoder = LabelEncoder()
gender_encoder = OrdinalEncoder(categories=[['F','M','X']])
weapon_enc = OrdinalEncoder(categories=[['Other' ,'Blunt Object','Firearm', 'Knife' ,'Poison' ,'Explosives']],
                         dtype=int,
                         handle_unknown="use_encoded_value",
                         unknown_value=-1)

df["Weapon_encoded"] = weapon_enc.fit_transform(df[["Weapon Used"]].astype(str))
df["Genderencode"] = gender_encoder.fit_transform(df[["Victim Gender"]].astype(str))



df["Closed_Encoded"] = (df["Case Closed"] == "Yes").astype(int)

In [8]:
df['City'].unique()

array(['Mumbai', 'Kolkata', 'Delhi', 'Ghaziabad', 'Jaipur', 'Chennai',
       'Rajkot', 'Lucknow', 'Srinagar', 'Hyderabad', 'Bangalore',
       'Bhopal', 'Patna', 'Kanpur', 'Surat', 'Ahmedabad', 'Ludhiana',
       'Visakhapatnam', 'Thane', 'Nashik', 'Agra', 'Vasai', 'Varanasi',
       'Meerut', 'Kalyan', 'Pune', 'Nagpur', 'Faridabad', 'Indore'],
      dtype=object)

In [9]:
city_coords = {
    "Ahmedabad": (23.0225, 72.5714),
    "Chennai": (13.0827, 80.2707),
    "Ludhiana": (30.9005, 75.8573),
    "Pune": (18.5204, 73.8567),
    "Delhi": (28.7041, 77.1025),
    "Mumbai": (19.0760, 72.8777),
    "Surat": (21.1702, 72.8311),
    "Visakhapatnam": (17.6868, 83.2185),
    "Bangalore": (12.9716, 77.5946),
    "Kolkata": (22.5726, 88.3639),
    "Ghaziabad": (28.6692, 77.4538),
    "Hyderabad": (17.3850, 78.4867),
    "Jaipur": (26.9124, 75.7873),
    "Lucknow": (26.8467, 80.9462),
    "Bhopal": (23.2599, 77.4126),
    "Patna": (25.5941, 85.1376),
    "Kanpur": (26.4499, 80.3319),
    "Varanasi": (25.3176, 82.9739),
    "Nagpur": (21.1458, 79.0882),
    "Meerut": (28.9845, 77.7064),
    "Thane": (19.2183, 72.9781),
    "Indore": (22.7196, 75.8577),
    "Rajkot": (22.3039, 70.8022),
    "Vasai": (19.3919, 72.8397),
    "Agra": (27.1767, 78.0081),
    "Kalyan": (19.2403, 73.1305),
    "Nashik": (19.9975, 73.7898),
    "Srinagar": (34.0837, 74.7973),
    "Faridabad": (28.4089, 77.3178)
}

df["Lat"] = df["City"].apply(lambda x: city_coords.get(x, (np.nan, np.nan))[0])
df["Lon"] = df["City"].apply(lambda x: city_coords.get(x, (np.nan, np.nan))[1])



In [10]:
def dangerzone(domain):
    if domain in ['Other Crime', 'Fire Accident']:
        return 0  
    elif domain in ['Traffic Fatality', 'Violent Crime']:
        return 1  
    else:
        return 0 

    
    
df["DangerZone"] = df["Crime Domain"].apply(dangerzone)

In [11]:
crime_encoder = OrdinalEncoder(
    categories=[[
        'PUBLIC INTOXICATION',
        'TRAFFIC VIOLATION',
        'VANDALISM',
        'SHOPLIFTING',
        'CYBERCRIME',
        'COUNTERFEITING',
        'IDENTITY THEFT',
        'FRAUD',
        'VEHICLE - STOLEN',
        'ILLEGAL POSSESSION',
        'BURGLARY',
        'EXTORTION',
        'DRUG OFFENSE',
        'ARSON',
        'FIREARM OFFENSE',
        'ROBBERY',
        'DOMESTIC VIOLENCE',
        'ASSAULT',
        'SEXUAL ASSAULT',
        'KIDNAPPING',
        'HOMICIDE'
    ]],
    dtype=int,
    handle_unknown="use_encoded_value",
    unknown_value=-1)
df['Crime_Encoded'] = crime_encoder.fit_transform(df[["Crime Description"]].astype(str))
df['DayorNight'] = df['Hour'].apply(lambda x : 0 if x < 18 else 1)
df['Month'] = df["Date of Occurrence"].dt.month
df['Year'] = df["Date of Occurrence"].dt.year
df['weekday'] = df["Date of Occurrence"].dt.weekday
def month_to_india_season(month):
    if month in [1, 2]:
        return 0
    elif 3 <= month <= 5:
        return 1
    elif 6 <= month <= 9:
        return 2
    else:  # 10, 11, 12
        return 3

df['Season'] = df['Month'].apply(month_to_india_season)

In [12]:
def agebucketingg(age):
    if age >= 65:
        return 0
    elif 55 <= age <= 64:
        return 1
    elif 35 <= age <= 54:
        return 2
    elif 25 <= age <= 34:
        return 3
    elif 18 <= age <= 24:
        return 4
    elif 12 <= age <= 17:
        return 5
    else:  
        return 6

df['Agebucketing'] = df['Victim Age'].apply(agebucketingg)
df['Agebucketing']


33107    5
21722    2
33589    2
21274    3
20210    6
        ..
24650    0
34423    5
16614    3
24059    1
4613     2
Name: Agebucketing, Length: 6870, dtype: int64

In [13]:
print(df.isnull().sum())


Report Number             0
Date Reported             0
Date of Occurrence        0
Time of Occurrence     4229
City                      0
Crime Code                0
Crime Description         0
Victim Age                0
Victim Gender             0
Weapon Used               0
Crime Domain              0
Police Deployed           0
Case Closed               0
Date Case Closed       3443
Hour                      0
Object_of_Search          0
Weapon_Danger_Level       0
Weapon_encoded            0
Genderencode              0
Closed_Encoded            0
Lat                       0
Lon                       0
DangerZone                0
Crime_Encoded             0
DayorNight                0
Month                     0
Year                      0
weekday                   0
Season                    0
Agebucketing              0
dtype: int64


In [14]:
df['Hour_sin'] = np.sin(2*np.pi*df['Hour']/24)
df['Hour_cos'] = np.cos(2*np.pi*df['Hour']/24)
df['Month_sin'] = np.sin(2*np.pi*df['Month']/12)
df['Month_cos'] = np.cos(2*np.pi*df['Month']/12)
df['weekday_sin'] = np.sin(2*np.pi*df['weekday']/6)
df['weekday_cos'] = np.sin(2*np.pi*df['weekday']/6)


In [15]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Assume X has 'Lat' and 'Lon'
coords = df[['Lat', 'Lon']].copy()

# Standardize coordinates for clustering
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

# KMeans clustering to find crime hotspots
kmeans = KMeans(n_clusters=10, random_state=42)
df['Hotspot_ID'] = kmeans.fit_predict(coords_scaled)

# Distance to cluster center (how close each point is to hotspot center)
df['ClusterDist'] = np.min(kmeans.transform(coords_scaled), axis=1)




In [16]:
# Example: rolling crime count per cluster over last 7 days
# Assume X has 'Date' column in datetime format and 'Hotspot_ID'
# Every row is a crime, so set indicator to 1
df['Crime_Happened'] = 1

df = df.sort_values(['Hotspot_ID', 'Date of Occurrence'])
df = df.set_index('Date of Occurrence')

df['Past_7day_CrimeCount'] = df.groupby('Hotspot_ID')['Crime_Happened']\
                               .rolling('7D', min_periods=1).sum()\
                               .reset_index(0, drop=True)

df['Past_30day_CrimeCount'] = df.groupby('Hotspot_ID')['Crime_Happened']\
                                .rolling('30D', min_periods=1).sum()\
                                .reset_index(0, drop=True)

# Normalize (optional)
df['Past_7day_CrimeCount'] /= df.groupby('Hotspot_ID')['Crime_Happened'].transform('max')
df['Past_30day_CrimeCount'] /= df.groupby('Hotspot_ID')['Crime_Happened'].transform('max')

df = df.reset_index()


In [17]:
print(df.corr(numeric_only=True)["DangerZone"].sort_values(ascending=False))

DangerZone               1.00
Crime_Encoded            0.17
Weapon_Danger_Level      0.14
DayorNight               0.02
Genderencode             0.02
Agebucketing             0.02
Hour_cos                 0.01
Past_30day_CrimeCount    0.01
Month_sin                0.01
weekday_cos              0.01
weekday_sin              0.01
Crime Code               0.01
Hour                     0.01
weekday                  0.00
Lat                      0.00
Past_7day_CrimeCount     0.00
Hotspot_ID               0.00
Year                    -0.00
Report Number           -0.00
Police Deployed         -0.00
Closed_Encoded          -0.00
Month                   -0.00
Season                  -0.00
Hour_sin                -0.01
Lon                     -0.01
Weapon_encoded          -0.01
Month_cos               -0.02
ClusterDist             -0.02
Victim Age              -0.02
Crime_Happened            NaN
Name: DangerZone, dtype: float64


In [18]:
X = df[['Past_30day_CrimeCount','ClusterDist','Hour','Month_sin','Month_cos','Year','weekday_sin','weekday_cos','Season','DayorNight','Agebucketing','Genderencode','Past_7day_CrimeCount','Closed_Encoded','Weapon_Danger_Level']]
y = df["DangerZone"]  

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X,y)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,stratify=y_res, test_size=0.2, random_state=42)






In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score,classification_report

model = RandomForestClassifier(n_estimators=2000,max_depth=15,           # max depth of each tree
    min_samples_split=10,    
    min_samples_leaf=5,     
    max_features='sqrt',    
    bootstrap=True,random_state=42, class_weight='balanced')
model.fit(X_res,y_res)

y_pred = model.predict(X_test)
acc = accuracy_score(y_test,y_pred)

In [20]:
acc*100

89.53678474114442

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       918
           1       0.90      0.89      0.89       917

    accuracy                           0.90      1835
   macro avg       0.90      0.90      0.90      1835
weighted avg       0.90      0.90      0.90      1835



In [26]:
X_res['Past_30day_CrimeCount'].mean()

np.float64(18.915116192156525)