In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:

from pymongo import MongoClient

# MongoDB connection
client = MongoClient("mongodb+srv://reshma_madala:admin123@crimewatch.ntwx0xx.mongodb.net/")
db = client["crimesdata"]
collection = db["crimestable"]

# Load data from MongoDB
data = pd.DataFrame(list(collection.find()))
print(f"Loaded {len(data)} records")

# Drop MongoDB's default _id column
data.drop(columns=["_id"], inplace=True)
data.head()


Loaded 40160 records


Unnamed: 0,Report Number,Date Reported,Date of Occurrence,Time of Occurrence,Crime Code,Crime Description,Victim Age,Weapon Used,Police Deployed,Case Closed,...,City_Vasai,City_Visakhapatnam,Crime Domain_Other Crime,Crime Domain_Traffic Fatality,Crime Domain_Violent Crime,Victim Gender_M,Victim Gender_X,Hour of Day,Day of Week,Month
0,1,2020-01-02 00:00:00,2020-01-01 00:00:00,1.0,576,IDENTITY THEFT,16,Blunt Object,13,No,...,False,False,False,False,True,True,False,1,2.0,1.0
1,2,2020-01-01 19:00:00,2020-01-01 01:00:00,6.0,128,HOMICIDE,37,Poison,9,No,...,False,False,True,False,False,True,False,6,2.0,1.0
2,3,2020-01-02 05:00:00,2020-01-01 02:00:00,14.0,271,KIDNAPPING,48,Blunt Object,15,No,...,False,False,True,False,False,False,False,14,2.0,1.0
3,4,2020-01-01 05:00:00,2020-01-01 03:00:00,14.0,170,BURGLARY,49,Firearm,1,Yes,...,False,False,True,False,False,False,False,14,2.0,1.0
4,5,2020-01-01 21:00:00,2020-01-01 04:00:00,16.0,421,VANDALISM,30,Other,18,Yes,...,False,False,True,False,False,False,False,16,2.0,1.0


In [3]:
data.info()
data.describe(include='all')
data.isnull().sum().sort_values(ascending=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40160 entries, 0 to 40159
Data columns (total 47 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Report Number                  40160 non-null  int64  
 1   Date Reported                  40160 non-null  object 
 2   Date of Occurrence             15840 non-null  object 
 3   Time of Occurrence             40160 non-null  float64
 4   Crime Code                     40160 non-null  int64  
 5   Crime Description              40160 non-null  object 
 6   Victim Age                     40160 non-null  int64  
 7   Weapon Used                    40160 non-null  object 
 8   Police Deployed                40160 non-null  int64  
 9   Case Closed                    40160 non-null  object 
 10  Date Case Closed               40160 non-null  object 
 11  City_Ahmedabad                 40160 non-null  bool   
 12  City_Bangalore                 40160 non-null 

Month                            24320
Date of Occurrence               24320
Day of Week                      24320
City_Thane                           0
City_Meerut                          0
City_Mumbai                          0
City_Nagpur                          0
City_Nashik                          0
City_Patna                           0
City_Pune                            0
City_Rajkot                          0
City_Srinagar                        0
City_Surat                           0
City_Varanasi                        0
City_Lucknow                         0
City_Vasai                           0
City_Visakhapatnam                   0
Crime Domain_Other Crime             0
Crime Domain_Traffic Fatality        0
Crime Domain_Violent Crime           0
Victim Gender_M                      0
Victim Gender_X                      0
Hour of Day                          0
City_Ludhiana                        0
Report Number                        0
Date Reported            

In [4]:

# Handle missing values (Fill missing values for "Day of Week", "Month", "Date of Occurrence")
data["Day of Week"].fillna(data["Day of Week"].mode()[0], inplace=True)
data["Month"].fillna(data["Month"].mode()[0], inplace=True)
data["Date of Occurrence"].fillna(method='ffill', inplace=True)  # Fill forward for missing date


  data["Date of Occurrence"].fillna(method='ffill', inplace=True)  # Fill forward for missing date


In [5]:

# Feature Engineering: Extract useful features from 'Date of Occurrence'
data['Date of Occurrence'] = pd.to_datetime(data['Date of Occurrence'])
data['Hour of Day'] = data['Date of Occurrence'].dt.hour
data['Day of Week'] = data['Date of Occurrence'].dt.dayofweek
data['Month'] = data['Date of Occurrence'].dt.month


In [6]:

# Age Group Binning
data['Age Group'] = pd.cut(data['Victim Age'], bins=[0, 12, 18, 65, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])


In [7]:


# Encoding Categorical Features
label_encoder = LabelEncoder()
data['Crime_Description_Code'] = label_encoder.fit_transform(data['Crime Description'])
data['Weapon_Code'] = label_encoder.fit_transform(data['Weapon Used'])

In [8]:
# One-Hot Encoding for 'City_*' and other categorical columns
city_columns = [col for col in data.columns if 'City_' in col]
data = pd.get_dummies(data, columns=city_columns, drop_first=True)

In [9]:
# Drop columns that aren't necessary for prediction
data.drop(columns=['Report Number', 'Date Reported', 'Date Case Closed', 'Case Closed', 'Crime Domain_Other Crime', 'Crime Domain_Traffic Fatality'], inplace=True)


In [10]:

# Select Features (X) and Target (y)
X = data.drop(columns=['Crime_Description_Code', 'Age Group'])
y_crime_type = data['Crime_Description_Code']  # Target for crime type prediction
y_age_group = data['Age Group']  # Target for victim age group prediction

In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_crime_type, test_size=0.2, random_state=42)

In [12]:
# Apply SMOTE to handle imbalanced data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

TypeError: Cannot cast DatetimeArray to dtype float64