In [1]:
#importing Libraries and load data
import pandas as pd
import numpy as np


In [2]:
df=pd.read_csv("crime_dataset_india.csv")
print(df.head())

   Report Number Date Reported Date of Occurrence Time of Occurrence  \
0              1      2/1/2020           1/1/2020            1:11:00   
1              2      1/1/2020           2/1/2020            6:26:00   
2              3      2/1/2020           3/1/2020           14:30:00   
3              4      1/1/2020           4/1/2020           14:46:00   
4              5      1/1/2020           5/1/2020           16:51:00   

        City Crime Description  Victim Age Victim Gender   Weapon Used  \
0  Ahmedabad    IDENTITY THEFT          16             M  Blunt Object   
1    Chennai          HOMICIDE          37             M        Poison   
2   Ludhiana        KIDNAPPING          48             F  Blunt Object   
3       Pune          BURGLARY          49             F       Firearm   
4       Pune         VANDALISM          30             F         Other   

  Case Closed  
0          No  
1          No  
2          No  
3         Yes  
4         Yes  


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40160 entries, 0 to 40159
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Report Number       40160 non-null  int64 
 1   Date Reported       40160 non-null  object
 2   Date of Occurrence  40160 non-null  object
 3   Time of Occurrence  40160 non-null  object
 4   City                40160 non-null  object
 5   Crime Description   40160 non-null  object
 6   Victim Age          40160 non-null  int64 
 7   Victim Gender       40160 non-null  object
 8   Weapon Used         34370 non-null  object
 9   Case Closed         40160 non-null  object
dtypes: int64(2), object(8)
memory usage: 3.1+ MB
None


In [4]:
print(df.isnull().sum())

Report Number            0
Date Reported            0
Date of Occurrence       0
Time of Occurrence       0
City                     0
Crime Description        0
Victim Age               0
Victim Gender            0
Weapon Used           5790
Case Closed              0
dtype: int64


In [5]:
df = df.dropna(subset=['Weapon Used'])


In [6]:
print(df.isnull().sum())

Report Number         0
Date Reported         0
Date of Occurrence    0
Time of Occurrence    0
City                  0
Crime Description     0
Victim Age            0
Victim Gender         0
Weapon Used           0
Case Closed           0
dtype: int64


In [7]:
#cleaning the column names
df.columns = df.columns.str.strip().str.replace(' ', '_').str.replace('-', '_')


In [8]:
# Convert to datetime
df['Date_Reported'] = pd.to_datetime(df['Date_Reported'], errors='coerce')
df['Date_of_Occurrence'] = pd.to_datetime(df['Date_of_Occurrence'], errors='coerce')

# Convert time to proper format (HH:MM:SS)
df['Time_of_Occurrence'] = pd.to_datetime(df['Time_of_Occurrence'], format='%H:%M:%S', errors='coerce').dt.time


In [9]:
# Merging date and time in one column
df['DateTime_of_Occurrence'] = pd.to_datetime(df['Date_of_Occurrence'].astype(str) + ' ' +
                                              df['Time_of_Occurrence'].astype(str), errors='coerce')


In [10]:
print(df.columns)

Index(['Report_Number', 'Date_Reported', 'Date_of_Occurrence',
       'Time_of_Occurrence', 'City', 'Crime_Description', 'Victim_Age',
       'Victim_Gender', 'Weapon_Used', 'Case_Closed',
       'DateTime_of_Occurrence'],
      dtype='object')


In [11]:
# Now drop the old date and time columns
df.drop(['Date_of_Occurrence', 'Time_of_Occurrence'], axis=1, inplace=True)

In [12]:
print(df.columns)

Index(['Report_Number', 'Date_Reported', 'City', 'Crime_Description',
       'Victim_Age', 'Victim_Gender', 'Weapon_Used', 'Case_Closed',
       'DateTime_of_Occurrence'],
      dtype='object')


In [13]:
# Standardize text (remove extra spaces, make consistent case)
df['City'] = df['City'].str.strip().str.title()
df['Crime_Description'] = df['Crime_Description'].str.strip().str.upper()
df['Victim_Gender'] = df['Victim_Gender'].str.upper()
df['Weapon_Used'] = df['Weapon_Used'].str.strip().str.title()
df['Case_Closed'] = df['Case_Closed'].str.strip().str.capitalize()


In [14]:
# Convert age to numeric
df['Victim_Age'] = pd.to_numeric(df['Victim_Age'], errors='coerce')

# Handle unrealistic ages (e.g. <0 or >120)
df.loc[(df['Victim_Age'] < 0) | (df['Victim_Age'] > 120), 'Victim_Age'] = np.nan


In [15]:
#Handling Duplicate Values
df.drop_duplicates(inplace=True)


In [16]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['City', 'Crime_Description', 'Victim_Gender', 'Weapon_Used', 'Case_Closed']

encoder = LabelEncoder()
for col in label_cols:
    df[col] = encoder.fit_transform(df[col].astype(str))


In [17]:
#Now saving the cleaned dataset
df.to_csv("crime_data_cleaned.csv",index=False)

In [18]:
import pandas as pd

# Ensure Date is in datetime format
df['DateTime_of_Occurrence'] = pd.to_datetime(df['DateTime_of_Occurrence'], errors='coerce')

df['Hour'] = df['DateTime_of_Occurrence'].dt.hour
df['DayOfWeek'] = df['DateTime_of_Occurrence'].dt.dayofweek  # 0 = Monday, 6 = Sunday
df['Month'] = df['DateTime_of_Occurrence'].dt.month

In [19]:
def risk_from_time(hour):
    if 22 <= hour or hour <= 5:
        return 'High'
    else:
        return 'Low'


In [20]:
high_risk_crimes = ['ASSAULT', 'ROBBERY', 'HOMICIDE', 'RAPE', 'KIDNAPPING', 'BURGLARY']

def risk_from_crime(crime):
    if pd.isna(crime):
        return 'Low'
    for keyword in high_risk_crimes:
        if keyword.lower() in crime.lower():
            return 'High'
    return 'Low'

In [21]:
df.columns = df.columns.str.strip()  # removes leading/trailing spaces
print(df.columns)
type(df)
df['Hour'] = pd.to_numeric(df['Hour'], errors='coerce')
print(df['Hour'].isnull().sum())  # check how many nulls


Index(['Report_Number', 'Date_Reported', 'City', 'Crime_Description',
       'Victim_Age', 'Victim_Gender', 'Weapon_Used', 'Case_Closed',
       'DateTime_of_Occurrence', 'Hour', 'DayOfWeek', 'Month'],
      dtype='object')
20900


In [22]:
def calculate_risk_score(row):
    score = 0

    # --- Time-based risk ---
    time_risk = risk_from_time(row['Hour'])
    if time_risk == 'High':
        score += 2

    # --- Crime-based risk ---
    crime_risk = risk_from_crime(str(row['Crime_Description']))
    if crime_risk == 'High':
        score += 2

    # --- Final numeric risk category ---
    # If total score >= 2 â†’ High risk, else Low
    if score >= 2:
        return 2   # High
    else:
        return 1   # Low


# --- Apply to DataFrame ---
df.columns = df.columns.str.strip()  # Clean column names
df['Hour'] = pd.to_numeric(df['Hour'], errors='coerce')

# Apply the function row-wise
df['Risk_Score'] = df.apply(calculate_risk_score, axis=1)

# Map numeric to categorical labels
df['Risk'] = df['Risk_Score'].map({1: 'Low', 2: 'High'})

# Check distribution
print(df['Risk'].value_counts())


Risk
Low     29843
High     4527
Name: count, dtype: int64


In [23]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
X = df[['City', 'Hour', 'DayOfWeek', 'Month']]
y = df['Risk']


In [24]:
print(df['Risk'].value_counts())


Risk
Low     29843
High     4527
Name: count, dtype: int64


In [25]:
print(df.columns.tolist())
print("\nClass distribution:")
print(y.value_counts())


['Report_Number', 'Date_Reported', 'City', 'Crime_Description', 'Victim_Age', 'Victim_Gender', 'Weapon_Used', 'Case_Closed', 'DateTime_of_Occurrence', 'Hour', 'DayOfWeek', 'Month', 'Risk_Score', 'Risk']

Class distribution:
Risk
Low     29843
High     4527
Name: count, dtype: int64


In [26]:
print(df['Risk'].value_counts())


Risk
Low     29843
High     4527
Name: count, dtype: int64


In [27]:
print(len(df))

34370


In [27]:
# Drop rows where key time-related columns are missing
df = df.dropna(subset=['DateTime_of_Occurrence', 'Hour', 'Month', 'DayOfWeek'])

# Optional: Reset the index after dropping rows
df = df.reset_index(drop=True)

print(f"âœ… Cleaned dataset shape: {df.shape}")
print(df[['DateTime_of_Occurrence', 'Hour', 'Month', 'DayOfWeek']].isnull().sum())


âœ… Cleaned dataset shape: (13470, 14)
DateTime_of_Occurrence    0
Hour                      0
Month                     0
DayOfWeek                 0
dtype: int64


In [28]:
print(len(df))

13470


In [29]:
print(df['Risk'].value_counts())

Risk
Low     8943
High    4527
Name: count, dtype: int64


In [30]:
# Step 3: Split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
le = LabelEncoder()
y_encoded = le.fit_transform(y)   
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Step 4: Handle missing values (SMOTE canâ€™t process NaN)
X_train = X_train.fillna(0)

# Step 5: Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 6: Check class distribution after balancing
unique, counts = np.unique(y_train_res, return_counts=True)
print(dict(zip(unique, counts)))

{np.int64(0): np.int64(23874), np.int64(1): np.int64(23874)}


In [31]:
print(len(df))

13470


In [32]:
print("Original training size:", len(X_train))
print("After SMOTE:", len(X_train_res))


Original training size: 27496
After SMOTE: 47748


In [33]:
import xgboost
print(xgboost.__version__)
print(xgboost.__file__)
!pip install xgboost 1.7.4
print(xgboost.__version__)

3.1.0
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\__init__.py
3.1.0


ERROR: Could not find a version that satisfies the requirement 1.7.4 (from versions: none)

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for 1.7.4


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# --- Step 1: Define features and target ---
X = df[['City', 'Hour', 'DayOfWeek', 'Month', 'Crime_Description']]
y = df['Risk'].map({'Low': 0, 'High': 1})

# --- Step 2: Encode categorical columns ---
X_encoded = X.copy()
label_encoders = {}
for col in X_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

# --- Step 3: Split dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

xgb_model_less_overfit = XGBClassifier(
    n_estimators=50,        # More trees but weaker (with lower learning rate)
    learning_rate=0.05,     # Softer learning rate
    max_depth=2,            # Shallower trees â€” prevents memorization
    min_child_weight=12,    # Each leaf needs more samples
    subsample=0.5,          # Use only 50% of samples per tree
    colsample_bytree=0.5,   # Use only 50% of features per tree
    gamma=1,                # Penalize unnecessary splits heavily
    reg_lambda=15,          # Very strong L2 regularization
    reg_alpha=6,            # Strong L1 regularization
    random_state=42,
    objective='binary:logistic',
    eval_metric='logloss'
)


# --- Step 5: Early stopping to prevent overfitting ---
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# --- Step 6: Evaluate model ---
y_pred = xgb_model.predict(X_test)

print("ðŸ”¹ XGBoost Model Evaluation (Regularized) ðŸ”¹")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


NameError: name 'xgb_model' is not defined

In [104]:
#Now Trying to Reduce Accuracy as my model is overfitting
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# --- Step 1: Define features and target ---
X = df[['City', 'Hour', 'DayOfWeek', 'Month', 'Crime_Description']]
y = df['Risk'].map({'Low': 0, 'High': 1})

# --- Step 2: Encode categorical columns ---
X_encoded = X.copy()
label_encoders = {}
for col in X_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

# --- Step 3: Split dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

xgb_model_less_overfit = XGBClassifier(
    n_estimators=50,        # More trees but weaker (with lower learning rate)
    learning_rate=0.05,     # Softer learning rate
    max_depth=2,            # Shallower trees â€” prevents memorization
    min_child_weight=7,    # Each leaf needs more samples
    subsample=0.6,          # Use only 50% of samples per tree
    colsample_bytree=0.5,   # Use only 50% of features per tree
    gamma=5,                # Penalize unnecessary splits heavily
    reg_lambda=15,          # Very strong L2 regularization
    reg_alpha=6,            # Strong L1 regularization
    random_state=42,
    early_stopping_rounds=10,
    objective='binary:logistic',
    eval_metric='logloss'
)


# --- Step 5: Early stopping to prevent overfitting ---
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# --- Step 6: Evaluate model ---
y_pred = xgb_model.predict(X_test)

print("ðŸ”¹ XGBoost Model Evaluation (Regularized) ðŸ”¹")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ðŸ”¹ XGBoost Model Evaluation (Regularized) ðŸ”¹
Accuracy: 1.0
Confusion Matrix:
 [[1789    0]
 [   0  905]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1789
           1       1.00      1.00      1.00       905

    accuracy                           1.00      2694
   macro avg       1.00      1.00      1.00      2694
weighted avg       1.00      1.00      1.00      2694



In [37]:
# --- Step 1: Imports ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Step 2: Define features and target ---
X = df[['City', 'Hour', 'DayOfWeek', 'Month', 'Crime_Description']]
y = df['Risk'].map({'Low': 0, 'High': 1})  # Encode target variable

# --- Step 3: Encode categorical columns ---
X_encoded = X.copy()
label_encoders = {}
for col in X_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    label_encoders[col] = le

# --- Step 4: Split dataset ---
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# --- Step 5: Define Random Forest model (with anti-overfitting settings) ---
rf_model = RandomForestClassifier(
    n_estimators=30,        # number of trees
    max_depth=3,             # shallow trees to avoid memorization
    min_samples_split=30,    # require more samples to split
    min_samples_leaf=10,     # require more samples per leaf
    max_features=0.5,        # use only 50% of features per tree
    bootstrap=True,          # use bootstrapped samples
    oob_score=True,          # out-of-bag validation
    random_state=42,
    class_weight='balanced'  # handle imbalance
)

# --- Step 6: Train model ---
rf_model.fit(X_train, y_train)

# --- Step 7: Evaluate model ---
y_pred = rf_model.predict(X_test)

print("ðŸŒ² Random Forest Model Evaluation ðŸŒ²")
print("OOB Score:", rf_model.oob_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ðŸŒ² Random Forest Model Evaluation ðŸŒ²
OOB Score: 0.9741091314031181
Accuracy: 0.9970304380103935
Confusion Matrix:
 [[1789    0]
 [   8  897]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1789
           1       1.00      0.99      1.00       905

    accuracy                           1.00      2694
   macro avg       1.00      1.00      1.00      2694
weighted avg       1.00      1.00      1.00      2694

