In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
crime = pd.read_csv(r"crime-dataset.csv")

In [3]:
population = pd.read_csv(r"Population.csv")

In [4]:
population['State/UT'].unique()

array(['Uttar Pradesh', 'Maharashtra', 'Bihar', 'West Bengal',
       'Madhya Pradesh', 'Tamil Nadu', 'Rajasthan', 'Karnataka',
       'Gujarat', 'Andhra Pradesh', 'Odisha', 'Telangana', 'Kerala',
       'Jharkhand', 'Assam', 'Punjab', 'Chhattisgarh', 'Haryana',
       'Delhi (UT)', 'Jammu and Kashmir', 'Uttarakhand',
       'Himachal Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
       'Goa', 'Arunachal Pradesh', 'Puducherry (UT)', 'Mizoram',
       'Chandigarh (UT)', 'Sikkim', 'Andaman and Nicobar Islands (UT)',
       'Dadra and Nagar Haveli (UT)', 'Daman and Diu (UT)',
       'Lakshadweep (UT)', 'Total (India)'], dtype=object)

In [5]:
# Standardize state names in crime dataset
crime['STATE/UT'] = crime['STATE/UT'].str.title()

# Define a mapping dictionary for name mismatches
state_mapping = {
    "Jammu & Kashmir": "Jammu and Kashmir",
    "A&N Islands": "Andaman and Nicobar Islands (UT)",
    "D&N Haveli": "Dadra and Nagar Haveli (UT)",
    "Daman & Diu": "Daman and Diu (UT)",
    "Delhi Ut": "Delhi (UT)",
    "Puducherry": "Puducherry (UT)",
    'A & N Islands': 'Andaman and Nicobar Islands (UT)',
    'D & N Haveli': 'Dadra and Nagar Haveli (UT)',
    'Lakshadweep': 'Lakshadweep (UT)',
    'Chandigarh':'Chandigarh (UT)'
}

# Apply mapping
crime['STATE/UT'] = crime['STATE/UT'].replace(state_mapping)


In [6]:
crime.columns

Index(['STATE/UT', 'DISTRICT', 'YEAR', 'MURDER', 'ATTEMPT TO MURDER',
       'CULPABLE HOMICIDE NOT AMOUNTING TO MURDER', 'RAPE', 'CUSTODIAL RAPE',
       'OTHER RAPE', 'KIDNAPPING & ABDUCTION',
       'KIDNAPPING AND ABDUCTION OF WOMEN AND GIRLS',
       'KIDNAPPING AND ABDUCTION OF OTHERS', 'DACOITY',
       'PREPARATION AND ASSEMBLY FOR DACOITY', 'ROBBERY', 'BURGLARY', 'THEFT',
       'AUTO THEFT', 'OTHER THEFT', 'RIOTS', 'CRIMINAL BREACH OF TRUST',
       'CHEATING', 'COUNTERFIETING', 'ARSON', 'HURT/GREVIOUS HURT',
       'DOWRY DEATHS', 'ASSAULT ON WOMEN WITH INTENT TO OUTRAGE HER MODESTY',
       'INSULT TO MODESTY OF WOMEN', 'CRUELTY BY HUSBAND OR HIS RELATIVES',
       'IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES',
       'CAUSING DEATH BY NEGLIGENCE', 'OTHER IPC CRIMES', 'TOTAL IPC CRIMES'],
      dtype='object')

In [7]:
population.columns

Index(['Sl No', 'State/UT', 'Population'], dtype='object')

In [8]:
crime['STATE/UT'].unique()

array(['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh',
       'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala',
       'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
       'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'Andaman and Nicobar Islands (UT)',
       'Chandigarh (UT)', 'Dadra and Nagar Haveli (UT)',
       'Daman and Diu (UT)', 'Delhi (UT)', 'Lakshadweep (UT)',
       'Puducherry (UT)'], dtype=object)

In [9]:

population['State/UT'].unique()

array(['Uttar Pradesh', 'Maharashtra', 'Bihar', 'West Bengal',
       'Madhya Pradesh', 'Tamil Nadu', 'Rajasthan', 'Karnataka',
       'Gujarat', 'Andhra Pradesh', 'Odisha', 'Telangana', 'Kerala',
       'Jharkhand', 'Assam', 'Punjab', 'Chhattisgarh', 'Haryana',
       'Delhi (UT)', 'Jammu and Kashmir', 'Uttarakhand',
       'Himachal Pradesh', 'Tripura', 'Meghalaya', 'Manipur', 'Nagaland',
       'Goa', 'Arunachal Pradesh', 'Puducherry (UT)', 'Mizoram',
       'Chandigarh (UT)', 'Sikkim', 'Andaman and Nicobar Islands (UT)',
       'Dadra and Nagar Haveli (UT)', 'Daman and Diu (UT)',
       'Lakshadweep (UT)', 'Total (India)'], dtype=object)

In [10]:
crime_grouped = crime.groupby(['STATE/UT', 'YEAR']).sum().reset_index()



In [11]:
# Merge state-level aggregated data with population
merged_data = crime_grouped.merge(population, left_on='STATE/UT', right_on='State/UT', how='left')


In [12]:
# Calculate Crime Rate as (Total Crimes / Population) * 100,000
merged_data['Crime Rate'] = (merged_data['TOTAL IPC CRIMES'] / merged_data['Population']) * 100000


In [13]:
merged_data

Unnamed: 0,STATE/UT,YEAR,DISTRICT,MURDER,ATTEMPT TO MURDER,CULPABLE HOMICIDE NOT AMOUNTING TO MURDER,RAPE,CUSTODIAL RAPE,OTHER RAPE,KIDNAPPING & ABDUCTION,...,INSULT TO MODESTY OF WOMEN,CRUELTY BY HUSBAND OR HIS RELATIVES,IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES,CAUSING DEATH BY NEGLIGENCE,OTHER IPC CRIMES,TOTAL IPC CRIMES,Sl No,State/UT,Population,Crime Rate
0,Andaman and Nicobar Islands (UT),2001,ANDAMANNICOBARTOTAL,26,0,0,6,0,6,4,...,2,18,0,0,646,1316,33,Andaman and Nicobar Islands (UT),380581,345.787099
1,Andaman and Nicobar Islands (UT),2002,ANDAMANNICOBARTOTAL,34,6,2,4,0,4,4,...,6,8,0,0,656,1216,33,Andaman and Nicobar Islands (UT),380581,319.511484
2,Andaman and Nicobar Islands (UT),2003,ANDAMANNICOBARTOTAL,42,8,2,4,0,4,4,...,4,14,0,0,636,1288,33,Andaman and Nicobar Islands (UT),380581,338.429927
3,Andaman and Nicobar Islands (UT),2004,ANDAMANNICOBARTOTAL,30,2,4,20,0,20,6,...,6,10,0,12,774,1496,33,Andaman and Nicobar Islands (UT),380581,393.083207
4,Andaman and Nicobar Islands (UT),2005,ANDAMANNICOBARTOTAL,28,6,6,8,0,8,4,...,2,10,0,6,724,1364,33,Andaman and Nicobar Islands (UT),380581,358.399395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,West Bengal,2009,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4136,4238,1862,4672,0,4672,5500,...,216,32224,10,7246,77552,226072,4,West Bengal,91276115,247.679253
451,West Bengal,2010,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4796,4222,1260,4622,0,4622,6690,...,326,35592,16,5694,98192,259232,4,West Bengal,91276115,284.008582
452,West Bengal,2011,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4218,4484,972,4726,0,4726,8570,...,400,39544,0,6498,113228,286394,4,West Bengal,91276115,313.766641
453,West Bengal,2012,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4504,5708,1044,4092,0,4092,10234,...,1112,39730,24,8770,128964,322854,4,West Bengal,91276115,353.711373


In [14]:
merged_data['Crime Rate'].describe()

count     455.000000
mean      356.172850
std       182.949203
min        95.162290
25%       207.591660
50%       331.496682
75%       447.718108
max      1055.700641
Name: Crime Rate, dtype: float64

In [16]:
# Add lag features for TOTAL IPC CRIMES (trends)
merged_data['Previous Year Crimes'] = merged_data.groupby('STATE/UT')['TOTAL IPC CRIMES'].shift(1)

In [17]:
merged_data

Unnamed: 0,STATE/UT,YEAR,DISTRICT,MURDER,ATTEMPT TO MURDER,CULPABLE HOMICIDE NOT AMOUNTING TO MURDER,RAPE,CUSTODIAL RAPE,OTHER RAPE,KIDNAPPING & ABDUCTION,...,CRUELTY BY HUSBAND OR HIS RELATIVES,IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES,CAUSING DEATH BY NEGLIGENCE,OTHER IPC CRIMES,TOTAL IPC CRIMES,Sl No,State/UT,Population,Crime Rate,Previous Year Crimes
0,Andaman and Nicobar Islands (UT),2001,ANDAMANNICOBARTOTAL,26,0,0,6,0,6,4,...,18,0,0,646,1316,33,Andaman and Nicobar Islands (UT),380581,345.787099,
1,Andaman and Nicobar Islands (UT),2002,ANDAMANNICOBARTOTAL,34,6,2,4,0,4,4,...,8,0,0,656,1216,33,Andaman and Nicobar Islands (UT),380581,319.511484,1316.0
2,Andaman and Nicobar Islands (UT),2003,ANDAMANNICOBARTOTAL,42,8,2,4,0,4,4,...,14,0,0,636,1288,33,Andaman and Nicobar Islands (UT),380581,338.429927,1216.0
3,Andaman and Nicobar Islands (UT),2004,ANDAMANNICOBARTOTAL,30,2,4,20,0,20,6,...,10,0,12,774,1496,33,Andaman and Nicobar Islands (UT),380581,393.083207,1288.0
4,Andaman and Nicobar Islands (UT),2005,ANDAMANNICOBARTOTAL,28,6,6,8,0,8,4,...,10,0,6,724,1364,33,Andaman and Nicobar Islands (UT),380581,358.399395,1496.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,West Bengal,2009,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4136,4238,1862,4672,0,4672,5500,...,32224,10,7246,77552,226072,4,West Bengal,91276115,247.679253,210838.0
451,West Bengal,2010,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4796,4222,1260,4622,0,4622,6690,...,35592,16,5694,98192,259232,4,West Bengal,91276115,284.008582,226072.0
452,West Bengal,2011,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4218,4484,972,4726,0,4726,8570,...,39544,0,6498,113228,286394,4,West Bengal,91276115,313.766641,259232.0
453,West Bengal,2012,24 PARGANAS NORTH24 PARGANAS SOUTHASANSOLBANKU...,4504,5708,1044,4092,0,4092,10234,...,39730,24,8770,128964,322854,4,West Bengal,91276115,353.711373,286394.0


In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [88]:
# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Population', 'TOTAL IPC CRIMES']),  # Scale numerical features
        ('cat', OneHotEncoder(), ['STATE/UT'])  # One-hot encode categorical features
    ]
)


In [89]:
# Initialize models
linear_model = LinearRegression()
decision_tree = DecisionTreeRegressor(random_state=42)
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Define pipelines
pipelines = {
    "Linear Regression": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', linear_model)
    ]),
    "Decision Tree": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', decision_tree)
    ]),
    "Random Forest": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', random_forest)
    ]),
    "XGBoost": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb_model)
    ])
}


In [90]:
# Features (X) and target (y)
X = merged_data[['STATE/UT', 'YEAR', 'Population', 'TOTAL IPC CRIMES']]  # Replace with your feature columns
y = merged_data['Crime Rate']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [91]:
X.isna().sum()

STATE/UT            0
YEAR                0
Population          0
TOTAL IPC CRIMES    0
dtype: int64

In [92]:
results = {}

for name, pipeline in pipelines.items():
    print(f"Training {name}...")
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'MAE': mae,
        'MSE': mse,
        'R²': r2
    }

    print(f"{name} Results:")
    print(f"  MAE: {mae}")
    print(f"  MSE: {mse}")
    print(f"  R²: {r2}")
    print("-" * 30)


Training Linear Regression...
Linear Regression Results:
  MAE: 33.88663020644873
  MSE: 2599.361465000865
  R²: 0.934405742037821
------------------------------
Training Decision Tree...
Decision Tree Results:
  MAE: 14.904829494380568
  MSE: 817.8299784304735
  R²: 0.9793622582712428
------------------------------
Training Random Forest...
Random Forest Results:
  MAE: 12.608231933612073
  MSE: 668.4149583378987
  R²: 0.9831327101700414
------------------------------
Training XGBoost...
XGBoost Results:
  MAE: 27.66135733147595
  MSE: 1519.8336742139686
  R²: 0.9616473647746547
------------------------------


In [93]:
# Convert results to a DataFrame for comparison
results_df = pd.DataFrame(results).T  # Transpose for readability
print(results_df)


                         MAE          MSE        R²
Linear Regression  33.886630  2599.361465  0.934406
Decision Tree      14.904829   817.829978  0.979362
Random Forest      12.608232   668.414958  0.983133
XGBoost            27.661357  1519.833674  0.961647
