# **Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
import pandas as pd


In [None]:
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
df = pd.read_csv(url)

In [None]:
# Feature set including yearly data and engineered features
year_cols = [str(year) for year in range(2002, 2022)]
feature_cols = year_cols + ["State/UT_encoded", "Mean_2002_2021", "Std_2002_2021", "YoY_2021", "Total-Scale"]


In [None]:
# X and y
X = df[feature_cols]
y = df["Total"]

In [None]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
# Predict
y_pred = model.predict(X_test)

In [None]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [None]:
print("Random Forest with All Important Features:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

Random Forest with All Important Features:
MAE: 12235.75
RMSE: 27442.65
R² Score: 0.88


In [None]:
from sklearn.metrics import mean_absolute_percentage_error

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape * 100:.2f}%")


Mean Absolute Percentage Error (MAPE): 26.88%


### Performance Metrics:

| **Metric**                      | **Value**       |
|----------------------------------|------------------|
| Mean Absolute Error (MAE)        | 12,235.75        |
| Root Mean Squared Error (RMSE)   | 27,442.65        |
| R² Score (Coefficient of Determination) | 0.88             |
| Mean Absolute Percentage Error (MAPE) | **26.88%**       |



In [None]:
import joblib
from google.colab import files

In [None]:
# Save the tuned model
joblib.dump(model, 'random_forest.pkl')

['random_forest.pkl']

In [None]:
# Download the file
files.download('random_forest.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **XGBoost Regressor**

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Features and target
features = [
    "State/UT_encoded", "Total", "Total-Scale",
    "Mean_2002_2021", "Std_2002_2021", "YoY_2021"
] + [str(year) for year in range(2002, 2022)]
target = "2021"

In [None]:
X = df[features]
y = df[target]

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Initialize and train model
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)


In [None]:
# Predictions
y_pred = xgb_model.predict(X_test)

In [None]:
# MAE
mae = mean_absolute_error(y_test, y_pred)

# RMSE
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# R²
r2 = r2_score(y_test, y_pred)

# MAPE
mape = (abs((y_test - y_pred) / y_test).mean()) * 100

In [None]:
# Print Results
print("Model: XGBoost Regressor")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Model: XGBoost Regressor
Mean Absolute Error (MAE): 5520.25
Root Mean Squared Error (RMSE): 14918.25
R² Score: 0.23
Mean Absolute Percentage Error (MAPE): 357.49%


### XGBoost Regressor Evaluation Report

| Metric                        | Value       |
|------------------------------|-------------|
| **MAE**                      | 5520.25     |
| **RMSE**                     | 14918.25    |
| **R² Score**                 | 0.23        |
| **MAPE**                     | 357.49%     |



# **Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Initialize and train the model
gbr_model = GradientBoostingRegressor(random_state=42)
gbr_model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = gbr_model.predict(X_test)


In [None]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
mape = (abs((y_test - y_pred) / y_test).mean()) * 100

In [None]:
print(f"Model: Gradient Boosting Regressor")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Model: Gradient Boosting Regressor
Mean Absolute Error (MAE): 226.05
Root Mean Squared Error (RMSE): 299.33
R² Score: 1.00
Mean Absolute Percentage Error (MAPE): 388.54%


### Gradient Boosting Regressor

| Metric                        | Value       |
|------------------------------|-------------|
| **MAE**                      | 226.05   |
| **RMSE**                     | 299.33    |
| **R² Score**                 | 1.00        |
| **MAPE**                     | 388.54%     |



# **LightGBM**

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [None]:
# Define features and target
features = ['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
            '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
            '2018', '2019', '2020', 'State/UT_encoded',
            'Total', 'Mean_2002_2021', 'Std_2002_2021', 'YoY_2021']


In [None]:
target = '2021'

In [None]:
# Split data
from sklearn.model_selection import train_test_split
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Train LightGBM Regressor
model = lgb.LGBMRegressor(random_state=42)
model.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 31, number of used features: 0
[LightGBM] [Info] Start training from score 3046.709677


In [None]:
# Predict
y_pred = model.predict(X_test)


In [None]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

In [None]:
# Print Results
print("Model: LightGBM Regressor")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Model: LightGBM Regressor
Mean Absolute Error (MAE): 8778.85
Root Mean Squared Error (RMSE): 17718.61
R² Score: -0.09
Mean Absolute Percentage Error (MAPE): 10351.35%


### LightGBM Regressor

| Metric                        | Value       |
|------------------------------|-------------|
| **MAE**                      | 8778.85   |
| **RMSE**                     | 17718.61   |
| **R² Score**                 | -0.09    |
| **MAPE**                     | 10351.35%     |



# **Dense Neural Networkwith Keras (TensorFlow)**

In [None]:
!pip install numpy==1.23.5 --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.2.1 requires numpy>=1.26, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
xarray 2025.1.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
albumentations 2.0.5 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
chex 0.1.89 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
bigframes 1.42.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompat

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

OpenCV bindings requires "numpy" package.
Install it via command:
    pip install numpy


In [None]:
# Features and target
features = [
    'State/UT_encoded', 'Total', 'Total-Scale',
    'Mean_2002_2021', 'Std_2002_2021', 'YoY_2021'
]
target = '2021'

In [None]:
X = df[features]
y = df[target]


In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Feature scaling (important for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Build the DNN model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [None]:
# Train the model
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=8, validation_split=0.1, verbose=0)


In [None]:
# Predictions
y_pred = model.predict(X_test_scaled).flatten()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step


In [None]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100


In [None]:
# Results
print("Model: Dense Neural Network (Keras)")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Model: Dense Neural Network (Keras)
Mean Absolute Error (MAE): 7612.53
Root Mean Squared Error (RMSE): 18026.14
R² Score: -0.13
Mean Absolute Percentage Error (MAPE): 427.96%


### Dense Neural Network (Keras)

| Metric                        | Value       |
|------------------------------|-------------|
| **MAE**                      | 7612.53   |
| **RMSE**                     | 18026.14   |
| **R² Score**                 | -0.13    |
| **MAPE**                     | 427.96%     |



# **Combine Gradient Boosting + Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [None]:
# Define individual models
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)


In [None]:
# Combine using VotingRegressor
ensemble = VotingRegressor([('rf', rf), ('gb', gb)])


In [None]:
# Train
ensemble.fit(X_train, y_train)

In [None]:
# Predict
y_pred = ensemble.predict(X_test)

In [None]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / np.clip(np.abs(y_test), 1e-8, None))) * 100


In [None]:
# Results
print("Combined Model: Random Forest + Gradient Boosting")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Combined Model: Random Forest + Gradient Boosting
Mean Absolute Error (MAE): 1120.97
Root Mean Squared Error (RMSE): 2696.36
R² Score: 0.97
Mean Absolute Percentage Error (MAPE): 814.92%


### Combined Model: Random Forest + Gradient Boosting

| Metric                        | Value       |
|------------------------------|-------------|
| **MAE**                      | 1120.97   |
| **RMSE**                     | 2696.36   |
| **R² Score**                 | 0.97    |
| **MAPE**                     | 814.92%     |



# **Tune Gradient Boosting**

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [None]:
# Check for small y_test values
print("Min y_test value:", y_test.min())
print("Count of y_test values < 1000:", (y_test < 1000).sum())

Min y_test value: 5
Count of y_test values < 1000: 5


In [None]:
# SMAPE function
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))


In [None]:
# Hyperparameter tuning
param_dist = {
    "n_estimators": randint(100, 500),
    "learning_rate": uniform(0.01, 0.3),
    "max_depth": randint(3, 10),
    "subsample": uniform(0.6, 0.4)
}

In [None]:
gbr = GradientBoostingRegressor(random_state=42)
random_search = RandomizedSearchCV(
    gbr,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=1
)


In [None]:
# Fit on training data
random_search.fit(X_train, y_train)

In [None]:
# Best model
best_gbr = random_search.best_estimator_

In [None]:
# Predict
y_pred_best_gbr = best_gbr.predict(X_test)


In [None]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred_best_gbr)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_best_gbr))
r2 = r2_score(y_test, y_pred_best_gbr)
smape_value = smape(y_test, y_pred_best_gbr)

In [None]:
# Report
print("Tuned Gradient Boosting Results:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"SMAPE: {smape_value:.2f}%")

Tuned Gradient Boosting Results:
MAE: 261.74
RMSE: 347.33
R² Score: 1.00
SMAPE: 49.33%


### Summary of ML & DL Models:

| Model                   | MAE     | RMSE     | R² Score | MAPE     |
|------------------------|---------|----------|----------|----------|
| **Gradient Boosting**  | **226** | **299**  | **1.00** | 388.54%  |
| **Random Forest**      | 12235   | 27442    | 0.88     | 26.88%   |
| **XGBoost**            | 5520    | 14918    | 0.23     | 357.49%  |
| **LightGBM**           | 8778    | 17718    | -0.09    | 10351.35%|
| **Dense Neural Network with Keras (TensorFlow)**           | 7612.53    | 18026.14    | -0.13    | 427.96%|
| **Random Forest + Gradient Boosting**           | 1120.97    | 2696.36    | 0.97    | 814.92%|
| **Tuned Gradient Boosting**            | 261.74    | 347.33    | 1.00     | 49.33% (SMAPE) |

### Observations:
1. **Gradient Boosting**:
   - **MAE**: 226, **RMSE**: 299, **R²**: 1.00, **MAPE**: 388.54%.
   - *Pros*: Exceptionally low errors with perfect fit (**R² = 1.00**).  
   - *Cons*: Extremely high **MAPE** limits practical usage, likely due to poor predictions for small target values.

2. **Random Forest**:
   - **MAE**: 12235, **RMSE**: 27442, **R²**: 0.88, **MAPE**: 26.88%.  
   - *Pros*: Acceptable **MAPE**, solid R².  
   - *Cons*: Very high absolute errors (MAE, RMSE), making it less competitive.

3. **XGBoost**:
   - **MAE**: 5520, **RMSE**: 14918, **R²**: 0.23, **MAPE**: 357.49%.
   - *Cons*: Poor R² and high errors across metrics; underfitting may be an issue.

4. **LightGBM**:
   - **MAE**: 8778, **RMSE**: 17718, **R²**: -0.09, **MAPE**: 10351.35%.  
   - *Cons*: Negative **R²** suggests underperformance and weak predictions.

5. **Dense Neural Network (Keras/TensorFlow)**:
   - **MAE**: 7612.53, **RMSE**: 18026.14, **R²**: -0.13, **MAPE**: 427.96%.
   - *Cons*: Negative R² and high errors indicate architecture or training issues.

6. **Random Forest + Gradient Boosting**:
   - **MAE**: 1120.97, **RMSE**: 2696.36, **R²**: 0.97, **MAPE**: 814.92%.  
   - *Pros*: Excellent **R²**, low MAE/RMSE.  
   - *Cons*: High **MAPE**, similar to standalone Gradient Boosting.

7. **Tuned Gradient Boosting**:
   - **MAE**: 261.74, **RMSE**: 347.33, **R²**: 1.00, **SMAPE**: 49.33%.  
   - *Pros*: Stellar performance, perfect fit (R² = 1.00), significant improvement in relative error (MAPE → SMAPE).

---

**Best Performing Model**:
   - **Tuned Gradient Boosting** stands out with the lowest errors and balanced metrics. Its SMAPE of **49.33%** is reasonable and better than others.


### Save the Tuned Gradient Boosting Regressor Model

In [None]:
import joblib

# Save the tuned model
joblib.dump(best_gbr, 'tuned_gradient_boosting_model.pkl')


['tuned_gradient_boosting_model.pkl']

# **Linear Regression model**

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Remove total rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Define feature and target columns
year_cols = [str(y) for y in range(2002, 2021)]  # 2002–2020 as input
target_year = "2021"  # predict this

X = filtered_data[year_cols].values
y = filtered_data[target_year].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Output results
print("\n📈 Linear Regression Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



📈 Linear Regression Performance:
Mean Absolute Error (MAE): 804.76
Root Mean Squared Error (RMSE): 1255.22
R2 Score: 0.8028
Mean Absolute Percentage Error (MAPE): 53989456.82%


# **SVR with RBF Kernel**

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Exclude 'TOTAL' rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Select relevant year columns (2002–2021)
year_columns = [str(y) for y in range(2002, 2022)]

# Define features (2002–2020) and target (2021)
X = filtered_data[[str(y) for y in range(2002, 2021)]].values
y = filtered_data["2021"].values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train SVR model with RBF kernel
svr = SVR(kernel='rbf', C=100, gamma='scale', epsilon=0.1)
svr.fit(X_train, y_train)

# Predict
y_pred = svr.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Print results
print("\n📈 SVR Model Performance (Trained on all valid States/UTs):")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



📈 SVR Model Performance (Trained on all valid States/UTs):
Mean Absolute Error (MAE): 1226.44
Root Mean Squared Error (RMSE): 2887.57
R2 Score: -0.0435
Mean Absolute Percentage Error (MAPE): 262578589.13%


# **Polynomial Regression**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Exclude TOTAL rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Select relevant year columns (2002–2021)
year_columns = [str(y) for y in range(2002, 2022)]

# Define features (2002–2020) and target (2021)
X = filtered_data[[str(y) for y in range(2002, 2021)]].values
y = filtered_data["2021"].values

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert features to polynomial features (degree=2)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_scaled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Print results
print("\n📊 Polynomial Regression Performance (degree=2):")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



📊 Polynomial Regression Performance (degree=2):
Mean Absolute Error (MAE): 1307.85
Root Mean Squared Error (RMSE): 2188.27
R2 Score: 0.4007
Mean Absolute Percentage Error (MAPE): 29031073.28%


# **Gaussian Process Regression (GPR)**

In [None]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Exclude TOTAL rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Features and target
X = filtered_data[[str(y) for y in range(2002, 2021)]].values  # 2002–2020
y = filtered_data["2021"].values  # 2021

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define Gaussian Process model
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True)

# Train model
gpr.fit(X_train, y_train)

# Predict
y_pred, y_std = gpr.predict(X_test, return_std=True)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Output results
print("\n📊 Gaussian Process Regression Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



📊 Gaussian Process Regression Performance:
Mean Absolute Error (MAE): 1156.82
Root Mean Squared Error (RMSE): 2611.85
R2 Score: 0.1463
Mean Absolute Percentage Error (MAPE): 46945067.68%


# **KNN Regressor**

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Exclude TOTAL rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Define features (2002-2020) and target (2021)
X = filtered_data[[str(y) for y in range(2002, 2021)]].values
y = filtered_data["2021"].values

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define and train KNN Regressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict
y_pred = knn.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Display results
print("\n📊 KNN Regressor Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



📊 KNN Regressor Performance:
Mean Absolute Error (MAE): 445.68
Root Mean Squared Error (RMSE): 1049.81
R2 Score: 0.8621
Mean Absolute Percentage Error (MAPE): 26500209.19%


# **Decision Tree Regressor**

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Exclude TOTAL rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Define features and target
X = filtered_data[[str(y) for y in range(2002, 2021)]].values  # 2002–2020
y = filtered_data["2021"].values  # Predict 2021

# Feature Scaling (optional but not harmful for Decision Tree)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train model
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Predict
y_pred = dt.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Results
print("\n🌳 Decision Tree Regressor Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



🌳 Decision Tree Regressor Performance:
Mean Absolute Error (MAE): 187.25
Root Mean Squared Error (RMSE): 294.67
R2 Score: 0.9891
Mean Absolute Percentage Error (MAPE): 10000066.22%


# **Locally Weighted Regression (LOWESS)**

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
data = pd.read_csv(url)

# Exclude TOTAL rows
exclude_rows = ["TOTAL (STATES)", "TOTAL (UTS)", "TOTAL (ALL INDIA)"]
filtered_data = data[~data["State/UT"].isin(exclude_rows)]

# Feature years and target year
year_cols = [str(y) for y in range(2002, 2021)]  # Input: 2002–2020
target_year = "2021"

X_all = filtered_data[year_cols].values
y_all = filtered_data[target_year].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Since LOWESS is for 1D regression, we will perform it for each sample individually
y_pred = []

for test_sample in X_test:
    y_pred_sample = []

    # Apply LOWESS for each point in the test sample range
    for i in range(len(test_sample)):
        # Training set for LOWESS
        x = np.arange(len(year_cols))
        y = test_sample

        # Perform LOWESS smoothing
        lowess = sm.nonparametric.lowess(y, x, frac=0.3, return_sorted=False)
        y_pred_sample.append(lowess[-1])  # Last year prediction (2020 as proxy for 2021)

    # Average prediction across all dimensions
    y_pred.append(np.mean(y_pred_sample))

y_pred = np.array(y_pred)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / (y_test + 1e-5))) * 100

# Results
print("\n🔁 LOWESS Regression Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")



🔁 LOWESS Regression Performance:
Mean Absolute Error (MAE): 346.06
Root Mean Squared Error (RMSE): 807.62
R2 Score: 0.9184
Mean Absolute Percentage Error (MAPE): 12500020.86%


### 📊 **Model Performance Comparison Table**

| S.No | Model                                | MAE       | RMSE      | R² Score | MAPE (%)         | Notes                                                                 |
|------|--------------------------------------|-----------|-----------|----------|------------------|-----------------------------------------------------------------------|
| 1    | **Linear Regression**                | 804.76    | 1255.22   | 0.8028   | 53,989,456.82    | Very good R², but MAPE is extremely high (log scale issues?).         |
| 2    | **Support Vector Regression (RBF)**  | 1226.44   | 2887.57   | -0.0435  | 262,578,589.13   | Poor R² and huge error; not suitable without tuning.                 |
| 3    | **Polynomial Regression (deg=2)**    | 1307.85   | 2188.27   | 0.4007   | 29,031,073.28    | Underfits; better than SVR but still weak.                           |
| 4    | **Gaussian Process Regression (GPR)**| 1156.82   | 2611.85   | 0.1463   | 46,945,067.68    | Somewhat better than SVR/Poly, but high errors.                      |
| 5    | **KNN Regressor**                    | 445.68    | 1049.81   | 0.8621   | 26,500,209.19    | Strong R² and decent MAE; MAPE is high but manageable.              |
| 6    | **Decision Tree Regressor**          | **187.25**| **294.67**| **0.9891**| 10,000,066.22    | Best performer overall; very high accuracy but MAPE still high.     |
| 7    | **LOWESS Regression**                | 346.06    | 807.62    | 0.9184   | 12,500,020.86    | Solid choice; great balance between error and R².                    |
| 8    | **Gradient Boosting**                | 226.00    | 299.00    | 1.000    | 388.54           | Excellent fit; extremely high R²; slight tuning may reduce MAE.     |
| 9    | **Random Forest**                    | 12,235.00 | 27,442.00 | 0.88     | 26.88            | High variance; unstable for small dataset.                           |
| 10   | **XGBoost**                          | 5,520.00  | 14,918.00 | 0.23     | 357.49           | Very poor R²; unsuitable here.                                       |
| 11   | **LightGBM**                         | 8,778.00  | 17,718.00 | -0.09    | 10,351.35        | Bad fit; avoid unless large dataset available.                       |
| 12   | **Dense Neural Network (Keras)**     | 7,612.53  | 18,026.14 | -0.13    | 427.96           | Overfits badly; avoid for small datasets.                            |
| 13   | **RF + GB (Hybrid)**                 | 1,120.97  | 2,696.36  | 0.97     | 814.92           | Decent mix but needs tuning for stability.                           |
| 14   | **Tuned Gradient Boosting**          | 261.74    | 347.33    | 1.000    | **49.33 (SMAPE)**| Near-perfect with low symmetric error; top choice if stable.         |

---





# **Hybrid Model - Polynomial Regression and ARIMA**

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Load dataset
url = 'https://raw.githubusercontent.com/SiddardhaShayini/Cyber-Crime-Analysis-and-Prediction-in-India/refs/heads/main/datasets/processed_dataset.csv'
df = pd.read_csv(url)

# Exclude total rows
exclude_states = ['TOTAL (STATES)', 'TOTAL (UTS)', 'TOTAL (ALL INDIA)']
df = df[~df['State/UT'].isin(exclude_states)]

# Let user choose state
states = df['State/UT'].unique()
print("Select a state/UT by number:")
for idx, state in enumerate(states):
    print(f"{idx+1}. {state}")

state_choice = int(input("\nEnter the number of your chosen state: ")) - 1
chosen_state = states[state_choice]

# Get future year
future_year = int(input("Enter future year to predict up to (e.g. 2026): "))

# Prepare data
state_df = df[df['State/UT'] == chosen_state]
years = list(map(int, [str(y) for y in range(2002, 2022)]))
cases = state_df.iloc[0][[str(y) for y in years]].values.astype(float)

# Use last N years for training
N = 7
train_years = np.array(years[-N:]).reshape(-1, 1)
train_cases = np.array(cases[-N:])

# Polynomial regression (degree 3)
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(train_years)
poly_model = LinearRegression()
poly_model.fit(X_poly, train_cases)

# Predict future using Polynomial regression
predict_years = list(range(2022, future_year + 1))
X_future_poly = poly.transform(np.array(predict_years).reshape(-1, 1))
predict_cases_poly = poly_model.predict(X_future_poly)

# ARIMA Model - Apply ARIMA to the last N years of data
arima_model = ARIMA(train_cases, order=(1, 1, 1))  # ARIMA(1, 1, 1) model
arima_model_fit = arima_model.fit()

# Predict future using ARIMA
arima_predict_cases = arima_model_fit.forecast(steps=future_year - 2021)

# Add realistic fluctuation (simulate external influence) for both predictions
np.random.seed(42)  # for reproducibility
fluctuation_poly = np.random.normal(loc=0.0, scale=0.07, size=len(predict_cases_poly))  # ~7% noise for poly
fluctuation_arima = np.random.normal(loc=0.0, scale=0.07, size=len(arima_predict_cases))  # ~7% noise for ARIMA
predict_cases_poly = predict_cases_poly * (1 + fluctuation_poly)
arima_predict_cases = arima_predict_cases * (1 + fluctuation_arima)

# Ensure no negative values
predict_cases_poly = np.clip(predict_cases_poly, a_min=0, a_max=None)
arima_predict_cases = np.clip(arima_predict_cases, a_min=0, a_max=None)

# Combine predictions from Polynomial and ARIMA (Hybrid Model)
hybrid_predict_cases = (predict_cases_poly + arima_predict_cases) / 2  # Simple average of both models

# Combine historical data with predictions from Hybrid Model
all_years = years + predict_years
all_cases_hybrid = list(cases) + list(hybrid_predict_cases)

# Plot only Hybrid Model predictions
fig = go.Figure()

# Plot historical data (Actual)
fig.add_trace(go.Scatter(x=years, y=cases, mode='lines+markers', name='Actual', line=dict(color='skyblue')))

# Plot predicted data from Hybrid Model (Prediction)
fig.add_trace(go.Scatter(x=predict_years, y=hybrid_predict_cases, mode='lines+markers', name='Predicted (Hybrid)', line=dict(color='red', dash='solid')))

fig.update_layout(
    title=f"Crime Prediction for {chosen_state} (Hybrid Model)",
    xaxis_title="Year",
    yaxis_title="Cases Reported",
    template="plotly_dark"
)

fig.show()


Select a state/UT by number:
1. ANDHRA PRADESH
2. ARUNANCHAL PRADESH
3. ASSAM
4. BIHAR
5. CHHATTISGARH
6. GOA
7. GUJARAT
8. HARYANA
9. HIMACHAL PRADESH
10. JAMMU & KASHMIR
11. JHARKHAND
12. KARNATAKA
13. KERALA
14. MADHYA PRADESH
15. MAHARASHTRA
16. MANIPUR
17. MEGHALAYA
18. MIZORAM
19. NAGALAND
20. ODISHA
21. PUNJAB
22. RAJASTHAN
23. SIKKIM
24. TAMIL NADU
25. TELANGANA
26. TRIPURA
27. UTTAR PRADESH
28. UTTARAKHAND
29. WEST BENGAL
30. ANDAMAN & NICOBAR ISLAND
31. CHANDIGARH
32. DADARA & NAGAR HAVELLI
33. DAMAN & DIU
34. NCT OF DELHI
35. LAKSHADWEEP
36. PUDUCHERRY

Enter the number of your chosen state: 24
Enter future year to predict up to (e.g. 2026): 2027


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate MAE, RMSE, and R² for the Hybrid Model prediction
mae = mean_absolute_error(cases, all_cases_hybrid[:len(cases)])  # Comparing actual vs. hybrid prediction
rmse = np.sqrt(mean_squared_error(cases, all_cases_hybrid[:len(cases)]))  # RMSE
r2 = r2_score(cases, all_cases_hybrid[:len(cases)])  # R²

# Print the metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² (Coefficient of Determination): {r2}")


Mean Absolute Error (MAE): 0.0
Root Mean Squared Error (RMSE): 0.0
R² (Coefficient of Determination): 1.0
