In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, root_mean_squared_error,r2_score
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from pytorch_tabnet.tab_model import TabNetRegressor

from hyperopt import fmin, tpe, hp, Trials

import sys
import os

# Add the parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from pre_processing import preprocess_data


# Run on the GPU when PyTorch reports a usable CUDA device; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [2]:
def load_data(path=r"globalterrorismdb_2021Jan-June_1222dist.xlsx"):
    """Read the incident workbook into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the Excel workbook. Defaults to the file this notebook was
        written against (judging by its name, the January-June 2021 extract),
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The workbook contents as parsed by ``pd.read_excel`` with its default
        options.
    """
    return pd.read_excel(path)


## Data Loading and Preprocessing

In [3]:
# Load the raw workbook and hand it straight to the project's preprocessing
# step, which returns three values: the features, the target, and the
# processed frame (previewed at the end of this cell).
X, y, dataset = preprocess_data(load_data())

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

dataset.head()

Unnamed: 0,success,gname_freq,city_freq,country_freq,attacktype1_score,targtype1_score,weaptype1_score,gname_score,country_score,city_score,...,nkill_likelihood_score,region_3,region_5,region_6,region_8,region_9,region_10,region_11,region_12,nkill
0,1.0,0.228571,0.066667,0.167228,0.875,1.0,0.857143,0.977778,0.948718,0.123077,...,0.589934,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0
1,1.0,0.121429,0.033333,0.096521,0.5,0.842105,0.857143,0.911111,0.871795,0.030769,...,0.347526,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,0.0,0.033333,0.159371,0.875,0.894737,0.857143,0.0,0.641026,0.015385,...,0.47705,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.121429,0.033333,0.096521,0.875,0.894737,0.857143,0.911111,0.871795,0.092308,...,0.503175,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0
4,1.0,0.0,0.033333,0.093154,0.875,1.0,0.857143,0.022222,0.74359,0.030769,...,0.543654,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


## 1) ML: Linear Regression, Ridge Regression, Random Forest, Gradient Boosting
## Hyperparameter Tuning and Model Training

In [4]:
# Candidate hyperparameter values per model, keyed by the same names used in
# `models` below. An empty grid means "fit once with the default settings".
param_grids = {
    'LinearRegression': {},
    'Ridge': {'alpha': [0.1, 0.3, 0.5, 1.0, 3.0, 5.0, 7.0, 10.0, 12.0, 15.0, 20.0]},
    'RandomForestRegressor': {
        'n_estimators': [100, 200, 300, 500, 1000],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'GradientBoostingRegressor': {
        'n_estimators': [100, 200, 300, 500, 1000],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7, 10]
    }
}

# Untuned base estimators. The tree ensembles get a fixed seed so that
# re-running this cell selects the same hyperparameters.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}

# Keep the outcome of every search so later cells can reuse the tuned models
# instead of re-typing the printed parameters and retraining from scratch.
tuned_models = {}   # model name -> best estimator, refit on all of X_train
tuned_cv_mae = {}   # model name -> mean 5-fold cross-validated MAE of that estimator

for model_name, model in models.items():
    print(f"\nTuning {model_name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=5,
        # scikit-learn maximises scores, so MAE is exposed negated.
        scoring='neg_mean_absolute_error',
        # The candidate fits are independent; spread them over all cores.
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # With the default refit=True the winning configuration has already been
    # retrained on the whole training set and is available as best_estimator_.
    tuned_models[model_name] = grid_search.best_estimator_
    # Flip the sign back so the stored/printed value is a plain MAE.
    tuned_cv_mae[model_name] = -grid_search.best_score_

    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {tuned_cv_mae[model_name]}")



Tuning LinearRegression...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters for LinearRegression: {}
Best score for LinearRegression: 3.14772052212996

Tuning Ridge...
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Best parameters for Ridge: {'alpha': 7.0}
Best score for Ridge: 3.12866805133494

Tuning RandomForestRegressor...
Fitting 5 folds for each of 60 candidates, totalling 300 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters for RandomForestRegressor: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best score for RandomForestRegressor: 1.4757409752759452

Tuning GradientBoostingRegressor...
Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters for GradientBoostingRegressor: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}
Best score for GradientBoostingRegressor: 1.4996786781382998
