In [1]:
from google.colab import files

# Upload the dataset archive through Colab's upload prompt. The next cell
# expects the uploaded file to be named 'processed_data.zip'.
# The value returned by the call is kept in `uploaded`.
uploaded = files.upload()

Saving processed_data.zip to processed_data.zip


In [2]:
import zipfile
import os

# Name of the archive uploaded in the previous cell and the folder
# (relative to the working directory) that it is unpacked into.
zip_file_name = 'processed_data.zip'
extract_dir = 'extracted_data'

# Unpack every member of the archive into the target folder
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# List what was extracted as a quick sanity check
extracted_files = os.listdir(extract_dir)
print(extracted_files)

['processed_data.csv']


In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.3


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import xgboost as xgb

In [5]:
# Load the processed dataset from the folder the archive was extracted into.
# The path is relative, matching the relative 'extracted_data' target used
# when unzipping, so loading does not depend on the working directory
# being '/content'.
data = pd.read_csv('extracted_data/processed_data.csv')

# 1. Modify the target column

In [6]:
# 1. Shift the target labels so that the smallest label becomes 0
class_offset = data['Class'].min()
data['Class'] = data['Class'] - class_offset

# 2. Divide the dataset

In [7]:
# 2. Separate the features (X) from the target (y, the 'Class' column).
#    Stratification is applied later, when the data is split into train/test.
target_col = 'Class'
y = data[target_col]
X = data.drop(target_col, axis=1)

# 3. Split the dataset & Calculate the class weights

In [8]:
# 3. Hold out 20% of the rows as a test set. The split is stratified on y
#    and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
# Labels present in the training target, in sorted order. compute_class_weight
# returns one weight per entry of `classes`, in this same order.
classes = np.unique(y_train)

# 'balanced' weighting gives rarer classes larger weights.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

# Key each weight by its actual class label rather than by its position in
# `classes`; positional keys are only correct when the labels happen to be
# exactly 0..k-1, and silently yield missing (None) weights otherwise.
class_weight_dict = dict(zip(classes.tolist(), class_weights))

# Per-row weights for the training set, passed to fit() as sample_weight
sample_weights = y_train.map(class_weight_dict).to_numpy()

# 4. Define the XGBoost model

In [11]:
# 4. Define the base XGBoost classifier that the grid searches will tune.
#    The fixed settings here are shared by every candidate; `random_state` is
#    the scikit-learn-API parameter for the seed and keeps the row
#    subsampling (subsample=0.9) repeatable.
model = XGBClassifier(eval_metric='aucpr', subsample=0.9, random_state=42)

# 5. Use GridSearchCV to find the best parameters

## Round 1

In [12]:
# Round-1 grid: three values per hyper-parameter (3**4 = 81 combinations)
param_grid_1 = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    gamma=[0, 0.1, 0.2],
    reg_lambda=[1, 5, 10],
)

In [13]:
# Round-1 search: every combination in param_grid_1 is scored with
# one-vs-rest ROC AUC over 5 stratified folds, using all available cores.
cv_round1 = StratifiedKFold(n_splits=5)
grid_search_round1 = GridSearchCV(
    model,
    param_grid_1,
    scoring='roc_auc_ovr',
    n_jobs=-1,
    cv=cv_round1,
    verbose=2,
)

In [14]:
# Run the round-1 search; sample_weight is passed through fit() so the
# class-balancing weights are used when training the candidates.
grid_search_round1.fit(X_train, y_train, sample_weight=sample_weights)
best_params_1 = grid_search_round1.best_params_
print("Best Parameters:", best_params_1)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 7, 'reg_lambda': 5}


## Round 2

In [15]:
# Round-2 grid: a narrower range around the round-1 winners
# (max_depth=7, learning_rate=0.2, gamma=0.1, reg_lambda=5)
param_grid_2 = dict(
    max_depth=[7, 8, 10],
    learning_rate=[0.16, 0.2, 0.3],
    gamma=[0.06, 0.1, 0.15],
    reg_lambda=[4, 5, 6],
)

In [16]:
# Round-2 search: same scoring and 5-fold stratified CV as round 1,
# but over the refined grid in param_grid_2.
cv_round2 = StratifiedKFold(n_splits=5)
grid_search_round2 = GridSearchCV(
    model,
    param_grid_2,
    scoring='roc_auc_ovr',
    n_jobs=-1,
    cv=cv_round2,
    verbose=2,
)

In [17]:
# Run the round-2 search with the same class-balancing sample weights
grid_search_round2.fit(X_train, y_train, sample_weight=sample_weights)
best_params_round2 = grid_search_round2.best_params_
print("Best Parameters after Round 2:", best_params_round2)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters after Round 2: {'gamma': 0.15, 'learning_rate': 0.2, 'max_depth': 7, 'reg_lambda': 5}


### **Since `learning_rate`, `max_depth`, and `reg_lambda` remained unchanged in the second round compared to the first (only `gamma` moved, from 0.1 to 0.15), we can conclude that the parameters have essentially converged.**