# Load the libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import xgboost as xgb

# Read the data

In [16]:
csv_file_path = "E:\IIT Chicago\Sem 3\Data Preparation and Analysis - CSP 571\Project\Final Submission\processed_data.csv"
df = pd.read_csv(csv_file_path)

  csv_file_path = "E:\IIT Chicago\Sem 3\Data Preparation and Analysis - CSP 571\Project\Final Submission\processed_data.csv"


# 1. Modify the target column

We are going to transform the target 'Class' column to have values 0,1,2 instead of 1,2,3

In [17]:
# 1. Make the target column values are encoded starting from 0
df['Class'] = df['Class'] - df['Class'].min()

# 2. Divide the dataset

In [18]:
# 2. Divide the dataset into X and y (y is 'Class' column), and stratify the data
X = df.drop(columns=['Class'])
y = df['Class']

# 3. Split the dataset & Calculate the class weights

To ensure the test dataset remains unseen by the model, the same random state used in the hyperparameter tuning process is applied when splitting the data.

In [19]:
# 3. Split it into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [21]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))

# Pass class weights as sample weights
sample_weights = np.vectorize(class_weight_dict.get)(y_train)

# 4. Define the XGBoost model

In [22]:
# 4. Define the XGBoost model
model = XGBClassifier(eval_metric='aucpr', subsample = 0.9, seed = 42)

# 5. Model Training with Hyperparameter

These hyperparameters were obtained as the result of the hyperparameter tuning process.

In [23]:
best_params = {
    'max_depth': 7,
    'learning_rate': 0.2,
    'gamma': 0.15,
    'reg_lambda': 5
}

In [27]:
from sklearn.model_selection import StratifiedKFold

# Prepare the data in DMatrix format (needed for xgboost)
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)

# Set the parameters for training
param = {
    'objective': 'multi:softprob', 
    'num_class': len(np.unique(y_train)),  # Number of classes
    'eval_metric': 'aucpr',  # Use AUC PR (Area Under Precision-Recall Curve)
    **best_params  # Add any additional parameters found in your grid search or previous tuning
}



Cross-validation was used here to evaluate the model's performance across multiple subsets of the data, ensuring a more reliable estimate of its generalization ability by reducing variance and preventing overfitting.

In [28]:
# Set up cross-validation with StratifiedKFold
cv_results = xgb.cv(
    param,
    dtrain,
    num_boost_round=1000,  # Set a large number of rounds to allow early stopping
    nfold=5,  # Use 5-fold cross-validation
    early_stopping_rounds=10,  # Stop after 10 rounds of no improvement
    verbose_eval=True,  # Show evaluation metrics
    stratified=True,  # Ensure stratification for each fold
    metrics={'aucpr'}  # Use AUC PR as the evaluation metric
)

[0]	train-aucpr:0.80019+0.00015	test-aucpr:0.79859+0.00050
[1]	train-aucpr:0.80140+0.00031	test-aucpr:0.79949+0.00048
[2]	train-aucpr:0.80214+0.00060	test-aucpr:0.79987+0.00042
[3]	train-aucpr:0.80258+0.00071	test-aucpr:0.80009+0.00062
[4]	train-aucpr:0.80285+0.00076	test-aucpr:0.80013+0.00069
[5]	train-aucpr:0.80302+0.00069	test-aucpr:0.80011+0.00068
[6]	train-aucpr:0.80324+0.00071	test-aucpr:0.80010+0.00066
[7]	train-aucpr:0.80362+0.00059	test-aucpr:0.80043+0.00088
[8]	train-aucpr:0.80400+0.00034	test-aucpr:0.80072+0.00083
[9]	train-aucpr:0.80420+0.00032	test-aucpr:0.80074+0.00086
[10]	train-aucpr:0.80436+0.00023	test-aucpr:0.80075+0.00089
[11]	train-aucpr:0.80452+0.00021	test-aucpr:0.80072+0.00087
[12]	train-aucpr:0.80473+0.00029	test-aucpr:0.80077+0.00086
[13]	train-aucpr:0.80498+0.00017	test-aucpr:0.80073+0.00087
[14]	train-aucpr:0.80550+0.00032	test-aucpr:0.80075+0.00095
[15]	train-aucpr:0.80608+0.00030	test-aucpr:0.80084+0.00104
[16]	train-aucpr:0.80652+0.00027	test-aucpr:0.8010

In [29]:
# Print the cross-validation results
print("Cross-validation results:")
print(cv_results)

# Get the best iteration (best boosting round) based on the ROC AUC OVR (AUC PR)
best_iteration = cv_results['test-aucpr-mean'].idxmax()
print(f"Best Early Stopping Round (based on AUC PR): {best_iteration}")

# The ROC AUC OVR score corresponding to this best iteration
best_roc_auc_ovr = cv_results.loc[best_iteration, 'test-aucpr-mean']
print(f"Best ROC AUC OVR at Best Iteration: {best_roc_auc_ovr}")

Cross-validation results:
    train-aucpr-mean  train-aucpr-std  test-aucpr-mean  test-aucpr-std
0           0.800187         0.000148         0.798588        0.000497
1           0.801403         0.000308         0.799494        0.000475
2           0.802142         0.000596         0.799870        0.000420
3           0.802579         0.000714         0.800089        0.000621
4           0.802848         0.000757         0.800134        0.000691
5           0.803020         0.000694         0.800113        0.000676
6           0.803241         0.000708         0.800096        0.000665
7           0.803622         0.000593         0.800429        0.000877
8           0.804002         0.000340         0.800722        0.000829
9           0.804204         0.000316         0.800744        0.000856
10          0.804356         0.000231         0.800752        0.000894
11          0.804516         0.000209         0.800718        0.000873
12          0.804735         0.000290         0.800

# 6. Train the Full model

Train the model on the full training dataset with the early stopping round value obtained from previous step, also utilizing feature mapping, to enable model saving with the ONNX library.

In [30]:
# Define the custom feature mapping
feature_mapping = {"B": "f0", "D": "f1", "F": "f2", "I": "f3", "J": "f4", "L": "f5", "M": "f6"}

# Rename columns using the mapping
X_train_renamed = X_train.rename(columns=feature_mapping)
X_test_renamed = X_test.rename(columns=feature_mapping)

# Print the renamed column names for verification
print("Renamed X_train columns:", X_train_renamed.columns)
print("Renamed X_test columns:", X_test_renamed.columns)


Renamed X_train columns: Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6'], dtype='object')
Renamed X_test columns: Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6'], dtype='object')


In [32]:
# Prepare the data in DMatrix format (needed for xgboost)
dtrain = xgb.DMatrix(X_train_renamed, label=y_train, weight=sample_weights)

best_num_boost_round = cv_results.shape[0]

model = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=best_num_boost_round
)


# 7: Evaluate model performance on the test set

In [46]:
# Step 7: Evaluate model performance on the test set

# First, prepare the test set with sample weights
sample_weights_test = np.vectorize(class_weight_dict.get)(y_test)

# Prepare the test data in DMatrix format
dtest = xgb.DMatrix(X_test_renamed, weight=sample_weights_test)

# Predict probabilities on the test set using the best iteration (use softprob for probabilities)
y_pred_prob = model.predict(dtest, iteration_range=(0, best_iteration), output_margin=False)

In [47]:
auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr', average='macro')
print(f"AUC Score on Test Data: {auc:.4f}")

AUC Score on Test Data: 0.8920


In [48]:
# Classification report for detailed metrics (convert probabilities to class labels)
from sklearn.metrics import classification_report
y_pred_classes = y_pred_prob.argmax(axis=1)  # Convert probabilities to class labels
print("Classification Report:")
print(classification_report(y_test, y_pred_classes))  # Display detailed classification metrics

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.56      0.72     36119
           1       0.75      1.00      0.86     89977
           2       0.84      0.74      0.79    113904

    accuracy                           0.81    240000
   macro avg       0.87      0.77      0.79    240000
weighted avg       0.83      0.81      0.80    240000



# 8: Comparing model performance with Naive Bayes model

The basic Naive Bayes model is trained on the original dataset, and performance metrics comparable to those of the XGBoost model are obtained.

In [30]:
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv("E:\IIT Chicago\Sem 3\Data Preparation and Analysis - CSP 571\Project\Final Submission\data_public.csv")


# Split the data into features and target
X = data.drop('Class', axis=1)  # Features
y = data['Class']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Naive Bayes classifier
model = GaussianNB()
model.fit(X_train, y_train)

# Predict probabilities on the test set (needed for AUCPR)
y_pred_prob = model.predict_proba(X_test)  # predict_proba gives probabilities

  data = pd.read_csv("E:\IIT Chicago\Sem 3\Data Preparation and Analysis - CSP 571\Project\Final Submission\data_public.csv")


In [None]:
from sklearn.metrics import classification_report, average_precision_score
from sklearn.preprocessing import label_binarize

# Calculate AUCPR (Area Under the Precision-Recall Curve) 
y_test_binary = label_binarize(y_test, classes=np.unique(y))  

# Compute AUCPR for the positive class 
aupr = average_precision_score(y_test_binary, y_pred_prob, average='macro')
print(f"AUCPR: {aupr:.4f}")

# Classification report
y_pred_classes = model.predict(X_test)  
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))

AUCPR: 0.6396

Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.56      0.53     36189
           2       0.75      1.00      0.86     89845
           3       0.80      0.56      0.66    113966

    accuracy                           0.72    240000
   macro avg       0.68      0.71      0.68    240000
weighted avg       0.74      0.72      0.71    240000



# 9: Download and save the model

In [49]:
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType

In [50]:
# Define the input type for ONNX
initial_type = [('float_input', FloatTensorType([None, X_train_renamed.shape[1]]))]

# Convert the model to ONNX
onnx_model = onnxmltools.convert_xgboost(model, initial_types=initial_type)

In [51]:
import os

# Specify the directory and model name
model_dir = r"E:\IIT Chicago\Sem 3\Data Preparation and Analysis - CSP 571\Project\Final Submission"   # Replace with your desired directory path
model_name = "xgboost_model.onnx"  # You can change the model name if needed

# Create the directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

# Combine directory and model name into a full path
onnx_file_path = os.path.join(model_dir, model_name)

# Save the ONNX model to a file
with open(onnx_file_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"Model saved to {onnx_file_path}")


Model saved to E:\IIT Chicago\Sem 3\Data Preparation and Analysis - CSP 571\Project\Final Submission\xgboost_model.onnx
