In [3]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 2.1 MB/s eta 0:01:12
   ---------------------------------------- 0.5/150.0 MB 2.1 MB/s eta 0:01:12
   ---------------------------------------- 0.8/150.0 MB 1.4 MB/s eta 0:01:47
   ---------------------------------------- 1.0/150.0 MB 1.1 MB/s eta 0:02:14
   ---------------------------------------- 1.0/150.0 MB 1.1 MB/s eta 0:02:14
   ---------------------------------------- 1.3/150.0 MB 986.4 kB/s eta 0:02:31
   ---------------------------------------- 1.6/150.0 MB 942.3 kB/s eta 0:02:38
   ---------------------------------------- 1.8/150.0 MB 986.7 kB/s eta 0:02:31
    --------------------------------------- 2.1/150.0 MB 1.0 MB/s eta 0:02:28
    --------------------------------------- 2.4/150.0 MB 1.0 MB/s eta 0:0

In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

###############################################################################
# 1) DEFINE YOUR BEST PARAMETERS
###############################################################################
# Booster hyperparameters, written with XGBoost's native parameter names.
# NOTE(review): presumably the result of an earlier tuning run that is not part
# of this notebook -- confirm where these values came from.
best_params = {
    "eta": 0.058750805632663855,              # learning rate (step-size shrinkage)
    "max_depth": 14,                          # maximum depth of a tree
    "min_child_weight": 8,                    # minimum hessian sum required in a child
    "subsample": 0.8573692394884126,          # fraction of rows sampled per tree
    "colsample_bytree": 0.5112956650097914,   # fraction of columns sampled per tree
    "gamma": 0.12690348538351684,             # minimum loss reduction to make a split
    "lambda": 0.0020353232412188764,          # L2 regularisation on leaf weights
    "alpha": 0.3367680481657853,              # L1 regularisation on leaf weights
    "grow_policy": "lossguide",               # split the node with the largest loss change first
}

###############################################################################
# 2) LOAD & PREPARE YOUR DATA (Replace with your code)
###############################################################################
# The training table is a single CSV that holds the feature columns together
# with the label column named by `target_col`.
df = pd.read_csv("transformed_training_data.csv")

# Every column other than the label is used as a model input.
target_col = "Prediction"
features = [name for name in df.columns if name != target_col]

# Work on copies so later preprocessing cannot write back into `df`.
X = df.loc[:, features].copy()
y = df.loc[:, target_col].copy()

# Optional preprocessing hooks (none are applied here):
#   * one-hot encode object-dtype columns, e.g.
#       cat_cols = X.select_dtypes(include=['object']).columns
#       if len(cat_cols) > 0:
#           X = pd.get_dummies(X, columns=cat_cols)
#   * more advanced steps such as scaling or KNN imputation.

###############################################################################
# 3) SPLIT DATA & (OPTIONALLY) APPLY SMOTE
###############################################################################
# Hold out 20% of the rows for validation; stratifying on `y` keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample only the training part. The validation part is left untouched so
# that it still reflects the original class distribution.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print(f"Train shape: {X_train.shape} Validation shape: {X_val.shape}")

###############################################################################
# 4) BUILD & TRAIN THE XGBOOST MODEL WITH BEST PARAMS
###############################################################################
# The target is treated as multi-class: "multi:softprob" makes the booster
# output one probability per class, and num_class is the number of distinct
# labels found in the full target `y`.
# For a binary target use objective="binary:logistic" and do not set num_class.
best_params["objective"] = "multi:softprob"
best_params["num_class"] = len(np.unique(y))

# `use_label_encoder` is deliberately NOT passed: current XGBoost releases no
# longer have that option and only emit
#   Parameters: { "use_label_encoder" } are not used.
# when it is supplied, so dropping it changes nothing except removing the warning.
model = xgb.XGBClassifier(
    **best_params,
    random_state=42,         # fixed seed so row/column subsampling is repeatable
    eval_metric="mlogloss",  # metric for evaluation sets (none are passed to fit() here)
)

print("Training the XGBoost model with best parameters...")
model.fit(X_train, y_train)
print("Training complete.")

###############################################################################
# 5) EVALUATE ON THE VALIDATION SET
###############################################################################
# Predicted class labels for the held-out rows.
y_pred = model.predict(X_val)

# Overall fraction of validation rows that were labelled correctly.
acc = accuracy_score(y_val, y_pred)
print("\nValidation Accuracy: {:.4f}".format(acc))

# Per-class precision / recall / F1. Read this next to the accuracy, because a
# single class can score poorly while the overall number still looks fine.
print("\nClassification Report:", classification_report(y_val, y_pred), sep="\n")


Train shape: (56254, 66) Validation shape: (12000, 66)
Training the XGBoost model with best parameters...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

Validation Accuracy: 0.8539

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      1127
           1       0.93      0.97      0.95      1279
           2       0.88      0.93      0.90      1129
           3       0.83      0.86      0.84      1163
           4       0.83      0.85      0.84      1111
           5       0.82      0.89      0.85      1026
           6       0.90      0.95      0.92      1127
           7       0.86      0.92      0.89      1186
           8       0.81      0.85      0.83      1114
           9       0.78      0.83      0.81      1130
          10       0.07      0.00      0.01       608

    accuracy                           0.85     12000
   macro avg       0.78      0.82      0.80     12000
weighted avg       0.82      0.85      0.83     12000



In [6]:
import pickle

# Persist the fitted classifier to disk so it can be reloaded without retraining.
# NOTE(review): a pickle is tied to the library versions that wrote it and must
# only be loaded from a trusted source; XGBoost's own model.save_model() is the
# more portable format if this file has to outlive the current environment.
with open("xgb1_model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(model))