### 1. Import Packages

In [1]:
!pip install xgboost



In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import dump
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split


### 2. Load Dataset

In [3]:
# Read the processed training features/labels and the processed test features.
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/X_train_master.csv',
        '../data/processed/y_train_master.csv',
        '../data/processed/X_test_master.csv',
    )
)
# Interim (cleaned) test table, loaded separately from the processed features.
df_test = pd.read_csv('../data/interim/test_cleaned_1.csv')


In [4]:
# Flatten the target into a 1-D NumPy array.
# np.asarray accepts both the DataFrame read from CSV and an already-converted
# ndarray, so this cell can be re-run safely; the previous `.values` access
# raised AttributeError on a second run because ndarrays have no `.values`.
y_train = np.asarray(y_train).ravel()

In [5]:
# Sanity check: display the target to confirm it is now a flat 1-D array.
y_train

array([0., 0., 0., ..., 0., 0., 0.], shape=(46314,))

In [6]:
# List the feature column names present in the training frame.
X_train.columns

Index(['yr_Jr', 'yr_So', 'yr_Sr', 'GP', 'Min_per', 'Ortg', 'usg', 'eFG',
       'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM', 'FTA',
       'FT_per', 'twoPM', 'twoPA', 'twoP_per', 'TPM', 'TPA', 'TP_per',
       'blk_per', 'stl_per', 'ftr', 'ht', 'porpag', 'adjoe', 'pfr', 'Rec_Rank',
       'ast_tov', 'rimmade', 'rimmade_rimmiss', 'midmade', 'midmade_midmiss',
       'rim_ratio', 'mid_ratio', 'dunksmade', 'dunksmiss_dunksmade',
       'dunks_ratio', 'pick', 'drtg', 'adrtg', 'dporpag', 'stops', 'bpm',
       'obpm', 'dbpm', 'gbpm', 'mp', 'ogbpm', 'dgbpm', 'oreb', 'dreb', 'treb',
       'ast', 'stl', 'blk', 'pts'],
      dtype='object')

In [7]:
# Hold out 30% of the labelled rows as a validation set, stratified on the
# target so both parts keep the same class proportions.
# NOTE: X_train / y_train are rebound to the remaining 70%, so re-running this
# cell without reloading the data splits the already-reduced set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=42,
    stratify=y_train,
)

In [23]:
# Hyperparameters for the binary-logistic XGBoost classifier, kept in one
# place so they are easy to read and tweak.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=150,
    learning_rate=0.01,
    max_depth=4,
    subsample=1,
    colsample_bytree=0.7,
    gamma=1,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training data; as the cell's last expression the fitted
# estimator is also displayed.
xgb_model.fit(X_train, y_train)


In [24]:
# Persist the fitted classifier so it can be reloaded without retraining.
# The parent directory is created first: dump() opens the target file directly
# and fails with FileNotFoundError when the folder does not exist yet.
# `Path` and `dump` come from the notebook's import cell.
model_path = '../models/xgb_best.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(xgb_model, model_path)

['../models/xgb_best.joblib']

In [25]:
# Positive-class (class 1) probabilities for the training and validation
# features; column 1 of predict_proba's output is kept for each.
y_train_probs, y_val_probs = (
    xgb_model.predict_proba(features)[:, 1]
    for features in (X_train, X_val)
)

In [26]:
# ROC AUC on the same rows the model was fitted on (training score).
auc_score = roc_auc_score(y_true=y_train, y_score=y_train_probs)
print(f"AUC Score: {auc_score:.4f}")

AUC Score: 0.9987


In [27]:
# ROC AUC on the held-out validation rows; note this rebinds `auc_score`.
auc_score = roc_auc_score(y_true=y_val, y_score=y_val_probs)
print(f"AUC Score: {auc_score:.4f}")

AUC Score: 0.9977


In [12]:
# Predict probabilities for the test set
# Column 1 of predict_proba's output is kept, matching the train/validation cells.
# NOTE(review): this cell's execution count (12) is lower than the fit cell's (23),
# so the stored results may come from an earlier model — re-run after fitting.
y_test_pred_proba = xgb_model.predict_proba(X_test)[:, 1]


In [13]:
# Build the submission table: the test players' ids plus a 'drafted' column
# holding the predicted probabilities ('player_id' must exist in df_test).
# NOTE(review): assumes df_test rows are in the same order as X_test — confirm.
output_df = df_test[['player_id']].assign(drafted=y_test_pred_proba)

In [14]:
# Preview the submission frame before it is written to disk.
output_df

Unnamed: 0,player_id,drafted
0,23549e01-c1b3-4ca0-a0fd-de9b5d76276b,1.448488e-06
1,52a518bb-b34a-4b43-adee-5e996cb853fa,1.077506e-06
2,ad3d9117-b6bf-4675-ab97-3497acf3e555,7.502640e-06
3,eaf66a5c-6f4c-4070-bc70-a99d731b3740,3.635552e-06
4,55d07491-5bd1-447f-844e-9cb36eaa442e,1.057046e-04
...,...,...
4965,28222513-8a1f-4a48-8fde-16888e9e11ce,4.165525e-04
4966,c32a466d-7a66-47eb-805c-a94e328261bc,1.072063e-06
4967,55f0ddef-9f29-47ae-87b5-da43c687d25c,8.080187e-07
4968,460d6a42-5dbc-48f0-bc94-3650da83f345,8.080187e-07


In [15]:
# Write the predictions to CSV; index=False leaves the row index out of the file.
submission_path = '../data/external/output_xgboost_3.csv'
output_df.to_csv(submission_path, index=False)