<a href="https://colab.research.google.com/github/adonoho/Stats285_F23/blob/main/hw6/XGBoost_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install, Load, and Log In to Weights and Biases

In [None]:
!pip install wandb -qU
import wandb
wandb.login()

# Load and Log In to Google BigQuery

In [None]:
# Google Colab
# BigQuery client library; `bigquery.Client` is used when the dataset is queried.
from google.cloud import bigquery
# Colab module that provides the authentication call below.
from google.colab import auth
# Authenticate the current Colab user.
# NOTE(review): presumably this opens an interactive Google sign-in prompt — confirm.
auth.authenticate_user()
# Load the Colab `data_table` IPython extension.
# NOTE(review): presumably this enables interactive DataFrame rendering — confirm.
%load_ext google.colab.data_table

# Load packages

In [None]:
# load necessary functions

# Numpy
import numpy as np

# Scikit-Learn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# XGBoost
import xgboost as xgb

# Pandas
import pandas as pd

# Load the Adult Income Dataset from Google BigQuery

**In the next cell, replace the placeholder string `"suid"` with your actual Stanford ID (SUID). For example, if your SUID were `adonoho`, you would edit the line to read `suid = "adonoho"`.**

In [None]:
# Placeholder: replace the string on the right-hand side with your Stanford ID (SUID).
# NOTE(review): `suid` is not referenced by any other cell visible in this
# notebook — confirm where it is meant to be used.
suid = "suid"

In [None]:
def get_df_from_project(project_id, table_name):
  """Read every row of a BigQuery table and return it as a dataframe.

  Args:
    project_id: String naming the Google Cloud project the BigQuery client
      is created for.
    table_name: String naming the table to read. It is inserted between
      backticks in the query text.

  Returns:
    Whatever `to_dataframe()` yields for the completed query job.

  The query is a plain `SELECT *`, so the full table is loaded. A narrower
  standard-SQL statement could be issued the same way to fetch only part of
  the dataset.
  """
  sql = f"SELECT * FROM `{table_name}`"
  bq_client = bigquery.Client(project=project_id)
  return bq_client.query(sql).to_dataframe()

# Load the adult income dataframe
# (the whole 'XYZ.adult_income' table is read through a client created for
# the 'stanford-stats-285-donoho' project)
df = get_df_from_project('stanford-stats-285-donoho', 'XYZ.adult_income')

# Preview Data

In [None]:
# Display the first rows of the loaded table as this cell's output.
df.head() # preview this dataframe

In [None]:
# Check the datatypes of the columns
# (columns reported as "object" are the ones that get one-hot encoded
# in the data-cleaning step)
df.dtypes

# Clean Data

We need to change all the categorical variables ("object"-types) into columns of 0-1 values indicating if an attribute is present.

For example, a column with entries "democrat", "republican", and "other" would be changed into *three* columns: in the first, there would be 1's indicating when a row is "democrat" and 0's everywhere else; in the second, 1's indicating a row is "republican" and 0's everywhere else; in the third, 1's indicating when a row is "other" and 0's everywhere else.

In [None]:
# Select object columns (the categorical variables)
object_cols = df.select_dtypes(include='object').columns

# One-hot encode these columns.
# dtype=int forces the indicator columns to hold integer 0/1 values, as the
# text above describes. Without it, recent pandas versions produce True/False
# columns instead.
df_encoded = pd.get_dummies(df, columns=object_cols, dtype=int)

# Preview
df_encoded.head()

# Create training and test data

Create training and testing sets with an 80-20 split.

In [None]:
# Data: All except the last two columns which are 0-1 columns for income
# NOTE(review): this positional slice assumes the two income indicator columns
# are the final two columns of df_encoded — confirm with df_encoded.columns[-2:].
X = df_encoded.iloc[:, :-2]

# Target: Whether income is > 50K (the last indicator column)
y = df_encoded.iloc[:, -1]

# Split into train (80%) and test (20%).
# A fixed random_state makes the split identical on every "Restart & Run All",
# so accuracies from different sessions and experiments are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Training and Logging

## Define experiment space to run over
`depth` determines the maximum depth of trees in XGBoost.

`lambda` is the strength of the L2 (ridge) regularization (passed to XGBoost as `reg_lambda`).

In [None]:
# Hyperparameter grid: every depth is paired with every lambda in the sweep.
depth_list = [6, 8, 10]              # candidate maximum tree depths
lambda_list = [0.25, 0.5, 1, 2, 4]   # candidate L2 regularization values

## Run the experiments

### Run **one setting** *(for illustration)*

In [None]:
# Hyperparameters for this single illustrative run
depth = 4
lam = 1
num_rounds = 100  # Number of boosting rounds

model = xgb.XGBClassifier(eval_metric='logloss', n_estimators=num_rounds,
                          max_depth=depth, reg_lambda=lam)
model.fit(X_train, y_train)

# Make predictions on the test set.
# XGBClassifier.predict already returns class labels (not probabilities),
# so the labels are scored directly without an extra 0.5 threshold.
test_preds = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print(f"Test Accuracy: {test_accuracy}")

# Make predictions on the training set
train_preds = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
print(f"Train Accuracy: {train_accuracy}")

### Run **all settings**

In [None]:
# Number of boosting rounds, shared by every experiment. It is passed to the
# model and recorded in the run config so the two can never disagree.
num_rounds = 100

# Every (depth, lambda) combination, in the same order nested loops over
# depth_list and then lambda_list would visit them.
settings = [(depth, lam) for depth in depth_list for lam in lambda_list]

for i, (depth, lam) in enumerate(settings):  # i: index of the experiment

  # 🐝 1️⃣ Start a new run to track this script.
  # Using the run as a context manager guarantees it is closed even if
  # training or evaluation raises, so a failure cannot leave a run open
  # that the next iteration would collide with.
  with wandb.init(
      project="xgboost-example2",
      name=f"experiment_{i}",
      # Track hyperparameters and run metadata
      config={
          "lambda": lam,
          "max_depth": depth,
          "method": "XGBoost",
          "dataset": "adult_income",
          "epochs": num_rounds,
      },
  ) as run:

    # Train the model
    model = xgb.XGBClassifier(eval_metric='logloss', n_estimators=num_rounds,
                              max_depth=depth, reg_lambda=lam)
    model.fit(X_train, y_train)

    # Make predictions on the test set.
    # XGBClassifier.predict already returns class labels, so no extra
    # 0.5 threshold is applied.
    test_preds = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_preds)
    print(f"Test Accuracy: {test_accuracy}")

    # Make predictions on the training set
    train_preds = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_preds)
    print(f"Train Accuracy: {train_accuracy}")

    # 🐝 2️⃣ Log metrics from your script to W&B
    run.log({"method": "XGBoost", "test_err": 1-test_accuracy, "train_err": 1-train_accuracy, "lambda":lam, "depth": depth})

  # Leaving the `with` block marks the run as finished.
