# CS6140 Final Project: XGBoost Integration

In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import fetch_california_housing

from xgboost_fully_enhanced import train_xgboost_classifier, train_xgboost_regressor

# Download the 'UCI Adult Income' dataset for classification
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
# skipinitialspace=True strips the blank that follows each comma, so missing
# entries reach the NA check as a bare '?'. The marker therefore has to be
# written without a leading space; with ' ?' nothing is flagged as NaN and
# the dropna() below silently keeps every incomplete row.
df = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
df.dropna(inplace=True)

# Label encode + normalization
# Binarize the target first so it is numeric and is skipped by the
# object-dtype selection below.
df['income'] = df['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)
for col in df.select_dtypes(include='object').columns.drop('income', errors='ignore'):
    # Each categorical column gets its own encoder (integer codes per category).
    df[col] = LabelEncoder().fit_transform(df[col])

X = df.drop('income', axis=1).values
y = df['income'].values
X = StandardScaler().fit_transform(X)
# Stratified 80/20 hold-out split. The trainer call below receives the full
# X, y rather than this split.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

# Run XGBoost Classification
print("=== XGBoost Classification on UCI Adult Dataset ===")
feature_names = df.drop('income', axis=1).columns.tolist()
xgb_clf = train_xgboost_classifier(X, y, feature_names=feature_names, n_splits=5, plot=True)

# Download the 'California Housing' dataset for regression
data = fetch_california_housing()
y = data.target
# Standardize the raw feature matrix in one step.
X = StandardScaler().fit_transform(data.data)
# 80/20 hold-out split. The trainer call below receives the full X, y rather
# than this split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Run XGBoost Regression
feature_names = data.feature_names
print("=== XGBoost Regression on California Housing Dataset ===")
xgb_reg = train_xgboost_regressor(
    X,
    y,
    feature_names=feature_names,
    n_splits=5,
    plot=True,
)

from sklearn.metrics import root_mean_squared_error , accuracy_score
# helper function for evaluation
def train_and_evaluate_tree_model(X_train, y_train, X_test, y_test, task='classification'):
    """Fit a 100-tree random forest and print its score on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Feature matrix used for evaluation.
        y_test: Ground-truth targets for ``X_test``.
        task: ``'classification'`` fits a ``RandomForestClassifier`` and
            prints accuracy; any other value fits a
            ``RandomForestRegressor`` and prints RMSE.

    Returns:
        None. The metric is written to stdout.
    """
    is_classification = task == 'classification'

    if is_classification:
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=100)
    else:
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor(n_estimators=100)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if is_classification:
        acc = accuracy_score(y_test, y_pred)
        print("Accuracy:", acc)
    else:
        # root_mean_squared_error already returns the square root and has no
        # `squared` keyword (that flag belonged to mean_squared_error);
        # passing it raises TypeError.
        rmse = root_mean_squared_error(y_test, y_pred)
        print("RMSE:", rmse)

# MLP Model
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedforwardNN(nn.Module):
    """Fully connected network producing one scalar per input row.

    Every hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final
    ``Linear`` maps the last hidden width to a single output unit.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Iterable of hidden-layer widths, applied in order.
        task: When ``'classification'`` the output passes through a sigmoid;
            for any other value the raw linear output is returned.
        dropout_p: Dropout probability used after every hidden activation.
    """

    # The default is a tuple rather than a list so the shared default object
    # cannot be mutated between instantiations.
    def __init__(self, input_dim, hidden_dims=(64, 32), task='classification', dropout_p=0.3):
        super().__init__()
        self.task = task

        layers = []
        prev_dim = input_dim

        for h_dim in hidden_dims:
            # Modules are created in the same order as they are applied.
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.BatchNorm1d(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout_p),
            ])
            prev_dim = h_dim

        self.hidden_layers = nn.Sequential(*layers)
        self.output_layer = nn.Linear(prev_dim, 1)

    def forward(self, x):
        """Run the network on ``x`` and drop the trailing size-1 dimension.

        Returns the sigmoid of the output unit for the classification task,
        otherwise the unbounded linear output.
        """
        x = self.hidden_layers(x)
        x = self.output_layer(x)
        if self.task == 'classification':
            x = torch.sigmoid(x)
        # The output layer has width 1; squeeze dim 1 to get one value per row.
        return x.squeeze(1)

# NOTE(review): 14 presumably corresponds to the Adult feature columns
# (15 named columns minus 'income') -- confirm it matches the data actually
# fed to the model.
input_dim = 14
model = FeedforwardNN(input_dim, task='classification')
# BCELoss expects probabilities; the classification forward pass applies a
# sigmoid to the output unit.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
