## CS6140 Final project 
### Can Tree-Based Models Always Outperform Neural Networks on Tabular Data?
#### Authors: Yuxuan Wang and Pranav Sirnapalli

## Data Loading and Preprocessing 


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import fetch_california_housing

# Download the 'UCI Adult Income' dataset for classification
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]
# skipinitialspace=True strips the blank that follows each comma, so missing
# values reach the parser as '?' (not ' ?'). na_values has to match the
# stripped token, otherwise no row is ever marked as missing.
df = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
df.dropna(inplace=True)

# Binary target: 1 for '>50K', 0 otherwise
df['income'] = (df['income'].str.strip() == '>50K').astype(int)

# Label-encode every remaining string column (income is already numeric here)
for col in df.select_dtypes(include='object').columns.drop('income', errors='ignore'):
    df[col] = LabelEncoder().fit_transform(df[col])

X = df.drop('income', axis=1).values  # transform to numpy
y = df['income'].values

# Split BEFORE scaling so the scaler never sees test-set statistics;
# a fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Keep X standardized as before, but using the training-set statistics.
X = scaler.transform(X)



# # Download 'California Housing' dataset for regression
# data = fetch_california_housing()
# X, y = data.data, data.target

# X = StandardScaler().fit_transform(X)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [10]:
from sklearn.metrics import root_mean_squared_error , accuracy_score
# helper function for evaluation
def train_and_evaluate_tree_model(X_train, y_train, X_test, y_test, task='classification'):
    """Fit a scikit-learn random forest and report its test-set metric.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets.
    task : 'classification' selects a RandomForestClassifier scored with
        accuracy; any other value selects a RandomForestRegressor scored
        with RMSE.

    Returns
    -------
    The metric that was printed (accuracy or RMSE).
    """
    # The imports are function-scoped on purpose: this notebook also defines
    # its own RandomForestClassifier, and the local import guarantees this
    # helper uses scikit-learn's implementation.
    if task == 'classification':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=100)
    else:
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor(n_estimators=100)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if task == 'classification':
        acc = accuracy_score(y_test, y_pred)
        print("Accuracy:", acc)
        return acc

    # root_mean_squared_error already returns the root; it has no `squared`
    # keyword (that belonged to mean_squared_error), so passing it raised TypeError.
    rmse = root_mean_squared_error(y_test, y_pred)
    print("RMSE:", rmse)
    return rmse


## Part 1: Classification – UCI Adult
- Tree: RandomForest / XGBoost
- MLP: PyTorch MLPClassifier
- Metric: Accuracy / AUC

In [6]:
## Custom tree model


In [11]:
## MLP model (implemented with PyTorch)
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedforwardNN(nn.Module):
    """Fully connected network with a single output unit.

    Each hidden width in ``hidden_dims`` contributes a
    Linear -> BatchNorm1d -> ReLU -> Dropout stage. The final linear layer
    maps to one value per sample; for ``task='classification'`` that value
    is passed through a sigmoid, otherwise it is returned as-is.
    """

    def __init__(self, input_dim, hidden_dims=[64, 32], task='classification', dropout_p=0.3):
        super().__init__()
        self.task = task

        # Consecutive pairs of widths give (fan_in, fan_out) for every stage.
        widths = [input_dim, *hidden_dims]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_p),
            ])

        self.hidden_layers = nn.Sequential(*stages)
        # With no hidden layers the output layer reads the raw input width.
        self.output_layer = nn.Linear(widths[-1], 1)

    def forward(self, x):
        """Return one value per input row (dimension 1 is squeezed away)."""
        raw = self.output_layer(self.hidden_layers(x))
        out = torch.sigmoid(raw) if self.task == 'classification' else raw
        return out.squeeze(1)

In [18]:
# Derive the input width from the data instead of hard-coding 14, so this
# cell keeps working when the notebook is switched to another dataset
# (e.g. the California Housing features loaded in the commented-out block).
input_dim = X_train.shape[1]
model = FeedforwardNN(input_dim=input_dim, task='classification')

# Adam optimizer over all trainable parameters of the network
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# BCELoss expects probabilities, which matches the sigmoid that
# FeedforwardNN.forward applies for task='classification'.
criterion = nn.BCELoss()

In [20]:
import numpy as np
from collections import Counter

class DecisionTreeClassifier:
    """Binary decision tree classifier that splits on Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``; a leaf is
    the predicted label itself. Samples with ``x[feature] < threshold`` go
    left, all others go right.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int
        Nodes with fewer samples than this become leaves.
    criterion : str
        Stored on the instance only; split quality is always measured with
        Gini impurity in this implementation.
    max_features : int or None
        Number of features drawn at random (without replacement) for each
        split; ``None`` considers every feature.

    Notes
    -----
    Labels must be non-negative integers because impurity is computed with
    ``np.bincount``. The split search evaluates every unique value of every
    candidate feature, so fitting is slow on large data with
    high-cardinality columns.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion="gini", max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.max_features = max_features
        self.tree = None

    def fit(self, X, y):
        """Build the tree from features ``X`` (n_samples, n_features) and labels ``y``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree; returns a split tuple or a leaf label."""
        n_samples = X.shape[0]
        # max_depth=None means "unlimited"; comparing an int with None would raise.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split or len(np.unique(y)) == 1:
            return self._most_common_label(y)

        best_feat, best_thresh = self._best_split(X, y)
        if best_feat is None:
            # No threshold separates the samples (all candidate columns constant).
            return self._most_common_label(y)

        left_idx = X[:, best_feat] < best_thresh
        right_idx = X[:, best_feat] >= best_thresh
        left = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx], y[right_idx], depth + 1)
        return (best_feat, best_thresh, left, right)

    def _best_split(self, X, y):
        """Return ``(feature, threshold)`` with the highest Gini gain, or ``(None, None)``."""
        n_features = X.shape[1]

        # Decide which features are candidates for this split.
        if self.max_features is None:
            features = range(n_features)
        else:
            max_feats = min(self.max_features, n_features)
            features = np.random.choice(n_features, max_feats, replace=False)

        best_gain = -1
        best_feat, best_thresh = None, None

        for feat in features:
            column = X[:, feat]  # hoisted: reused for every threshold below
            for thresh in np.unique(column):
                left_idx = column < thresh
                right_idx = column >= thresh
                # A split that leaves one side empty separates nothing.
                if not left_idx.any() or not right_idx.any():
                    continue
                gain = self._gini_gain(y, left_idx, right_idx)
                if gain > best_gain:
                    best_gain = gain
                    best_feat, best_thresh = feat, thresh
        return best_feat, best_thresh

    def _gini(self, y):
        """Gini impurity ``1 - sum(p_k^2)`` of the integer label array ``y``."""
        counts = np.bincount(y)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def _gini_gain(self, y, left_idx, right_idx):
        """Impurity decrease obtained by splitting ``y`` into the two selections.

        Each child's impurity is weighted by the fraction of samples it
        receives. The weight must come from the number of *selected*
        samples: ``len()`` of a boolean mask is always ``len(y)``.
        """
        n = len(y)
        y_left = y[left_idx]
        y_right = y[right_idx]
        weighted_children = (
            (len(y_left) / n) * self._gini(y_left)
            + (len(y_right) / n) * self._gini(y_right)
        )
        return self._gini(y) - weighted_children

    def _most_common_label(self, y):
        """Most frequent label in ``y`` (first encountered wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.tree is None:
            raise ValueError("This DecisionTreeClassifier has not been fitted yet.")
        return np.array([self._predict_one(x, self.tree) for x in X])

    def _predict_one(self, x, tree):
        """Walk from ``tree`` down to a leaf for the single sample ``x``."""
        # Iterative descent: unlimited-depth trees cannot hit the recursion limit here.
        node = tree
        while isinstance(node, tuple):
            feat, thresh, left, right = node
            node = left if x[feat] < thresh else right
        return node


class RandomForestClassifier:
    """Ensemble of DecisionTreeClassifier trees trained on bootstrap samples.

    ``n_estimators`` trees are fitted, each on rows drawn with replacement
    from the training data; ``max_depth`` and ``max_features`` are passed
    through to every tree. Predictions are the per-sample majority vote.
    """

    def __init__(self, n_estimators=10, max_depth=None, max_features=None):
        self.n_estimators, self.max_depth, self.max_features = n_estimators, max_depth, max_features
        self.trees = []

    def fit(self, X, y):
        """Discard any previous trees and train a fresh ensemble on ``X``, ``y``."""
        self.trees = []
        for _round in range(self.n_estimators):
            # Bootstrap: as many row indices as there are rows, drawn with replacement.
            bootstrap = np.random.choice(len(X), len(X), replace=True)
            member = DecisionTreeClassifier(max_features=self.max_features, max_depth=self.max_depth)
            member.fit(X[bootstrap], y[bootstrap])
            self.trees.append(member)

    def predict(self, X):
        """Return the most common tree prediction for each row of ``X``."""
        # One row of votes per tree, one column per sample.
        votes = np.array([member.predict(X) for member in self.trees])
        winners = []
        for sample_no in range(X.shape[0]):
            tally = Counter(votes[:, sample_no])
            winners.append(tally.most_common(1)[0][0])
        return np.array(winners)


### Evaluation and Visualization

In [21]:
# from RandomForestClassifier import RandomForestClassifier
from RandomForestRegressor import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, r2_score

def run_rf_classifier(X_train, X_test, y_train, y_test):
    """Train a 100-tree random forest (max_depth=10) and score it on the test set.

    Uses whichever ``RandomForestClassifier`` is bound at module level; the
    model must expose a ``trees`` list whose items have ``predict``.

    Returns
    -------
    dict with keys "Accuracy", "F1" and "AUC". "AUC" is ``None`` when
    ``roc_auc_score`` rejects the inputs with a ``ValueError``.
    """
    import warnings  # function-scope import keeps this cell self-contained

    model = RandomForestClassifier(n_estimators=100, max_depth=10, max_features=None)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Per-tree predictions, shape: (n_trees, n_samples)
    tree_preds = np.array([tree.predict(X_test) for tree in model.trees])
    # Mean of the tree votes, used as the score for AUC
    # (assumes labels are 0/1 so the mean is the share of trees voting 1).
    y_proba = np.mean(tree_preds, axis=0)

    try:
        auc = roc_auc_score(y_test, y_proba)
    except ValueError as err:
        # Keep the best-effort behaviour (AUC omitted) but say why, instead of
        # silently swallowing every possible exception.
        warnings.warn(f"AUC could not be computed: {err}")
        auc = None

    return {
        "Accuracy": acc,
        "F1": f1,
        "AUC": auc
    }

def run_rf_regressor(X_train, X_test, y_train, y_test):
    """Fit a 100-tree RandomForestRegressor (random_state=42) and score it.

    Returns a dict with the test-set "RMSE" (square root of the mean
    squared error) and "R2".
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    return {"RMSE": rmse, "R2": r2}


# Train and score the random forest classifier on the existing train/test split.
results = run_rf_classifier(X_train, X_test, y_train, y_test)

# Show the metrics as a readable one-row table
import pandas as pd
pd.DataFrame([results], index=["Random Forest (Classifier)"])

KeyboardInterrupt: 

## Part 2: Regression – California Housing
- Tree: RandomForestRegressor / XGBoostRegressor
- MLP: PyTorch MLPRegressor
- Metric: RMSE / R²