# Task 2
## 1. Env set up
1. ... (already done before)
2. pip install scikit-learn(including sklearn, scipy) torch
---
## 2. Learning/Working route
1. figure out task-involved knowledge range, in this case package calling
2. watch some short videos about these package to quickly get knowing what they do, have, and their advantages
3. check their usage in the official documents
4. learn new concepts like OneHot encoding, ask AI about its common realizations, and learn related methods
5. ask AI about common ways of processing numerical data and do it under guidance
6. when visualizing, using AI to help adjust the parameters
---
## 3. Work Sequentially
#### a. Before start
- numpy can read only numerical data, still use pandas to read
- preprocess before further action:
    1. filling missing data(with mean)
    2. encode categorical features and 'international' col(using LabelEncoder)
    3. prepare target var

In [1]:
import numpy as np
import scipy as sp
import sklearn as sk
import pandas as pd

# hyperparameters
lr = 0.0001
epoch = 50
batch = 64

# read data
train_data = pd.read_csv('../MBAAdmission/train.csv')
test_data = pd.read_csv('../MBAAdmission/test.csv')

# preprocess data
def preprocess(data):
    # handle missing values
    num_cols = data.select_dtypes(include=[np.number]).columns
    data[num_cols] = data[num_cols].fillna(data[num_cols].mean())
    for col in data.select_dtypes(include=[object]).columns:
        if col != 'admission':
            data[col]=data[col].fillna(data[col].mode()[0])
    return

preprocess(train_data)
preprocess(test_data)

categorical_cols = train_data.select_dtypes(include=[object]).columns
numeric_cols = train_data.select_dtypes(include=[np.number]).columns

# prepare features and labels

# encode categorical variables
X_train = train_data.drop(columns=['application_id', 'admission'])
X_test = test_data.drop(columns=['application_id', 'admission'])

# simple label encoding for categorical variables
for col in categorical_cols:
    if col in X_train.columns:
        le = sk.preprocessing.LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype(str))
        # handle unseen labels in test set
        if col in X_test.columns:
            X_test[col] = X_test[col].astype(str)
            X_test[col] = X_test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
            
# encode 'international' column
X_train['international'] = X_train['international'].astype(int)
X_test['international'] = X_test['international'].astype(int)

# target variable
y_encoder = sk.preprocessing.LabelEncoder()
y_train = y_encoder.fit_transform(train_data['admission'])

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Classes: {y_encoder.classes_}")

Training set shape: (6095, 8)
Test set shape: (99, 8)
Classes: ['Admit' 'Reject' 'Waitlist']


  data[col]=data[col].fillna(data[col].mode()[0])


#### b. Subtask 1
- call sk..LinearRegression, fit, predict and assess.
- call LogisticReg, got warned that 1000 iter woundn't lead to convergence,
try 3000, the same; 4000, converged.
- output accuracy on both sets.


In [2]:
# Subtask 1: Linear and Logistic Regression
# a. Linear Regression
print("\n--- Linear Regression ---")
linear_reg = sk.linear_model.LinearRegression()
linear_reg.fit(X_train, y_train)

# make predictions
y_train_pred_linear = linear_reg.predict(X_train)
y_test_pred_linear = linear_reg.predict(X_test)

# convert to int for classification
y_train_pred_linear_class = np.round(np.clip(y_train_pred_linear, 0, len(y_encoder.classes_)-1)).astype(int)
y_test_pred_linear_class = np.round(np.clip(y_test_pred_linear, 0, len(y_encoder.classes_)-1)).astype(int)

# Calculate accuracy
train_acc_linear = sk.metrics.accuracy_score(y_train, y_train_pred_linear_class)
test_acc_linear = sk.metrics.accuracy_score(y_encoder.transform(test_data['admission']), y_test_pred_linear_class)

print(f"   Training Accuracy: {train_acc_linear:.4f}")
print(f"   Test Accuracy: {test_acc_linear:.4f}")
print(f"   Linear regression coefficients shape: {linear_reg.coef_.shape}")


# b. Logistic Regression
print("\n--- Logistic Regression ---")
logistic_reg = sk.linear_model.LogisticRegression(max_iter=4000, random_state=42)
logistic_reg.fit(X_train, y_train)

# make predictions
y_train_pred_logistic = logistic_reg.predict(X_train)
y_test_pred_logistic = logistic_reg.predict(X_test)

# Calculate accuracy
train_acc_logistic = sk.metrics.accuracy_score(y_train, y_train_pred_logistic)
test_acc_logistic = sk.metrics.accuracy_score(y_encoder.transform(test_data['admission']), y_test_pred_logistic)

print(f"   Training Accuracy: {train_acc_logistic:.4f}")
print(f"   Test Accuracy: {test_acc_logistic:.4f}")
print(f"   Logistic regression coefficients shape: {logistic_reg.coef_.shape}")

print("\nRegression Done.")


--- Linear Regression ---
   Training Accuracy: 0.8466
   Test Accuracy: 0.3333
   Linear regression coefficients shape: (8,)

--- Logistic Regression ---
   Training Accuracy: 0.8409
   Test Accuracy: 0.3838
   Logistic regression coefficients shape: (3, 8)

Regression Done.


#### c. Subtask 2
- learn the common process using sklearn to train a MLP model
- normalize for better training
- build model
- random_state and '42': a widely used meme seed LOL
- Glad to see 'adam' again btw.

In [3]:
# Subtask 2: sklearn MLP Classifier
print("\n--- MLP Classifier ---")

# normalize features
scaler = sk.preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Scaled Training set shape: {X_train_scaled.shape}")
print(f"Scaled Test set shape: {X_test_scaled.shape}")

# build and train MLP using provided parameters
input_dim = X_train_scaled.shape[1]
output_dim = len(y_encoder.classes_)

print(f"Network Architecture: [{input_dim}, 128] -> [128, 256] -> [256, {output_dim}]")

# build model
mlp = sk.neural_network.MLPClassifier(
    hidden_layer_sizes=(128, 256),  # Two hidden layers: 128 and 256 neurons
    activation='relu',              # ReLU activation
    learning_rate_init=lr,      # lr = 0.0001
    max_iter=epoch,                    # epoch = 50
    batch_size=batch,                  # batch = 64
    random_state=42,
    solver='adam',                  # Adam optimizer (sklearn default)
    early_stopping=False,           # Don't stop early to complete all epochs
    verbose=True                    # Show training progress
)

# train model
print("Training MLP...")
mlp.fit(X_train_scaled, y_train)
print("MLP Training Complete.")

# make predictions
y_train_pred_mlp = mlp.predict(X_train_scaled)
y_test_pred_mlp = mlp.predict(X_test_scaled)

# Calculate accuracy
train_acc_mlp = sk.metrics.accuracy_score(y_train, y_train_pred_mlp)
test_acc_mlp = sk.metrics.accuracy_score(y_encoder.transform(test_data['admission']), y_test_pred_mlp)
print(f"   Training Accuracy: {train_acc_mlp:.4f}")
print(f"   Test Accuracy: {test_acc_mlp:.4f}")
total_params = sum(coef.size for coef in mlp.coefs_) + sum(bias.size for bias in mlp.intercepts_)
print(f"   Total parameters: {total_params}")
print("MLP Done.")


--- MLP Classifier ---
Scaled Training set shape: (6095, 8)
Scaled Test set shape: (99, 8)
Network Architecture: [8, 128] -> [128, 256] -> [256, 3]
Training MLP...
Iteration 1, loss = 0.67510977
Iteration 2, loss = 0.43760726
Iteration 3, loss = 0.40227626
Iteration 4, loss = 0.39255596
Iteration 5, loss = 0.38850201
Iteration 6, loss = 0.38602716
Iteration 7, loss = 0.38351205
Iteration 8, loss = 0.38231942
Iteration 9, loss = 0.38056649
Iteration 10, loss = 0.37944881
Iteration 11, loss = 0.37815923
Iteration 12, loss = 0.37692867
Iteration 13, loss = 0.37576888
Iteration 14, loss = 0.37473733
Iteration 15, loss = 0.37371770
Iteration 16, loss = 0.37291974
Iteration 17, loss = 0.37199046
Iteration 18, loss = 0.37133703
Iteration 19, loss = 0.37025144
Iteration 20, loss = 0.36922436
Iteration 21, loss = 0.36881741
Iteration 22, loss = 0.36800906
Iteration 23, loss = 0.36734866
Iteration 24, loss = 0.36671220
Iteration 25, loss = 0.36567163
Iteration 26, loss = 0.36491075
Iteration 27



#### d. Subtask 3
- install torch
- search and see what 'super' class is: a func allows calling methods from parent class
- why commonly a class is built using Torch but not sklearn: PyTorch is more customizable while scikit-learn provides standard ML.
- adopt the same optimizer 'Adam' as above
- learn the training process using Torch
- learn some apis

In [4]:
# Subtask 3: Torch MLP Classifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# define MLP model
class MLP_PyTorch(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(MLP_PyTorch, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256), 
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x):
        return self.network(x)

# prepare data for PyTorch
X_train_tensor = torch.FloatTensor(X_train_scaled)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_train_tensor = torch.LongTensor(y_train)

# create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# initialize model, loss function, optimizer
model = MLP_PyTorch(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# train model
print("\nTraining PyTorch MLP...")
model.train()
for epoch in range(epoch):
    running_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_x.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{epoch}, Loss: {epoch_loss:.4f}")
    
print("PyTorch MLP Training Complete.")

# evaluate model
model.eval()
with torch.no_grad():
    # Training accuracy
    train_outputs = model(X_train_tensor)
    _, train_preds = torch.max(train_outputs, 1)
    train_acc_torch = (train_preds == y_train_tensor).float().mean().item()
    
    # Test accuracy
    test_outputs = model(X_test_tensor)
    _, test_preds = torch.max(test_outputs, 1)
    test_acc_torch = (test_preds == torch.LongTensor(y_encoder.transform(test_data['admission']))).float().mean().item()
    
print(f"   Training Accuracy: {train_acc_torch:.4f}")
print(f"   Test Accuracy: {test_acc_torch:.4f}")
total_params_torch = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"   Total parameters: {total_params_torch}")

print("PyTorch MLP Done.")


# compare sklearn and PyTorch
print("\n--- Comparison ---")
print(f"Sklearn MLP - Training Accuracy: {train_acc_mlp:.4f}, Test Accuracy: {test_acc_mlp:.4f}, Total Params: {total_params}")
print(f"PyTorch MLP - Training Accuracy: {train_acc_torch:.4f}, Test Accuracy: {test_acc_torch:.4f}, Total Params: {total_params_torch}")
# optimization rate
opt_rate_train = (train_acc_torch - train_acc_mlp) / train_acc_mlp * 100
opt_rate_test = (test_acc_torch - test_acc_mlp) / test_acc_mlp * 100
print(f"Optimization Rate - Torch over Sklearn\n- Training: {opt_rate_train:.2f}%, Test: {opt_rate_test:.2f}%")
print("Comparison Done.")



Training PyTorch MLP...
Epoch 1/0, Loss: 0.8086
Epoch 2/1, Loss: 0.4757
Epoch 3/2, Loss: 0.4118
Epoch 4/3, Loss: 0.3937
Epoch 5/4, Loss: 0.3885
Epoch 6/5, Loss: 0.3860
Epoch 7/6, Loss: 0.3843
Epoch 8/7, Loss: 0.3827
Epoch 9/8, Loss: 0.3814
Epoch 10/9, Loss: 0.3801
Epoch 11/10, Loss: 0.3791
Epoch 12/11, Loss: 0.3780
Epoch 13/12, Loss: 0.3770
Epoch 14/13, Loss: 0.3759
Epoch 15/14, Loss: 0.3753
Epoch 16/15, Loss: 0.3743
Epoch 17/16, Loss: 0.3737
Epoch 18/17, Loss: 0.3731
Epoch 19/18, Loss: 0.3718
Epoch 20/19, Loss: 0.3713
Epoch 21/20, Loss: 0.3705
Epoch 22/21, Loss: 0.3692
Epoch 23/22, Loss: 0.3692
Epoch 24/23, Loss: 0.3680
Epoch 25/24, Loss: 0.3668
Epoch 26/25, Loss: 0.3662
Epoch 27/26, Loss: 0.3656
Epoch 28/27, Loss: 0.3649
Epoch 29/28, Loss: 0.3642
Epoch 30/29, Loss: 0.3632
Epoch 31/30, Loss: 0.3626
Epoch 32/31, Loss: 0.3616
Epoch 33/32, Loss: 0.3608
Epoch 34/33, Loss: 0.3599
Epoch 35/34, Loss: 0.3593
Epoch 36/35, Loss: 0.3587
Epoch 37/36, Loss: 0.3587
Epoch 38/37, Loss: 0.3571
Epoch 

#### Analysis:
##### 1. diff between sklearn and PyTorch
1. Different initialization schemes
2. Different optimization algorithms (sklearn vs PyTorch Adam)
3. Different batch handling
4. Different numerical precision
5. Different regularization defaults
##### 2. diff between train/test accuracy likely due to
1. Overfitting - model memorizes training data
2. Limited training data
3. Model complexity too high for dataset size
4. Distribution shift between train and test sets