In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset, DataLoader

**--- Data Preparation ---**

In [5]:
df = pd.read_csv('adult.csv')
# Clean and preprocess data
df.replace(' ?', np.nan, inplace=True)
df.dropna(how='any', inplace=True)
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()
df = df.drop(columns=['Capital Gain', 'capital loss'])

In [6]:
# Separate columns
categorical_cols = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Gender', 'Native Country']
continuous_cols = ['Age', 'Final Weight', 'EducationNum', 'Hours per Week']
label_col = 'Income'

# Encode categorical features and target label
categorical_dims = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    categorical_dims[col] = len(le.classes_)

le_income = LabelEncoder()
df[label_col] = le_income.fit_transform(df[label_col])

In [7]:
# Split data
X = df[categorical_cols + continuous_cols].values
y = df[label_col].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=25000, test_size=5000, random_state=42, stratify=y
)

In [8]:
# Scale continuous features
scaler = StandardScaler()
X_train_cont = scaler.fit_transform(X_train[:, len(categorical_cols):])
X_test_cont = scaler.transform(X_test[:, len(categorical_cols):])
X_train_cat = X_train[:, :len(categorical_cols)]
X_test_cat = X_test[:, :len(categorical_cols)]

# Convert to tensors
X_train_cat_tensor = torch.LongTensor(X_train_cat)
X_train_cont_tensor = torch.FloatTensor(X_train_cont)
y_train_tensor = torch.LongTensor(y_train)

X_test_cat_tensor = torch.LongTensor(X_test_cat)
X_test_cont_tensor = torch.FloatTensor(X_test_cont)
y_test_tensor = torch.LongTensor(y_test)

In [9]:
# Create TensorDataset and DataLoader
train_dataset = TensorDataset(X_train_cat_tensor, X_train_cont_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_cat_tensor, X_test_cont_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)


**--- Model Design ---**

In [10]:


class TabularModel(nn.Module):
    def __init__(self, categorical_dims, continuous_input_dim, output_dim, hidden_dim, p):
        super(TabularModel, self).__init__()
        self.all_cat_dims = categorical_dims
        self.embedding_layers = nn.ModuleList([
            nn.Embedding(num_embeddings=dim, embedding_dim=min(50, (dim + 1) // 2))
            for col, dim in categorical_dims.items()
        ])
        
        # Calculate total embedding dimension
        self.cat_embed_dim = sum(e.embedding_dim for e in self.embedding_layers)
        
        self.batch_norm = nn.BatchNorm1d(continuous_input_dim)
        
        # Define the layers
        self.fc1 = nn.Linear(self.cat_embed_dim + continuous_input_dim, hidden_dim)
        self.dropout = nn.Dropout(p)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x_cat, x_cont):
        embeddings = [self.embedding_layers[i](x_cat[:, i]) for i in range(x_cat.shape[1])]
        if len(embeddings) > 0:
            x_cat_embed = torch.cat(embeddings, 1)
        else:
            x_cat_embed = torch.empty(x_cont.shape[0], 0, device=x_cont.device)

        x_cont = self.batch_norm(x_cont)
        
        x = torch.cat([x_cat_embed, x_cont], 1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.output_layer(x)
        
        return x

In [11]:


# Instantiate model
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabularModel(
    categorical_dims=categorical_dims,
    continuous_input_dim=len(continuous_cols),
    output_dim=len(le_income.classes_),
    hidden_dim=50,
    p=0.4
).to(device)


**--- Training and Evaluation ---**

In [12]:

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 300

In [13]:

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for cat_data, cont_data, labels in train_loader:
        cat_data, cont_data, labels = cat_data.to(device), cont_data.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(cat_data, cont_data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    if (epoch + 1) % 50 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")

Epoch [50/300], Loss: 0.3410
Epoch [100/300], Loss: 0.3347
Epoch [150/300], Loss: 0.3320
Epoch [200/300], Loss: 0.3289
Epoch [250/300], Loss: 0.3282
Epoch [300/300], Loss: 0.3263


In [14]:
model.eval()
correct = 0
total = 0
test_loss = 0.0
with torch.no_grad():
    for cat_data, cont_data, labels in test_loader:
        cat_data, cont_data, labels = cat_data.to(device), cont_data.to(device), labels.to(device)
        outputs = model(cat_data, cont_data)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss /= len(test_loader)
accuracy = 100 * correct / total
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.2f}%")

Test Loss: 0.3569
Test Accuracy: 83.42%



**--- BONUS: Prediction Function ---**

In [15]:

def predict_income(model, data, categorical_dims, continuous_cols, le_income, scaler):
    model.eval()
    
    input_df = pd.DataFrame([data])
    
    for col in categorical_dims.keys():
        le = LabelEncoder()
        le.fit(df[col]) 
        input_df[col] = le.transform(input_df[col])
        
    # Scale continuous features
    input_cont = scaler.transform(input_df[continuous_cols])
    
    # Convert to tensors
    input_cat_tensor = torch.LongTensor(input_df[categorical_dims.keys()].values)
    input_cont_tensor = torch.FloatTensor(input_cont)
    
    # Make prediction
    with torch.no_grad():
        output = model(input_cat_tensor, input_cont_tensor)
        _, predicted_class = torch.max(output.data, 1)
    
    # Inverse transform the prediction to get the original label
    prediction = le_income.inverse_transform(predicted_class.cpu().numpy())[0]
    
    return prediction

