In [6]:
# config.py

config = {
    "NUMERIC_FEATURE_NAMES": ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"],
    "CATEGORICAL_FEATURE_NAMES": ["workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country"],
    "TARGET_FEATURE_NAME": "income_bracket",
    "TARGET_LABELS": [" <=50K", " >50K"],
    "EMBEDDING_DIMS": 16,
    "NUM_TRANSFORMER_BLOCKS": 3,
    "NUM_HEADS": 4,
    "DROPOUT_RATE": 0.2,
    "MLP_HIDDEN_UNITS_FACTORS": [2, 1],
    "LEARNING_RATE": 0.001,
    "WEIGHT_DECAY": 0.0001,
    "BATCH_SIZE": 265,
    "NUM_EPOCHS": 15,
    # "TRAIN_DATA_URL": "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    # "TEST_DATA_URL": "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    # "CSV_HEADER": ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"]
}


In [9]:
# data_preparation.py

import pandas as pd
import numpy as np

def generate_random_data(num_samples=5000):
    np.random.seed(42)  # For reproducibility
    
    # Generating random numerical data
    age = np.random.randint(18, 70, num_samples)
    education_num = np.random.randint(1, 16, num_samples)
    capital_gain = np.random.randint(0, 10000, num_samples)
    capital_loss = np.random.randint(0, 5000, num_samples)
    hours_per_week = np.random.randint(1, 99, num_samples)
    
    # Generating random categorical data
    workclass = np.random.choice(['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'], num_samples)
    education = np.random.choice(['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'], num_samples)
    marital_status = np.random.choice(['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'], num_samples)
    occupation = np.random.choice(['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'], num_samples)
    relationship = np.random.choice(['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'], num_samples)
    race = np.random.choice(['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'], num_samples)
    gender = np.random.choice(['Male', 'Female'], num_samples)
    native_country = np.random.choice(['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'], num_samples)
    
    # Generating random target variable
    income_bracket = np.random.choice([' <=50K', ' >50K'], num_samples)
    
    # Combining all features into a DataFrame
    data = pd.DataFrame({
        'age': age,
        'workclass': workclass,
        'education_num': education_num,
        'education': education,
        'marital_status': marital_status,
        'occupation': occupation,
        'relationship': relationship,
        'race': race,
        'gender': gender,
        'capital_gain': capital_gain,
        'capital_loss': capital_loss,
        'hours_per_week': hours_per_week,
        'native_country': native_country,
        'income_bracket': income_bracket
    })
    
    # Splitting the data into train and test sets
    train_data = data.sample(frac=0.8, random_state=42)
    test_data = data.drop(train_data.index)
    
    # Saving to CSV files
    train_data.to_csv("train_data.csv", index=False, header=False)
    test_data.to_csv("test_data.csv", index=False, header=False)
    
    return train_data, test_data

if __name__ == "__main__":
    train_data, test_data = generate_random_data()
    print(f"Train dataset shape: {train_data.shape}")
    print(f"Test dataset shape: {test_data.shape}")


Train dataset shape: (4000, 14)
Test dataset shape: (1000, 14)


In [13]:
# train_data.to_csv('train')

In [21]:
train_data

Unnamed: 0,age,workclass,education_num,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
1501,59,Self-emp-inc,12,9th,Divorced,Exec-managerial,Other-relative,Black,Male,5985,869,86,France,<=50K
2586,25,Local-gov,9,9th,Divorced,Machine-op-inspct,Wife,Black,Female,9447,625,5,Cambodia,<=50K
2653,38,Private,13,1st-4th,Widowed,Exec-managerial,Unmarried,Other,Female,1550,2555,65,Portugal,<=50K
1055,48,Federal-gov,7,11th,Never-married,Machine-op-inspct,Other-relative,Other,Male,7001,1240,92,Honduras,>50K
705,53,Local-gov,11,9th,Married-spouse-absent,Transport-moving,Own-child,Black,Female,7420,3793,62,India,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,55,State-gov,2,Some-college,Married-spouse-absent,Handlers-cleaners,Unmarried,White,Female,7098,4722,3,Haiti,>50K
1920,53,Without-pay,3,7th-8th,Never-married,Craft-repair,Husband,Amer-Indian-Eskimo,Male,1845,3938,88,Laos,>50K
3715,43,Local-gov,14,Preschool,Married-spouse-absent,Priv-house-serv,Wife,Black,Female,2180,711,94,Vietnam,>50K
4646,68,Self-emp-inc,7,12th,Married-spouse-absent,Armed-Forces,Own-child,Amer-Indian-Eskimo,Male,8708,18,55,Italy,>50K


In [17]:
# tabtransformer.py

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
# from config import config

class TabularDataset(Dataset):
    def __init__(self, data, numerical_features, categorical_features, target):
        self.numerical_data = data[numerical_features].values.astype(np.float32)
        self.categorical_data = data[categorical_features].apply(lambda x: x.astype('category').cat.codes).values
        self.labels = LabelEncoder().fit_transform(data[target])
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'numerical_data': torch.tensor(self.numerical_data[idx]),
            'categorical_data': torch.tensor(self.categorical_data[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

In [18]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, num_heads, dropout):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_size, num_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, 2048),
            nn.ReLU(),
            nn.Linear(2048, embed_size)
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        attention_output, _ = self.attention(x, x, x)
        x = self.norm1(x + self.dropout(attention_output))
        feed_forward_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(feed_forward_output))
        return x

In [19]:
class TabTransformer(nn.Module):
    def __init__(self, num_categories, num_numerical, embed_size, num_heads, num_blocks, dropout, mlp_hidden_units_factors):
        super(TabTransformer, self).__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, embed_size) for categories in num_categories])
        self.transformer_blocks = nn.ModuleList([TransformerBlock(embed_size, num_heads, dropout) for _ in range(num_blocks)])
        self.mlp_input_size = len(num_categories) * embed_size + num_numerical
        mlp_hidden_units = [factor * self.mlp_input_size for factor in mlp_hidden_units_factors]
        self.mlp = self._create_mlp(mlp_hidden_units, dropout)
        self.output_layer = nn.Linear(mlp_hidden_units[-1], 1)
        
    def _create_mlp(self, hidden_units, dropout):
        layers = []
        for units in hidden_units:
            layers.append(nn.Linear(self.mlp_input_size, units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            self.mlp_input_size = units
        return nn.Sequential(*layers)
        
    def forward(self, numerical_data, categorical_data):
        embeddings = [embed(categorical_data[:, i]) for i, embed in enumerate(self.embeddings)]
        x = torch.stack(embeddings, dim=1)  # Stack along the new dimension
        x = x.permute(1, 0, 2)  # Permute to (sequence_length, batch_size, embed_dim)
        for block in self.transformer_blocks:
            x = block(x)
        x = x.permute(1, 0, 2).contiguous().view(x.size(1), -1)  # Reshape back and align the batch size
        x = torch.cat([x, numerical_data], dim=1)
        x = self.mlp(x)
        x = self.output_layer(x)
        return torch.sigmoid(x)

In [20]:

def train_model(model, train_loader, val_loader, epochs, lr, weight_decay):
    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            numerical_data = batch['numerical_data']
            categorical_data = batch['categorical_data']
            labels = batch['label'].unsqueeze(1).float()
            
            outputs = model(numerical_data, categorical_data)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        val_loss = 0
        val_acc = 0
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                numerical_data = batch['numerical_data']
                categorical_data = batch['categorical_data']
                labels = batch['label'].unsqueeze(1).float()
                
                outputs = model(numerical_data, categorical_data)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                predictions = (outputs > 0.5).float()
                val_acc += (predictions == labels).float().mean()
        
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc/len(val_loader):.4f}')

def main():
    # Generate random dataset
    train_data, test_data = generate_random_data()
    
    # Prepare datasets
    train_dataset = TabularDataset(train_data, config["NUMERIC_FEATURE_NAMES"], config["CATEGORICAL_FEATURE_NAMES"], config["TARGET_FEATURE_NAME"])
    test_dataset = TabularDataset(test_data, config["NUMERIC_FEATURE_NAMES"], config["CATEGORICAL_FEATURE_NAMES"], config["TARGET_FEATURE_NAME"])
    
    train_loader = DataLoader(train_dataset, batch_size=config["BATCH_SIZE"], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config["BATCH_SIZE"], shuffle=False)
    
    # Determine number of unique categories for each categorical feature
    num_categories = [len(train_data[feature].unique()) for feature in config["CATEGORICAL_FEATURE_NAMES"]]
    
    # Initialize the model
    model = TabTransformer(num_categories, len(config["NUMERIC_FEATURE_NAMES"]), config["EMBEDDING_DIMS"], config["NUM_HEADS"], config["NUM_TRANSFORMER_BLOCKS"], config["DROPOUT_RATE"], config["MLP_HIDDEN_UNITS_FACTORS"])
    
    # Train the model
    train_model(model, train_loader, test_loader, config["NUM_EPOCHS"], config["LEARNING_RATE"], config["WEIGHT_DECAY"])

if __name__ == "__main__":
    main()


Epoch 1/15, Loss: 44.3562, Val Loss: 45.3981, Val Acc: 0.5064
Epoch 2/15, Loss: 33.1119, Val Loss: 34.9974, Val Acc: 0.5064
Epoch 3/15, Loss: 18.1953, Val Loss: 19.0717, Val Acc: 0.5062
Epoch 4/15, Loss: 1.0594, Val Loss: 1.1065, Val Acc: 0.5070
Epoch 5/15, Loss: 0.6983, Val Loss: 0.6985, Val Acc: 0.5018
Epoch 6/15, Loss: 0.6939, Val Loss: 0.6935, Val Acc: 0.4988
Epoch 7/15, Loss: 0.6941, Val Loss: 0.6927, Val Acc: 0.5051
Epoch 8/15, Loss: 0.6937, Val Loss: 0.6928, Val Acc: 0.5066
Epoch 9/15, Loss: 0.6932, Val Loss: 0.6930, Val Acc: 0.5074
Epoch 10/15, Loss: 0.6932, Val Loss: 0.6930, Val Acc: 0.5084
Epoch 11/15, Loss: 0.6932, Val Loss: 0.6929, Val Acc: 0.5062
Epoch 12/15, Loss: 0.6937, Val Loss: 0.6928, Val Acc: 0.5075
Epoch 13/15, Loss: 0.6937, Val Loss: 0.6928, Val Acc: 0.5104
Epoch 14/15, Loss: 0.6933, Val Loss: 0.6929, Val Acc: 0.5074
Epoch 15/15, Loss: 0.6930, Val Loss: 0.6930, Val Acc: 0.5117
