## Improved MLP

### Upsample Bearish and Bullish to match Neutral

In [None]:
# Wrap the training embeddings in a DataFrame so rows can be filtered and
# resampled together with their label.
X_train_df = pd.DataFrame(train_embeddings.numpy())
X_train_df['label'] = train_df['label'].values

# One frame per class (0 = Bearish, 1 = Bullish, 2 = Neutral)
df_bearish = X_train_df.loc[X_train_df['label'] == 0]
df_bullish = X_train_df.loc[X_train_df['label'] == 1]
df_neutral = X_train_df.loc[X_train_df['label'] == 2]

# Draw with replacement from classes 0 and 1 until each has as many rows as Neutral
df_bearish_up = resample(
    df_bearish,
    replace=True,
    n_samples=len(df_neutral),
    random_state=42,
)
df_bullish_up = resample(
    df_bullish,
    replace=True,
    n_samples=len(df_neutral),
    random_state=42,
)

# Stack the three classes, then shuffle the rows and renumber the index
df_upsampled = (
    pd.concat([df_bearish_up, df_bullish_up, df_neutral])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Split back into a float feature tensor and a long label tensor
y_balanced = torch.tensor(df_upsampled['label'].values, dtype=torch.long)
X_balanced = torch.tensor(df_upsampled.drop(columns='label').values, dtype=torch.float)

### Define the PyTorch MLP with Dropout & Tuning and Prepare Data Loaders

In [None]:
import torch.nn as nn

class TunedMLP(nn.Module):
    """Feed-forward classifier: repeated Linear -> ReLU -> Dropout blocks plus a Linear head.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dims: Width of each hidden layer, in order. Any length is
            accepted (including empty, which yields a single Linear layer);
            the default ``(512, 256)`` builds two hidden blocks.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_dim=768, hidden_dims=(512, 256), num_classes=3, dropout_rate=0.4):
        super().__init__()
        layers = []
        in_features = input_dim
        # One Linear -> ReLU -> Dropout block per entry, so every width in
        # hidden_dims is honoured instead of only the first two.
        for width in hidden_dims:
            layers.extend([nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout_rate)])
            in_features = width
        # Output head producing raw logits (no softmax).
        layers.append(nn.Linear(in_features, num_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.model(x)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 64

# Training batches come from the upsampled tensors and are reshuffled every epoch
train_loader = DataLoader(
    dataset=TensorDataset(X_balanced, y_balanced),
    batch_size=batch_size,
    shuffle=True,
)

# Validation batches keep their original order
val_loader = DataLoader(
    dataset=TensorDataset(val_embeddings, torch.tensor(val_df['label'].values)),
    batch_size=batch_size,
)


### Train the Model with Adam Optimizer

In [None]:
# Seed PyTorch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffle order repeat on every Restart & Run All (same seed value
# as the random_state used when upsampling). Some CUDA kernels may still be
# non-deterministic.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TunedMLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

epochs = 10
for epoch in range(epochs):
    model.train()  # training mode: dropout active
    running_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # running_loss is the SUM of per-batch mean losses, so its magnitude grows
    # with the number of batches in train_loader.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss:.4f}")


Epoch 1/10, Loss: 221.3966
Epoch 2/10, Loss: 191.7300
Epoch 3/10, Loss: 179.9951
Epoch 4/10, Loss: 170.8958
Epoch 5/10, Loss: 164.7649
Epoch 6/10, Loss: 155.9724
Epoch 7/10, Loss: 147.0735
Epoch 8/10, Loss: 140.2346
Epoch 9/10, Loss: 131.9610
Epoch 10/10, Loss: 124.2431


### Evaluate on Validation Set

In [None]:
from sklearn.metrics import classification_report

# Evaluation mode (dropout off); gradients are not needed for prediction
model.eval()
val_preds = []

with torch.no_grad():
    for xb, _ in val_loader:
        xb = xb.to(device)
        logits = model(xb)
        # Predicted class = index of the largest logit in each row
        preds = logits.argmax(dim=1).cpu().numpy()
        val_preds.extend(preds)

print(classification_report(val_df['label'], val_preds, target_names=["Bearish", "Bullish", "Neutral"]))


              precision    recall  f1-score   support

     Bearish       0.47      0.59      0.52       288
     Bullish       0.51      0.64      0.57       385
     Neutral       0.85      0.74      0.79      1236

    accuracy                           0.69      1909
   macro avg       0.61      0.65      0.63      1909
weighted avg       0.73      0.69      0.70      1909



In [None]:
# !pip install transformers datasets
# !pip install --upgrade transformers


In [None]:
# Use train_df only
df_full = train_df.copy()
df_full['text'] = df_full['text'].astype(str)

# Define label names and cast to ClassLabel
label_names = ["Bearish", "Bullish", "Neutral"]
label_feature = ClassLabel(num_classes=3, names=label_names)

# Build Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df_full[['text', 'label']])
hf_dataset = hf_dataset.cast_column("label", label_feature)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize function
def tokenize(batch):
    """Tokenize a batch's 'text' field, padded/truncated to exactly 64 tokens."""
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=64)

# Train-test split with stratification.
# seed=42 pins the shuffle so the same rows land in train/test on every run;
# without it the split (and every metric computed on it) changes per execution.
hf_dataset = hf_dataset.train_test_split(test_size=0.2, stratify_by_column='label', seed=42)
hf_dataset = hf_dataset.map(tokenize, batched=True)
hf_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Casting the dataset:   0%|          | 0/7634 [00:00<?, ? examples/s]

Map:   0%|          | 0/6107 [00:00<?, ? examples/s]

Map:   0%|          | 0/1527 [00:00<?, ? examples/s]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report


In [None]:
# Define the model
class MLPClassifier(nn.Module):
    """Feed-forward classifier: repeated Linear -> ReLU -> Dropout blocks plus a Linear head.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dims: Width of each hidden layer, in order. Any length is
            accepted (including empty, which yields a single Linear layer);
            the default ``(512, 256)`` builds two hidden blocks.
        dropout: Dropout probability applied after every hidden ReLU.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=768, hidden_dims=(512, 256), dropout=0.4, num_classes=3):
        super().__init__()
        layers = []
        in_features = input_dim
        # One Linear -> ReLU -> Dropout block per entry, so every width in
        # hidden_dims is honoured instead of only the first two.
        for width in hidden_dims:
            layers.extend([nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)])
            in_features = width
        # Output head producing raw logits (no softmax).
        layers.append(nn.Linear(in_features, num_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.model(x)


In [None]:
# Build loaders over the original (non-upsampled) embeddings
batch_size = 64

# Pair each embedding row with its integer label
train_dataset = TensorDataset(
    train_embeddings,
    torch.tensor(train_df['label'].values),
)
val_dataset = TensorDataset(
    val_embeddings,
    torch.tensor(val_df['label'].values),
)

# Only the training loader is shuffled; validation keeps its original order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)


In [None]:
# Train the model
# Seed PyTorch's global RNG so weight initialisation, dropout masks and the
# DataLoader shuffle order repeat on every Restart & Run All. Some CUDA
# kernels may still be non-deterministic.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLPClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

epochs = 10
for epoch in range(epochs):
    model.train()  # training mode: dropout active
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # total_loss is the SUM of per-batch mean losses, so its magnitude grows
    # with the number of batches in train_loader.
    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f}")


Epoch 1/10 | Loss: 101.1747
Epoch 2/10 | Loss: 89.9045
Epoch 3/10 | Loss: 85.5720
Epoch 4/10 | Loss: 82.6672
Epoch 5/10 | Loss: 81.1423
Epoch 6/10 | Loss: 79.7502
Epoch 7/10 | Loss: 78.6840
Epoch 8/10 | Loss: 75.5244
Epoch 9/10 | Loss: 74.9211
Epoch 10/10 | Loss: 73.4981


In [None]:
# Score the trained model on the validation split
model.eval()  # evaluation mode: dropout off
val_preds = []

with torch.no_grad():  # prediction only, so skip gradient tracking
    for xb, _ in val_loader:
        xb = xb.to(device)
        logits = model(xb)
        # Predicted class = index of the largest logit in each row
        preds = logits.argmax(dim=1).cpu().numpy()
        val_preds.extend(preds)

print(classification_report(val_df['label'], val_preds, target_names=["Bearish", "Bullish", "Neutral"]))


              precision    recall  f1-score   support

     Bearish       0.55      0.51      0.53       288
     Bullish       0.63      0.41      0.49       385
     Neutral       0.78      0.89      0.83      1236

    accuracy                           0.73      1909
   macro avg       0.66      0.60      0.62      1909
weighted avg       0.72      0.73      0.72      1909

