In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import DebertaForSequenceClassification, AutoTokenizer, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import re

In [16]:
# CLEAN THE DATA
def clean_text(text):
    """Normalize a raw post into lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase the input, drop every token that starts with
    ``http`` (URLs), delete every character that is not ``a``-``z`` or
    whitespace (digits, punctuation and accented letters are removed, not
    replaced), then collapse whitespace runs and trim both ends.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned string; may be empty if nothing survives the filtering.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)
    # Collapsing happens last because the removals above can leave double spaces.
    return re.sub(r"\s+", " ", letters_only).strip()

In [17]:
# LOAD LABELED DATA
# Tab-separated file of labeled posts; the statements below read its
# `message`, `Appreciation` and `Complaint` columns.
df = pd.read_csv(
    "/kaggle/input/fb-post-classification/FB_posts_labeled.txt",
    sep="\t",
)

def get_label(row):
    """Collapse a row's indicator columns into a single class name.

    The columns are checked in priority order: ``Appreciation`` wins over
    ``Complaint`` when both are set, and any row where neither equals 1 is
    labeled ``"Feedback"``.

    Args:
        row: Any mapping-like object indexable by column name (e.g. a
            DataFrame row).

    Returns:
        One of ``"Appreciation"``, ``"Complaint"`` or ``"Feedback"``.
    """
    for column in ("Appreciation", "Complaint"):
        if row[column] == 1:
            return column
    return "Feedback"

# Collapse the per-class indicator columns into one string label per post.
df["label"] = df.apply(get_label, axis=1)

# Fill missing messages with "" *before* casting: astype(str) on its own turns
# NaN into the literal string "nan", which survives clean_text and would be
# tokenized as if the post contained that word.
df["clean_text"] = df["message"].fillna("").astype(str).map(clean_text)

# Integer class ids for training; label_encoder.classes_ keeps the id -> name
# mapping needed to interpret predictions later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])

In [18]:
# TRAIN/VAL SPLIT
# Stratified 80/20 hold-out with a fixed random_state, so the split itself is
# repeatable and both parts keep the label distribution.
all_texts = df["clean_text"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    encoded_labels,
    test_size=0.2,
    stratify=encoded_labels,
    random_state=452,
)

model_name = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One set of tokenizer options shared by both splits: truncate to at most 64
# tokens, pad, and return PyTorch tensors.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=64, return_tensors="pt")
train_tokens = tokenizer(train_texts, **tokenizer_kwargs)
val_tokens = tokenizer(val_texts, **tokenizer_kwargs)

# The encoded labels become tensors so the Dataset can index them directly.
train_labels, val_labels = torch.tensor(train_labels), torch.tensor(val_labels)

In [19]:
# DATASET
class FBCommentDataset(Dataset):
    """Pairs pre-tokenized posts with their labels for use in a DataLoader.

    Args:
        tokens: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries,
            each indexable by example position (in this notebook, the
            tokenizer output created with ``return_tensors="pt"``).
        labels: Sequence of class ids, one per example; its length defines the
            dataset size.
    """

    _TOKEN_FIELDS = ("input_ids", "attention_mask")

    def __init__(self, tokens, labels):
        self.tokens, self.labels = tokens, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Only the two fields listed above are forwarded; any other tokenizer
        # output is ignored. The label goes under "labels".
        example = {field: self.tokens[field][idx] for field in self._TOKEN_FIELDS}
        example["labels"] = self.labels[idx]
        return example

# Wrap each split's tokens and labels in a Dataset.
train_dataset, val_dataset = (
    FBCommentDataset(split_tokens, split_labels)
    for split_tokens, split_labels in (
        (train_tokens, train_labels),
        (val_tokens, val_labels),
    )
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [20]:
# MODEL SETUP
# Seed torch before anything random happens. The classifier and pooler weights
# are newly initialised rather than loaded from the checkpoint, dropout is
# active during training, and the training DataLoader shuffles using torch's
# global generator. Without a seed, every fresh run of the notebook produces
# different losses and F1 scores. The value mirrors the random_state used for
# the train/val split. (Some CUDA kernels can still be non-deterministic.)
SEED = 452
torch.manual_seed(SEED)

# Size the classification head from the fitted encoder rather than a literal,
# so the model always matches the labels it is trained on.
num_labels = len(label_encoder.classes_)
model = DebertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
num_epochs = 3
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
num_training_steps = len(train_loader) * num_epochs

# Linear decay of the learning rate over the whole run, with no warm-up phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# TRAINING LOOP
def _train_one_epoch(model, loader, optimizer, lr_scheduler, device):
    """Run one optimisation pass over `loader` and return the mean batch loss."""
    model.train()
    total_loss = 0

    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes "labels", so the model output carries the loss.
        loss = model(**batch).loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        # Stepped per batch, matching num_training_steps = batches * epochs.
        lr_scheduler.step()
        optimizer.zero_grad()

    return total_loss / len(loader)


def _evaluate(model, loader, device):
    """Return (predicted class ids, true class ids) for every example in `loader`."""
    model.eval()
    predictions = []
    targets = []

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            targets.extend(batch["labels"].cpu().numpy())

    return predictions, targets


for epoch in range(num_epochs):
    avg_loss = _train_one_epoch(model, train_loader, optimizer, lr_scheduler, device)
    print(f"Epoch {epoch+1} complete. Avg loss: {avg_loss:.4f}")

    # VALIDATION EVALUATION
    val_preds, val_trues = _evaluate(model, val_loader, device)

    # Macro averaging weights every class equally, whatever its frequency.
    f1 = f1_score(val_trues, val_preds, average="macro")
    print(f"Validation F1 Score (macro): {f1:.4f}")

Epoch 1 complete. Avg loss: 0.4569
Validation F1 Score (macro): 0.8670
Epoch 2 complete. Avg loss: 0.2180
Validation F1 Score (macro): 0.8635
Epoch 3 complete. Avg loss: 0.0910
Validation F1 Score (macro): 0.8744


In [23]:
# EVALUATE ON UNLABELED DATA

# Define new dataset for unlabeled inputs
class UnlabeledDataset(Dataset):
    """Dataset over pre-tokenized posts that have no labels (inference only).

    Args:
        tokens: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries.
            ``"input_ids"`` must expose ``.shape``; its first dimension is the
            number of examples.
    """

    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, tokens):
        self.tokens = tokens

    def __len__(self):
        n_examples = self.tokens["input_ids"].shape[0]
        return n_examples

    def __getitem__(self, idx):
        # No "labels" key is produced, so a model called with this batch
        # receives inputs only.
        return {field: self.tokens[field][idx] for field in self._FIELDS}

# Tokenize
unlabeled_df = pd.read_csv("/kaggle/input/fb-post-classification/FB_posts_unlabeled.txt", sep="\t")
# Fill missing messages with "" before casting: astype(str) alone would turn
# NaN into the literal string "nan", which clean_text keeps as a word.
unlabeled_df["clean_text"] = unlabeled_df["message"].fillna("").astype(str).map(clean_text)

# Same tokenizer settings as used for the training/validation texts.
unlabeled_tokens = tokenizer(
    unlabeled_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=64,
    return_tensors="pt"
)

# Create DataLoader (no shuffling, so predictions stay aligned with unlabeled_df rows)
unlabeled_dataset = UnlabeledDataset(unlabeled_tokens)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=32)

# Run inference in batches
model.eval()
all_preds = []

with torch.no_grad():
    for batch in unlabeled_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())

# One-hot encode and save
# Predicted ids are positions in label_encoder.classes_. Look each output
# column up by class NAME instead of assuming id 0/1/2 means
# Appreciation/Complaint/Feedback, so the export stays correct even if the
# encoder's ordering or class set differs from that assumption.
pred_ids = np.asarray(all_preds, dtype=np.int64)
class_to_id = {name: idx for idx, name in enumerate(label_encoder.classes_)}
output_classes = ("Appreciation", "Complaint", "Feedback")

indicator_columns = []
for class_name in output_classes:
    class_id = class_to_id.get(class_name)
    if class_id is None:
        # Class never seen by the encoder: it can never be predicted.
        indicator_columns.append(np.zeros(len(pred_ids), dtype=np.int64))
    else:
        indicator_columns.append((pred_ids == class_id).astype(np.int64))

# Shape (n_posts, 3), columns ordered as in output_classes.
one_hot = np.column_stack(indicator_columns)

pred_df = pd.DataFrame({
    "postId": unlabeled_df["postId"],
    **{f"{class_name}_pred": one_hot[:, col] for col, class_name in enumerate(output_classes)},
})

pred_df.to_csv("predictions.csv", index=False)
print(f"Wrote predictions.csv with {len(pred_df)} rows.")

Wrote predictions.csv with 2039 rows.
