In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and model checkpoints
# used by the following cells are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# === 0. Install/Import ===
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# === 1. Setup ===
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# BioBERT tokenizer; the same checkpoint name is used for the encoder weights.
tokenizer = AutoTokenizer.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1"
)

# === 2. Dataset Class ===
class TextDataset(Dataset):
    """Pre-tokenized text dataset with optional integer class labels.

    Every text is tokenized once, at construction time, with truncation to
    ``max_length`` and padding enabled; ``__getitem__`` then only slices the
    stored tensors.

    Args:
        texts: Sequence of strings to encode. It is materialised into a list
            before being handed to the tokenizer.
        labels: Optional sequence of integer labels, one per text. When
            omitted, items carry no ``'labels'`` entry (unlabeled data).
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=..., padding=..., max_length=...,
            return_tensors="pt")`` that returns a mapping of tensors which
            includes ``'input_ids'``. Required despite the ``None`` default,
            which is kept only for signature compatibility.
        max_length: Truncation length forwarded to the tokenizer.

    Raises:
        ValueError: If ``tokenizer`` is ``None`` or if ``labels`` and
            ``texts`` differ in length.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        if tokenizer is None:
            # Fail with a clear message instead of "'NoneType' is not callable".
            raise ValueError("TextDataset requires a tokenizer")

        texts = list(texts)
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"Got {len(texts)} texts but {len(labels)} labels"
            )

        self.tokenizer = tokenizer
        self.encodings = tokenizer(
            texts, truncation=True, padding=True,
            max_length=max_length, return_tensors="pt"
        )
        # None marks an unlabeled dataset; otherwise a 1-D long tensor.
        self.labels = (
            None if labels is None
            else torch.tensor(labels, dtype=torch.long)
        )

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx`` (plus 'labels' if any)."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of encoded texts."""
        return self.encodings['input_ids'].size(0)

# === 3. Load & Clean Data ===
# --- Source ---
# Only rows missing a column that is actually used ('content' / 'label') are
# discarded; a bare dropna() would also throw away rows that merely have a
# gap in some unrelated column.
source_df = pd.read_csv("/content/drive/MyDrive/Diabetes_cleaned.csv").dropna(
    subset=["content", "label"]
)
print("Source label distribution:")
print(source_df['label'].value_counts())
# Stratified 85/15 split so both parts keep the source label ratio.
train_df, val_df = train_test_split(source_df, test_size=0.15, stratify=source_df['label'], random_state=42)

train_texts = train_df['content'].tolist()
train_labels = train_df['label'].astype(int).tolist()
val_texts = val_df['content'].tolist()
val_labels = val_df['label'].astype(int).tolist()

# --- Target ---
# The target domain is used unlabeled, so only its text column matters here.
target_df = pd.read_csv("/content/drive/MyDrive/Corrected_Labeled.csv", encoding='ISO-8859-1')
target_df.columns = target_df.columns.str.strip().str.lower()

# Prefer a column named exactly "text"; otherwise fall back to the first
# column whose name contains "text".
text_candidates = [col for col in target_df.columns if "text" in col]
if not text_candidates:
    raise ValueError(
        f"No text column found in target CSV; columns: {list(target_df.columns)}"
    )
text_col = "text" if "text" in text_candidates else text_candidates[0]

# Missing values are read as non-str objects, so this filter removes both
# NaN rows and blank strings without dropping rows for unrelated gaps.
target_texts = [t for t in target_df[text_col] if isinstance(t, str) and t.strip()]
print(f"Using target column: {text_col} | Total: {len(target_texts)}")

# === 4. Datasets & Loaders ===
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)
target_dataset = TextDataset(target_texts, None, tokenizer)

# 'balanced' class weights, returned in the order of `classes`.
classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_labels)

# Look weights up by class value rather than using the label as an array
# index, which is only correct when labels happen to be exactly 0..K-1.
weight_by_class = dict(zip(classes.tolist(), class_weights.tolist()))
sample_weights = [weight_by_class[label] for label in train_labels]

# Draw len(train) samples per epoch with replacement, proportionally to the
# per-sample weight, so minority-class rows are drawn more often.
# NOTE(review): class_weights is also passed as the CrossEntropyLoss weight
# further down, so imbalance is compensated twice -- confirm this is intended.
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

train_loader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=8)
target_loader = DataLoader(target_dataset, batch_size=8, shuffle=True)

# === 5. Reverse Gradient Layer ===
class ReverseLayerF(torch.autograd.Function):
    """Gradient reversal function.

    Forward is the identity on ``x``; backward multiplies the incoming
    gradient by ``-alpha``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Remember the scaling factor for the backward pass.
        ctx.alpha = alpha
        identity = x.view_as(x)
        return identity

    @staticmethod
    def backward(ctx, grad_output):
        # Scale, then flip the sign of the upstream gradient.
        reversed_grad = (grad_output * ctx.alpha).neg()
        # Second slot corresponds to `alpha`, which receives no gradient.
        return reversed_grad, None

# === 6. DANN Model ===
class DANNModel(nn.Module):
    """BioBERT encoder with a class head and a gradient-reversed domain head.

    Args:
        num_classes: Output size of the class head.
        hidden_size: Width of the encoder output fed to both heads.
    """

    def __init__(self, num_classes=2, hidden_size=768):
        super().__init__()
        self.encoder = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

        # Task head: hidden_size -> 256 -> num_classes.
        class_head_layers = [
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        ]
        self.class_classifier = nn.Sequential(*class_head_layers)

        # Domain head: same shape, but always two outputs.
        domain_head_layers = [
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 2),
        ]
        self.domain_classifier = nn.Sequential(*domain_head_layers)

    def forward(self, input_ids, attention_mask, alpha=1.0):
        """Return ``(class_logits, domain_logits)`` for a batch.

        ``alpha`` is the gradient-reversal scale applied on the path into the
        domain head.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the pooled feature.
        first_token_state = encoded.last_hidden_state[:, 0, :]

        class_logits = self.class_classifier(first_token_state)
        # The domain head sees the same feature through the reversal layer.
        domain_logits = self.domain_classifier(
            ReverseLayerF.apply(first_token_state, alpha)
        )
        return class_logits, domain_logits

model = DANNModel()
model.to(device)
# Snapshot the encoder weights as loaded, before any DANN training step.
torch.save(
    model.encoder.state_dict(),
    "/content/drive/MyDrive/biobert_encoder_raw_init.pt",
)

# === 7. Optimizer & Loss ===
# Task loss is weighted per class; the domain loss is unweighted.
task_criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(class_weights, dtype=torch.float, device=device)
)
domain_criterion = nn.CrossEntropyLoss()
# A single optimizer updates the encoder and both heads.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# === 8. Evaluation Function ===
def evaluate(model, loader):
    """Print a classification report for ``loader`` and return the accuracy.

    Puts ``model`` in eval mode (it is not switched back here) and runs it
    without gradient tracking. Only the class logits are used; the domain
    logits are ignored.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in loader:
            labels = batch['labels'].to(device)
            logits, _ = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            # Predicted class = index of the largest logit per row.
            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(classification_report(all_labels, all_preds, digits=4))
    hits = np.array(all_preds) == np.array(all_labels)
    return np.mean(hits)

# === 9. Training Loop ===
num_epochs = 10
best_val_acc = 0
patience = 2      # epochs without validation improvement before stopping
no_improve = 0

# zip() below stops at the shorter of the two loaders, so that -- not
# len(train_loader) -- is the real number of optimisation steps per epoch.
# Both the alpha schedule and the epoch-average losses must use it.
steps_per_epoch = min(len(train_loader), len(target_loader))
if steps_per_epoch == 0:
    raise ValueError("train_loader and target_loader must each yield at least one batch")
total_steps = num_epochs * steps_per_epoch

for epoch in range(num_epochs):
    model.train()
    total_task_loss, total_domain_loss = 0, 0

    for i, (source_batch, target_batch) in enumerate(zip(train_loader, target_loader)):
        optimizer.zero_grad()

        # p is the fraction of planned training completed, in [0, 1).
        # alpha = 2 / (1 + exp(-10 p)) - 1 starts at 0 and approaches 1,
        # so the gradient reversal is phased in gradually.
        p = float(i + epoch * steps_per_epoch) / total_steps
        alpha = float(2. / (1. + np.exp(-10 * p)) - 1)

        # Source
        src_input_ids = source_batch['input_ids'].to(device)
        src_attn_mask = source_batch['attention_mask'].to(device)
        src_labels = source_batch['labels'].to(device)

        # Target (unlabeled: only used for the domain loss)
        tgt_input_ids = target_batch['input_ids'].to(device)
        tgt_attn_mask = target_batch['attention_mask'].to(device)

        class_logits, domain_logits_src = model(src_input_ids, src_attn_mask, alpha=alpha)
        _, domain_logits_tgt = model(tgt_input_ids, tgt_attn_mask, alpha=alpha)

        task_loss = task_criterion(class_logits, src_labels)

        # Domain labels: 0 for source rows, 1 for target rows.
        domain_labels_src = torch.zeros(len(src_labels), dtype=torch.long, device=device)
        domain_labels_tgt = torch.ones(len(tgt_input_ids), dtype=torch.long, device=device)
        domain_logits = torch.cat([domain_logits_src, domain_logits_tgt], dim=0)
        domain_labels = torch.cat([domain_labels_src, domain_labels_tgt], dim=0)
        domain_loss = domain_criterion(domain_logits, domain_labels)

        total_loss = task_loss + domain_loss
        total_loss.backward()
        optimizer.step()

        total_task_loss += task_loss.item()
        total_domain_loss += domain_loss.item()

    # Average over the steps that actually ran this epoch.
    print(f"\nEpoch {epoch+1} | Task Loss: {total_task_loss/steps_per_epoch:.4f} | Domain Loss: {total_domain_loss/steps_per_epoch:.4f}")
    val_acc = evaluate(model, val_loader)
    print(f"\u2192 Val Accuracy: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # Saves the whole DANN model (encoder under the "encoder." key prefix
        # plus both heads), not just the encoder.
        torch.save(model.state_dict(), "/content/drive/MyDrive/encoder_dann_raw_init.pt")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping triggered.")
            break


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Source label distribution:
label
0    1661
1     608
Name: count, dtype: int64
✅ Using target column: text | Total: 902


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]


Epoch 1 | Task Loss: 0.1672 | Domain Loss: 0.3303
              precision    recall  f1-score   support

           0     0.9790    0.9320    0.9549       250
           1     0.8350    0.9451    0.8866        91

    accuracy                         0.9355       341
   macro avg     0.9070    0.9385    0.9208       341
weighted avg     0.9406    0.9355    0.9367       341

→ Val Accuracy: 0.9355

Epoch 2 | Task Loss: 0.0887 | Domain Loss: 0.9778
              precision    recall  f1-score   support

           0     0.9915    0.9280    0.9587       250
           1     0.8318    0.9780    0.8990        91

    accuracy                         0.9413       341
   macro avg     0.9116    0.9530    0.9288       341
weighted avg     0.9488    0.9413    0.9427       341

→ Val Accuracy: 0.9413

Epoch 3 | Task Loss: 0.0669 | Domain Loss: 0.7881
              precision    recall  f1-score   support

           0     0.9574    0.9880    0.9724       250
           1     0.9639    0.8791    0

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader

# === Setup ===
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# BioBERT tokenizer used to encode the informal-text dataset.
tokenizer = AutoTokenizer.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1"
)

# === Dataset Class ===
class TextDataset(Dataset):
    """Labeled text dataset that is tokenized once at construction.

    Args:
        texts: Texts handed to ``tokenizer`` in a single call.
        labels: Integer labels, stored as a long tensor.
        tokenizer: Callable returning a mapping of tensors when invoked with
            truncation, padding, ``max_length`` and ``return_tensors="pt"``.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        tokenizer_kwargs = dict(
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.encodings = tokenizer(texts, **tokenizer_kwargs)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of labels (one per example)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoded field for row ``idx`` plus its label."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item["labels"] = self.labels[idx]
        return item

# === Load Informal Data ===
df = pd.read_csv("/content/drive/MyDrive/DiaBERT Corrected_Labeled.csv", encoding='ISO-8859-1')
# Normalise headers first: after strip()/lower() a raw "text " header is
# already "text", so no separate rename step is needed.
df.columns = df.columns.str.strip().str.lower()

print("Columns:", df.columns)
if "text" not in df.columns:
    raise ValueError("Expected column 'text' or 'text ' not found")

# === Clean and map labels ===
# .copy() so the label column can be reassigned without pandas'
# chained-assignment warning.
df = df.dropna(subset=["text", "label"]).copy()
print("Unique raw labels before mapping:", df["label"].unique())

# Label spellings (after lower-casing) mapped to class ids:
# 0 = true, 1 = false, 2 = partially true.
label_map = {
    'true': 0,
    'false': 1,
    'partially true': 2,
    'partiallytrue': 2,
    'parially true': 2,
    'partially  true': 2
}
# Trim, lower-case and collapse internal whitespace runs so spacing variants
# all hit the same key instead of having to be listed one by one.
normalized_labels = (
    df["label"].astype(str).str.strip().str.lower()
    .str.replace(r"\s+", " ", regex=True)
)
df["label"] = normalized_labels.map(label_map)

# Report what is about to be discarded rather than dropping it silently.
unmapped_mask = df["label"].isna()
if unmapped_mask.any():
    print(
        f"Dropping {int(unmapped_mask.sum())} rows with unmapped labels:",
        sorted(normalized_labels[unmapped_mask].unique()),
    )
df = df.dropna(subset=["label"])  # remove any still unmapped labels

print("Mapped labels:", df["label"].unique())
print("Final dataset shape:", df.shape)

texts = df["text"].tolist()
labels = df["label"].astype(int).tolist()

# === Train/Test Split
# Stratified 80/20 split with a fixed seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# === Class Weights for Imbalance
# 'balanced' weights computed from the training labels only.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)

# === Dataloaders
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# === Classification Head on Top of DANN Encoder ===
class ClassifierOnly(nn.Module):
    """Encoder followed by a small MLP classification head.

    Args:
        encoder: Callable taking ``input_ids`` / ``attention_mask`` keyword
            arguments and returning an object with ``last_hidden_state``.
            It is stored as-is; nothing here freezes its parameters.
        hidden_size: Width of the encoder's hidden states.
        num_classes: Number of output logits.
    """

    def __init__(self, encoder, hidden_size=768, num_classes=3):
        super().__init__()
        self.encoder = encoder
        # Dropout -> hidden_size x 256 -> ReLU -> 256 x num_classes.
        head_layers = [
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the position-0 hidden state."""
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token_state = hidden_states[:, 0, :]
        return self.classifier(first_token_state)

# === Load Encoder from Stage 2 DANN
dann_encoder = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
# map_location="cpu" lets a checkpoint saved on GPU load on a CPU-only
# runtime; the model is moved to `device` afterwards.
dann_state = torch.load("/content/drive/MyDrive/biobert_dann_raw_init.pt", map_location="cpu")

# A checkpoint saved from a wrapper model that stores the encoder as
# `.encoder` has every encoder key prefixed with "encoder." and also carries
# the wrapper's head weights. Detect that layout via "encoder.embeddings."
# (a bare BERT state dict has "embeddings." and "encoder.layer." instead),
# then strip the prefix and drop the non-encoder entries.
wrapper_prefix = "encoder."
if any(key.startswith("encoder.embeddings.") for key in dann_state):
    dann_state = {
        key[len(wrapper_prefix):]: value
        for key, value in dann_state.items()
        if key.startswith(wrapper_prefix)
    }

# strict=False tolerates harmless key differences, but it would also
# silently load nothing at all -- so check how many tensors actually matched.
load_result = dann_encoder.load_state_dict(dann_state, strict=False)
num_expected = len(dann_encoder.state_dict())
num_loaded = num_expected - len(load_result.missing_keys)
print(f"Loaded {num_loaded}/{num_expected} encoder tensors from DANN checkpoint")
if num_loaded == 0:
    raise RuntimeError(
        "DANN checkpoint did not match any encoder weights; "
        "the model would be plain pretrained BioBERT"
    )

# === Build Model
model = ClassifierOnly(dann_encoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# === Train Classifier ===
for epoch in range(10):  # train longer if needed
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move the three fields this loop needs onto the training device.
        input_ids, attn_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        logits = model(input_ids, attn_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the scalar batch loss for the epoch average below.
        total_loss += loss.item()

    print(f"Epoch {epoch+1}: Avg Training Loss = {total_loss / len(train_loader):.4f}")

# === Evaluate ===
model.eval()
all_preds, all_labels = [], []

# Inference only: no gradients are tracked while scoring the test loader.
with torch.no_grad():
    for batch in test_loader:
        input_ids, attn_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "labels")
        )
        logits = model(input_ids, attn_mask)
        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("=== Evaluation on Informal Test Set ===")
print(classification_report(all_labels, all_preds, digits=4))


Columns: Index(['text', 'label', 'unnamed: 2', 'unnamed: 3', 'unnamed: 4', 'unnamed: 5',
       'unnamed: 6', 'unnamed: 7', 'unnamed: 8', 'unnamed: 9'],
      dtype='object')
Unique raw labels before mapping: ['TRUE' 'FALSE' 'Partially True' 'partially true' 'Parially true'
 'Partially true']
Mapped labels: [0 1 2]
Final dataset shape: (894, 10)
Epoch 1: Avg Training Loss = 1.0204
Epoch 2: Avg Training Loss = 0.7133
Epoch 3: Avg Training Loss = 0.4438
Epoch 4: Avg Training Loss = 0.2366
Epoch 5: Avg Training Loss = 0.0876
Epoch 6: Avg Training Loss = 0.0479
Epoch 7: Avg Training Loss = 0.0486
Epoch 8: Avg Training Loss = 0.0423
Epoch 9: Avg Training Loss = 0.0285
Epoch 10: Avg Training Loss = 0.0124
=== Evaluation on Informal Test Set ===
              precision    recall  f1-score   support

           0     0.7597    0.8750    0.8133       112
           1     0.7727    0.4722    0.5862        36
           2     0.5714    0.5161    0.5424        31

    accuracy                     