**BioBERT+CORAL**

In [None]:
pip install transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

file_path = "/content/drive/MyDrive/Cleaned_Tweets_Standardized.xlsx"
df = pd.read_excel(file_path)


In [None]:
df.head()

Unnamed: 0,Title,Text (Post),Label,Date
0,Facebook Group description,\nExercise regularly to control blood sugar\n\n,True,2023-12-13
1,,Manage your carb intake reduces the risk of di...,True,NaT
2,,Eat more fiber to reduce blood sugar levels,True,NaT
3,,\nDrink water and stay hydrated helps to stay ...,True,NaT
4,How to live diabetes free life?,Lose extra weight. Losing weight reduces the r...,True,2023-12-18


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoConfig, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Ensure device compatibility
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------------
# Load and Process Dataset
# ------------------------------
# The notebook mounts Google Drive and reads the standardized workbook from
# there in its other cells; "/mnt/data/..." is never created in this runtime.
file_path = "/content/drive/MyDrive/Cleaned_Tweets_Standardized.xlsx"
df = pd.read_excel(file_path)

# Drop rows that have no text or no label
df_cleaned = df.dropna(subset=["Text (Post)", "Label"]).copy()

# Convert labels to numerical values.
# Labels are normalized (string, stripped, lower-cased) before the lookup so
# that values such as True / "True" / " true " all resolve to the "true" key
# instead of silently becoming NaN.
label_mapping = {"true": 0, "false": 1, "exaggerated": 2, "misconstrued": 3}
normalized_labels = df_cleaned["Label"].astype(str).str.strip().str.lower()
df_cleaned["Label_ID"] = normalized_labels.map(label_mapping)

# Rows whose label is outside the mapping cannot be used for training and
# would break the stratified split below, so report and remove them.
unmapped = df_cleaned["Label_ID"].isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} rows with labels outside {sorted(label_mapping)}")
    df_cleaned = df_cleaned.loc[~unmapped].copy()
df_cleaned["Label_ID"] = df_cleaned["Label_ID"].astype(int)

# Split into train (70%), validation (15%), and test (15%) sets,
# stratified on the class id at both stages.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df_cleaned["Text (Post)"], df_cleaned["Label_ID"], test_size=0.3, random_state=42, stratify=df_cleaned["Label_ID"]
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

# Define model name
model_name = "dmis-lab/biobert-base-cased-v1.1"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ------------------------------
# Define Dataset Class
# ------------------------------
class MisinformationDataset(torch.utils.data.Dataset):
    """Tokenize-on-access dataset of (text, class id) pairs.

    Args:
        texts: object exposing ``.tolist()`` (e.g. a pandas Series) of strings.
        labels: object exposing ``.tolist()`` of integer class ids.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result is indexable by ``"input_ids"`` and ``"attention_mask"``.
        max_length: length every sample is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one sample as a dict of ``input_ids``, ``attention_mask`` and ``labels``."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch axis added by
        # return_tensors="pt"; a bare squeeze() would also drop the sequence
        # axis whenever it has length 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create dataset instances
train_dataset = MisinformationDataset(train_texts, train_labels, tokenizer)
val_dataset = MisinformationDataset(val_texts, val_labels, tokenizer)
test_dataset = MisinformationDataset(test_texts, test_labels, tokenizer)

# ------------------------------
# Define CORAL Loss Function
# ------------------------------
def coral_loss(source_features, target_features):
    """CORAL-style loss that aligns first and second order feature statistics.

    The value is the Frobenius norm of the difference between the two feature
    covariance matrices plus the L2 norm of the difference between the two
    feature means (both norms un-squared, as in the original notebook).

    Args:
        source_features: 2-D tensor, one row per source sample.
        target_features: 2-D tensor, one row per target sample, same number
            of columns as ``source_features``.

    Returns:
        Scalar tensor with the alignment loss.
    """

    def _mean_and_cov(features):
        mean = torch.mean(features, dim=0)
        centered = features - mean
        # A single-row batch would otherwise divide 0 by 0 and produce NaN;
        # clamping the denominator yields a zero covariance instead.
        denom = max(features.shape[0] - 1, 1)
        return mean, centered.T @ centered / denom

    source_mean, source_cov = _mean_and_cov(source_features)
    target_mean, target_cov = _mean_and_cov(target_features)

    loss = torch.norm(source_cov - target_cov, p='fro') + torch.norm(source_mean - target_mean, p=2)
    return loss

# ------------------------------
# Define BioBERT with CORAL
# ------------------------------
class BioBERT_Coral(nn.Module):
    """Pretrained encoder with a linear classification head and optional CORAL term.

    The encoder is loaded from ``base_model_name``; the head maps the hidden
    state at sequence position 0 to ``num_labels`` logits.
    """

    def __init__(self, base_model_name, num_labels=4):
        super().__init__()
        self.config = AutoConfig.from_pretrained(base_model_name, num_labels=num_labels)
        self.feature_extractor = AutoModel.from_pretrained(base_model_name, config=self.config)
        hidden_dim = self.feature_extractor.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask, labels=None, target_features=None):
        """Return ``{"loss", "logits"}``.

        ``loss`` is ``None`` when ``labels`` is not given. When
        ``target_features`` is given as well, ``coral_loss`` between this
        batch's features and ``target_features`` is added to the
        cross-entropy loss.
        """
        encoded = self.feature_extractor(input_ids=input_ids, attention_mask=attention_mask)

        # Hidden state at position 0 of every sequence, made contiguous
        cls_features = encoded.last_hidden_state[:, 0, :].contiguous()
        logits = self.classifier(cls_features)

        if labels is None:
            return {"loss": None, "logits": logits}

        loss = nn.CrossEntropyLoss()(logits, labels.long())
        if target_features is not None:
            loss = loss + coral_loss(cls_features, target_features.contiguous())

        return {"loss": loss, "logits": logits}

# ------------------------------
# Load Model & Define Trainer
# ------------------------------
model = BioBERT_Coral(model_name, num_labels=len(set(label_mapping.values()))).to(device)

# `eval_strategy` is the current name of this option; the older
# `evaluation_strategy` spelling is deprecated, and this notebook installs
# transformers without a version pin.
training_args = TrainingArguments(
    output_dir="./results_BioBERT_CORAL",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs_BioBERT_CORAL",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower validation loss is better
    report_to="none",
    save_total_limit=2,
    save_safetensors=False
)

# ------------------------------
# Define Metrics for Evaluation
# ------------------------------
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``. The Trainer supplies NumPy
            arrays, so the values are converted with ``np.asarray`` rather
            than calling the tensor-only ``.numpy()`` method (which raises
            ``AttributeError`` on an ndarray).

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    # zero_division=0 keeps the score at 0 for classes that are never
    # predicted, without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ------------------------------
# Train Model
# ------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 3 evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# ------------------------------
# Save Trained Model
# ------------------------------
# Drive is mounted at the absolute path /content/drive; the previous
# "./content/drive/..." was relative to the working directory.
torch.save(model.state_dict(), "/content/drive/MyDrive/BioBERT_CORAL_model.pth")
tokenizer.save_pretrained("/content/drive/MyDrive/BioBERT_CORAL_model")

# ------------------------------
# Final Test Evaluation
# ------------------------------
print("Evaluating on Test Set...")
test_results = trainer.evaluate(test_dataset)
print(f"Final Evaluation Results on Test Set:\n{test_results}")


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
module 'sympy.printing' has no attribute 'str'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U transformers




In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess
# Formal corpus: keep rows that have both text and label, unify column names
formal_df = (
    pd.read_csv("/content/drive/MyDrive/Diabetes_cleaned.csv")
    .dropna(subset=["content", "label"])
    .rename(columns={"content": "text", "label": "label"})
)
# Informal corpus: note the trailing space in the "Text " column name
informal_df = (
    pd.read_csv("/content/drive/MyDrive/Corrected_Labeled.csv", encoding="ISO-8859-1")
    .dropna(subset=["Text ", "Label"])
    .rename(columns={"Text ": "text", "Label": "label"})
)
# Class ids are stored as integers in both frames
for frame in (formal_df, informal_df):
    frame["label"] = frame["label"].astype(int)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes every text once, up front.

    ``texts`` and ``labels`` must expose ``.tolist()``. The tokenizer is
    called a single time on the whole list with truncation and padding
    enabled and ``max_length=max_len``; samples are sliced out of that
    result on access.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        raw_texts = texts.tolist()
        self.encodings = tokenizer(raw_texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return tensors for ``input_ids``, ``attention_mask`` and ``labels`` of sample ``idx``."""
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Split datasets
# Both corpora are split 85/15, stratified by label, with a fixed seed.
# The held-out part of the formal (source) corpus is discarded.
split_options = {"test_size": 0.15, "random_state": 42}
source_train, _ = train_test_split(formal_df, stratify=formal_df["label"], **split_options)
target_train, target_val = train_test_split(informal_df, stratify=informal_df["label"], **split_options)

train_dataset, target_dataset, val_dataset = (
    TextDataset(frame["text"], frame["label"], tokenizer)
    for frame in (source_train, target_train, target_val)
)

# Training-time loaders are shuffled; the validation loader keeps file order
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
target_loader = DataLoader(target_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# CORAL loss
def coral_loss(source, target):
    """Scaled CORAL loss between two batches of feature vectors.

    Computes ``(||C_s - C_t||_F^2 + ||mu_s - mu_t||_2^2) / (4 * d^2)`` where
    ``C`` and ``mu`` are the per-batch covariance and mean and ``d`` is the
    number of feature columns.

    Args:
        source: 2-D tensor, one row per source sample.
        target: 2-D tensor, one row per target sample, same column count.

    Returns:
        Scalar tensor with the loss value.
    """
    d = source.size(1)

    def _mean_and_cov(features):
        mean = features.mean(0)
        centered = features - mean
        # The final batch of a loader can hold a single row; dividing by
        # (n - 1) == 0 would turn the whole loss into NaN, so clamp to 1.
        denom = max(features.size(0) - 1, 1)
        return mean, centered.T @ centered / denom

    source_mean, source_cov = _mean_and_cov(source)
    target_mean, target_cov = _mean_and_cov(target)

    cov_gap = torch.norm(source_cov - target_cov, p="fro")**2
    mean_gap = torch.norm(source_mean - target_mean, p=2)**2
    return (cov_gap + mean_gap) / (4 * d**2)

# Model
class BioBERTWithCORAL(nn.Module):
    """Pretrained encoder plus dropout and a linear classification head.

    ``forward`` returns both the logits and the (pre-dropout) features so the
    caller can feed the features to an alignment loss.
    """

    def __init__(self, base_model, num_labels=3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(base_model)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return ``(logits, features)`` for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 of every sample
        features = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(features)), features

# Model, optimizer and classification loss used by the loop below
model = BioBERTWithCORAL("dmis-lab/biobert-base-cased-v1.1").to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop: up to 10 epochs, stopped early after `patience` consecutive
# epochs in which the validation loss did not improve.
best_val_loss = float("inf")
patience = 2
patience_counter = 0

for epoch in range(10):
    model.train()
    total_loss = 0
    # zip() stops at the shorter of the two loaders, so an epoch covers at
    # most min(len(train_loader), len(target_loader)) batches.
    for batch, tgt in zip(train_loader, target_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Supervised branch: source batch with labels
        logits, features = model(input_ids, attention_mask)
        ce_loss = criterion(logits, labels)

        # Target branch: only the features are used (target labels are ignored),
        # and they are computed under no_grad, so no gradient flows through
        # this forward pass.
        tgt_ids = tgt["input_ids"].to(device)
        tgt_mask = tgt["attention_mask"].to(device)
        with torch.no_grad():
            _, tgt_features = model(tgt_ids, tgt_mask)

        # Total loss = cross-entropy on source + CORAL alignment term
        coral = coral_loss(features, tgt_features)
        loss = ce_loss + coral
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # NOTE: this is the sum of the batch losses, not their mean
    print(f"Epoch {epoch+1}: Train Loss = {total_loss:.4f}")

    # Validation on the held-out target split (cross-entropy only)
    model.eval()
    val_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits, _ = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            val_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    print(f"→ Val Loss = {avg_val_loss:.4f}")
    print(classification_report(all_labels, all_preds, target_names=["True", "False", "Partially True"]))

    # Checkpoint on improvement; otherwise count towards early stopping.
    # The saved weights are not loaded back into `model` within this cell.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "/content/drive/MyDrive/BioBERT_CORAL_Final.pth")
        print("Best model saved")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break


Epoch 1: Train Loss = 40.9014
→ Val Loss = 1.2823
                precision    recall  f1-score   support

          True       0.70      0.34      0.46        87
         False       0.18      0.68      0.29        25
Partially True       0.00      0.00      0.00        24

      accuracy                           0.35       136
     macro avg       0.29      0.34      0.25       136
  weighted avg       0.48      0.35      0.35       136



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


✅ Best model saved
Epoch 2: Train Loss = 16.1715
→ Val Loss = 1.5638
                precision    recall  f1-score   support

          True       0.00      0.00      0.00        87
         False       0.19      1.00      0.31        25
Partially True       0.00      0.00      0.00        24

      accuracy                           0.18       136
     macro avg       0.06      0.33      0.10       136
  weighted avg       0.03      0.18      0.06       136



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Train Loss = 9.5874
→ Val Loss = 1.7321
                precision    recall  f1-score   support

          True       0.75      0.03      0.07        87
         False       0.19      1.00      0.32        25
Partially True       0.00      0.00      0.00        24

      accuracy                           0.21       136
     macro avg       0.31      0.34      0.13       136
  weighted avg       0.51      0.21      0.10       136

⏹️ Early stopping triggered


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**BioBERT+DANN**

In [None]:
!pip uninstall -y sympy transformers
!pip install --upgrade sympy transformers



Found existing installation: sympy 1.13.1
Uninstalling sympy-1.13.1:
  Successfully uninstalled sympy-1.13.1
Found existing installation: transformers 4.48.2
Uninstalling transformers-4.48.2:
  Successfully uninstalled transformers-4.48.2
Collecting sympy
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy, transformers
[31mERROR: pip's dependency resolver does not currently take int

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import EarlyStoppingCallback
from tqdm import tqdm

# Load BioBERT tokenizer & model for embedding extraction
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
biobert_model = AutoModel.from_pretrained(MODEL_NAME)

# Read the tweet workbook from the mounted Drive
file_path = "/content/drive/MyDrive/Cleaned_Tweets_Standardized.xlsx"  # Update if needed
df = pd.read_excel(file_path)

# Class name -> integer id
label_mapping = {"true": 0, "false": 1, "exaggerated": 2, "misconstrued": 3}

# Keep only rows with both text and label, then attach the numeric class id
df_cleaned = (
    df.dropna(subset=["Text (Post)", "Label"])
    .copy()
    .assign(Label_ID=lambda frame: frame["Label"].map(label_mapping))
)

# Function to extract BioBERT embeddings efficiently (Batch-wise Processing)
def get_biobert_embeddings(texts, model, tokenizer, batch_size=16):
    """Encode ``texts`` in batches and return one embedding row per text.

    Args:
        texts: non-empty sliceable sequence of strings (e.g. a list).
        model: encoder whose output exposes ``last_hidden_state``.
        tokenizer: tokenizer called with ``return_tensors="pt"``; its result
            must support ``.items()``.
        batch_size: number of texts encoded per forward pass.

    Returns:
        2-D NumPy array stacking, for every text, the hidden state at
        sequence position 0.

    Raises:
        ValueError: if ``texts`` is empty.
    """
    if len(texts) == 0:
        # np.vstack([]) would raise a less helpful ValueError further down
        raise ValueError("texts must contain at least one item")

    model.eval()
    # Run on whatever device the model currently lives on, so the function
    # also works after the caller has moved the model to a GPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BioBERT Embeddings"):
        batch_texts = texts[i : i + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
        inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embeddings
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

# Extract embeddings for all texts
text_list = df_cleaned["Text (Post)"].tolist()
biobert_embeddings = get_biobert_embeddings(text_list, biobert_model, tokenizer)

# Cluster the embeddings; the cluster index is used as the domain label
num_clusters = 2  # Number of domain labels
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df_cleaned["Domain_Label"] = kmeans.fit_predict(biobert_embeddings)

# 70 / 15 / 15 split, stratified on the class id at both stages
(train_texts, temp_texts,
 train_labels, temp_labels,
 train_domains, temp_domains) = train_test_split(
    df_cleaned["Text (Post)"],
    df_cleaned["Label_ID"],
    df_cleaned["Domain_Label"],
    test_size=0.3,
    random_state=42,
    stratify=df_cleaned["Label_ID"],
)
(val_texts, test_texts,
 val_labels, test_labels,
 val_domains, test_domains) = train_test_split(
    temp_texts,
    temp_labels,
    temp_domains,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

# Define Dataset Class
class MisinformationDataset(Dataset):
    """Tokenize-on-access dataset of (text, class id, domain id) triples.

    ``texts``, ``labels`` and ``domain_labels`` must expose ``.tolist()``.
    Every sample is padded / truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, domain_labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.domain_labels = domain_labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask``, ``labels`` and ``domain_labels`` for ``idx``."""
        tokenized = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Drop the leading batch axis added by return_tensors="pt"
        sample = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        sample["domain_labels"] = torch.tensor(self.domain_labels[idx], dtype=torch.long)
        return sample

# Create Dataset Objects
train_dataset, val_dataset, test_dataset = (
    MisinformationDataset(texts, labels, domains, tokenizer)
    for texts, labels, domains in (
        (train_texts, train_labels, train_domains),
        (val_texts, val_labels, val_domains),
        (test_texts, test_labels, test_domains),
    )
)

# Only the training loader is shuffled
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (val_dataset, test_dataset)
)

# Gradient Reversal Layer (GRL)
class GradientReversalFunction(torch.autograd.Function):
    """Autograd function: identity on the way forward, ``-alpha * grad`` on the way back."""

    @staticmethod
    def forward(ctx, x, alpha):
        # Remember the scaling factor for the backward pass
        ctx.alpha = alpha
        passthrough = x.view_as(x)
        return passthrough

    @staticmethod
    def backward(ctx, grad_output):
        # Second return value is the (non-existent) gradient w.r.t. alpha
        flipped = torch.neg(grad_output).mul(ctx.alpha)
        return flipped, None


class GradientReversalLayer(nn.Module):
    """Module wrapper that applies ``GradientReversalFunction`` with a fixed ``alpha``."""

    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        reverse = GradientReversalFunction.apply
        return reverse(x, self.alpha)

# Custom BioBERT + DANN Model
class BioBERT_DANN(nn.Module):
    """Encoder with a class head and a gradient-reversed two-way domain head.

    The encoder is loaded from the module-level ``MODEL_NAME``.
    """

    def __init__(self, num_labels=4, alpha=1.0):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.misinfo_classifier = nn.Linear(hidden_dim, num_labels)
        self.domain_classifier = nn.Linear(hidden_dim, 2)
        self.grl = GradientReversalLayer(alpha)

    def forward(self, input_ids, attention_mask):
        """Return ``(misinformation_logits, domain_logits)`` for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output

        # Class head sees the pooled output through dropout
        misinformation_logits = self.misinfo_classifier(self.dropout(pooled))

        # Domain head sees it through gradient reversal, then dropout
        reversed_features = self.grl(pooled)
        domain_logits = self.domain_classifier(self.dropout(reversed_features))

        return misinformation_logits, domain_logits

# Initialize Model
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Domain adaptation strength, shared by the model and the training loss
alpha = 0.1
model = BioBERT_DANN(alpha=alpha).to(device)

# One optimizer over all parameters; separate loss objects per head
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()
domain_loss_fn = nn.CrossEntropyLoss()

# Training Loop
def train_model(model, dataloader, optimizer, loss_fn, domain_loss_fn, device, alpha):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns ``(class_logits, domain_logits)``.
        dataloader: iterable of dict batches with keys ``input_ids``,
            ``attention_mask``, ``labels`` and ``domain_labels``.
        optimizer: optimizer over the model parameters.
        loss_fn: loss for the class logits.
        domain_loss_fn: loss for the domain logits.
        device: device the batch tensors are moved to.
        alpha: weight of the domain loss in the total loss.

    Returns:
        Mean of ``loss_fn + alpha * domain_loss_fn`` over the batches as a
        float, or ``0.0`` when ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        input_ids, attention_mask = batch["input_ids"].to(device), batch["attention_mask"].to(device)
        labels, domain_labels = batch["labels"].to(device), batch["domain_labels"].to(device)

        misinformation_logits, domain_logits = model(input_ids, attention_mask)

        loss_misinfo = loss_fn(misinformation_logits, labels)
        loss_domain = domain_loss_fn(domain_logits, domain_labels)
        loss = loss_misinfo + alpha * loss_domain  # Adversarial training

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches (instead of len(dataloader)) avoids a ZeroDivisionError
    # on an empty loader and also supports iterables without __len__.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches




Extracting BioBERT Embeddings: 100%|██████████| 32/32 [01:17<00:00,  2.43s/it]


In [None]:
# Function to evaluate the model on the test set
def evaluate_model(model, dataloader, loss_fn, domain_loss_fn, device, alpha):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1)`` where the loss
        is ``loss_fn + alpha * domain_loss_fn`` averaged over the batches and
        precision / recall / F1 use weighted averaging over the class
        predictions.
    """
    model.eval()
    running_loss = 0
    predictions, references = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            domain_labels = batch["domain_labels"].to(device)

            misinformation_logits, domain_logits = model(input_ids, attention_mask)

            task_term = loss_fn(misinformation_logits, labels)
            domain_term = domain_loss_fn(domain_logits, domain_labels)
            combined = task_term + alpha * domain_term  # Adversarial loss
            running_loss += combined.item()

            predictions.extend(misinformation_logits.argmax(dim=1).cpu().numpy())
            references.extend(labels.cpu().numpy())

    averaging = {"average": "weighted"}
    accuracy = accuracy_score(references, predictions)
    precision = precision_score(references, predictions, **averaging)
    recall = recall_score(references, predictions, **averaging)
    f1 = f1_score(references, predictions, **averaging)

    mean_loss = running_loss / len(dataloader)
    return mean_loss, accuracy, precision, recall, f1

# Train Model
num_epochs = 20
for epoch in range(num_epochs):
    train_loss = train_model(model, train_dataloader, optimizer, loss_fn, domain_loss_fn, device, alpha)
    epoch_number = epoch + 1
    print(f"Epoch {epoch_number}: Train Loss={train_loss:.4f}")

# Persist the weights and the tokenizer on Drive
torch.save(model.state_dict(), "/content/drive/MyDrive/BioBERT_DANN.pth")
tokenizer.save_pretrained("/content/drive/MyDrive/BioBERT_DANN_tokenizer")

print("Model training and saving completed!")

# Final metrics on the held-out test split
print("Evaluating on Test Set...")
test_loss, test_acc, test_prec, test_recall, test_f1 = evaluate_model(
    model, test_dataloader, loss_fn, domain_loss_fn, device, alpha
)

print("Test Set Results:")
for metric_name, metric_value in (
    ("Loss", test_loss),
    ("Accuracy", test_acc),
    ("Precision", test_prec),
    ("Recall", test_recall),
    ("F1-score", test_f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


Epoch 1: Train Loss=1.2897
Epoch 2: Train Loss=1.2187
Epoch 3: Train Loss=1.1493
Epoch 4: Train Loss=1.0074
Epoch 5: Train Loss=0.9364
Epoch 6: Train Loss=0.9326
Epoch 7: Train Loss=0.8233
Epoch 8: Train Loss=0.7457
Epoch 9: Train Loss=0.6753
Epoch 10: Train Loss=0.6175
Epoch 11: Train Loss=0.5536
Epoch 12: Train Loss=0.4748
Epoch 13: Train Loss=0.4165
Epoch 14: Train Loss=0.3520
Epoch 15: Train Loss=0.3204
Epoch 16: Train Loss=0.2627
Epoch 17: Train Loss=0.2248
Epoch 18: Train Loss=0.2076
Epoch 19: Train Loss=0.1761
Epoch 20: Train Loss=0.1609
Model training and saving completed!
Evaluating on Test Set...
Test Set Results:
🔹 Loss: 1.2302
🔹 Accuracy: 0.6364
🔹 Precision: 0.6220
🔹 Recall: 0.6364
🔹 F1-score: 0.6137


In [None]:
# STEP 1: Setup
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from tqdm import tqdm

os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# STEP 2: Load and preprocess data
# Formal corpus -> domain 0
formal_df = (
    pd.read_csv("/content/drive/MyDrive/Diabetes_cleaned.csv")
    .dropna(subset=["content", "label"])
    .rename(columns={"content": "text", "label": "label"})
    .assign(label=lambda frame: frame["label"].astype(int), domain=0)
)
# Informal corpus -> domain 1 (note the trailing space in "Text ")
informal_df = (
    pd.read_csv("/content/drive/MyDrive/Corrected_Labeled.csv", encoding="ISO-8859-1")
    .dropna(subset=["Text ", "Label"])
    .rename(columns={"Text ": "text", "Label": "label"})
    .assign(label=lambda frame: frame["label"].astype(int), domain=1)
)

# Stack both corpora with a fresh 0..n-1 index
combined_df = pd.concat([formal_df, informal_df], ignore_index=True)

# STEP 3: Tokenization and Dataset
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

class MisinformationDataset(Dataset):
    """Per-sample tokenizing dataset yielding text encodings, class id and domain id.

    ``texts``, ``labels`` and ``domains`` must expose ``.tolist()``; each
    sample is padded / truncated to ``max_len`` tokens when it is accessed.
    """

    def __init__(self, texts, labels, domains, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.domains = domains.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``labels``, ``domain_labels``."""
        text = self.texts[idx]
        tokenized = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # squeeze(0) strips the batch axis that return_tensors="pt" adds
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)
        class_id = torch.tensor(self.labels[idx], dtype=torch.long)
        domain_id = torch.tensor(self.domains[idx], dtype=torch.long)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": class_id,
            "domain_labels": domain_id
        }

# 80/20 split of the combined corpus, stratified on the class label
(train_texts, test_texts,
 train_labels, test_labels,
 train_domains, test_domains) = train_test_split(
    combined_df["text"],
    combined_df["label"],
    combined_df["domain"],
    test_size=0.2,
    stratify=combined_df["label"],
    random_state=42,
)

train_dataset = MisinformationDataset(train_texts, train_labels, train_domains, tokenizer)
test_dataset = MisinformationDataset(test_texts, test_labels, test_domains, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Class weights: "balanced" weights computed from the training labels only
label_classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=label_classes, y=train_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

#Define Gradient Reversal Layer and Model
class GradientReversalFunction(torch.autograd.Function):
    """Identity in the forward pass; multiplies the incoming gradient by ``-alpha`` in backward."""

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha  # stored for backward
        same_values = x.view_as(x)
        return same_values

    @staticmethod
    def backward(ctx, grad_output):
        negated = grad_output.neg()
        # alpha itself receives no gradient, hence the trailing None
        return negated * ctx.alpha, None


class GradientReversalLayer(nn.Module):
    """``nn.Module`` wrapper around ``GradientReversalFunction`` with a fixed ``alpha``."""

    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        strength = self.alpha
        return GradientReversalFunction.apply(x, strength)

class BioBERT_DANN(nn.Module):
    """Encoder with a class head and a two-way domain head behind gradient reversal."""

    def __init__(self, model_name, num_labels=3, alpha=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_dim = self.bert.config.hidden_size
        self.label_classifier = nn.Linear(hidden_dim, num_labels)
        self.domain_classifier = nn.Linear(hidden_dim, 2)
        self.grl = GradientReversalLayer(alpha)

    def forward(self, input_ids, attention_mask):
        """Return ``(label_logits, domain_logits)``.

        Dropout is applied to the pooled output separately for each head
        (class head first, then domain head).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output

        label_input = self.dropout(pooled)
        label_logits = self.label_classifier(label_input)

        domain_input = self.grl(self.dropout(pooled))
        domain_logits = self.domain_classifier(domain_input)

        return label_logits, domain_logits

# STEP 6: Train and Evaluate
encoder_name = "dmis-lab/biobert-base-cased-v1.1"
model = BioBERT_DANN(encoder_name).to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# The class loss is weighted by the balanced class weights; the domain loss is not
task_loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
domain_loss_fn = nn.CrossEntropyLoss()

def train(model, loader):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Relies on the module-level ``device``, ``optimizer``, ``task_loss_fn``
    and ``domain_loss_fn``. The per-batch loss is the class loss plus 0.1
    times the domain loss.
    """
    model.train()
    running = 0
    batch_fields = ("input_ids", "attention_mask", "labels", "domain_labels")
    for batch in tqdm(loader, desc="Training"):
        input_ids, attention_mask, labels, domains = (
            batch[field].to(device) for field in batch_fields
        )

        model.zero_grad()
        task_logits, domain_logits = model(input_ids, attention_mask)
        loss = task_loss_fn(task_logits, labels) + 0.1 * domain_loss_fn(domain_logits, domains)
        loss.backward()
        optimizer.step()

        running += loss.item()
    return running / len(loader)

def evaluate(model, loader):
    """Print a per-class classification report for ``model`` on ``loader``.

    Uses the module-level ``device``; only the class logits are scored, the
    domain logits are ignored. Returns ``None``.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for batch in loader:
            labels = batch["labels"].to(device)
            task_logits, _ = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            predicted.extend(torch.argmax(task_logits, dim=1).cpu().numpy())
            actual.extend(labels.cpu().numpy())

    report = classification_report(
        actual,
        predicted,
        target_names=["True", "False", "Partially True"],
        digits=4,
    )
    print(report)

# STEP 7: Train and Report
# Three training epochs, reporting the mean batch loss after each one
for epoch in range(3):
    loss = train(model, train_loader)
    print(f"Epoch {epoch+1} | Training Loss: {loss:.4f}")

# "\N" is only valid as "\N{unicode name}" and is otherwise a SyntaxError;
# the intended escape here is a newline before the heading.
print("\nEvaluation on Test Set:")
evaluate(model, test_loader)


Training: 100%|██████████| 159/159 [01:01<00:00,  2.58it/s]


Epoch 1 | Training Loss: 0.7931


Training: 100%|██████████| 159/159 [01:00<00:00,  2.64it/s]


Epoch 2 | Training Loss: 0.4547


Training: 100%|██████████| 159/159 [00:59<00:00,  2.66it/s]


Epoch 3 | Training Loss: 0.3058

📊 Evaluation on Test Set:
                precision    recall  f1-score   support

          True     0.9338    0.9129    0.9233       448
         False     0.9065    0.8129    0.8571       155
Partially True     0.3276    0.5938    0.4222        32

      accuracy                         0.8724       635
     macro avg     0.7226    0.7732    0.7342       635
  weighted avg     0.8966    0.8724    0.8819       635

