In [12]:
!pip install transformers



In [13]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/My Drive/ml/sem7_project/dataset_updated.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
df=df.rename(columns={'newlabel':'label'})
df=df.rename(columns={'tweet':'text'})

In [15]:
import numpy as np
import ast

In [16]:
df['label'] = df['label'].apply(lambda x: ast.literal_eval(x))


In [17]:
from sklearn.preprocessing import MultiLabelBinarizer
label_binarizer = MultiLabelBinarizer()
yt=label_binarizer.fit_transform(df['label'])
yt.shape

(9921, 12)

In [18]:
label_binarizer.classes_

array(['conspiracy', 'country', 'ineffective', 'ingredients', 'mandatory',
       'none', 'pharma', 'political', 'religious', 'rushed',
       'side-effect', 'unnecessary'], dtype=object)

In [19]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].values, yt, test_size=0.2, random_state=42
)

In [20]:
print(train_labels.shape)
print(val_labels.shape)


(7936, 12)
(1985, 12)


DISTILBERT MODEL

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.cuda.amp import GradScaler, autocast

# Load and preprocess your dataset
# Assuming you have a DataFrame df with columns 'text' and 'label'

# df['label'] = df['label'].astype(int)

# Split the dataset into training and validation sets
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     df['text'].values, yt, test_size=0.2, random_state=42
# )

# Tokenize the data for DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_train_encodings = distilbert_tokenizer(train_texts.tolist(), truncation=True, padding=True)
distilbert_val_encodings = distilbert_tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Convert to PyTorch tensors
distilbert_train_dataset = TensorDataset(
    torch.tensor(distilbert_train_encodings['input_ids']),
    torch.tensor(distilbert_train_encodings['attention_mask']),
    torch.tensor(train_labels)
)

distilbert_val_dataset = TensorDataset(
    torch.tensor(distilbert_val_encodings['input_ids']),
    torch.tensor(distilbert_val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

# Create DataLoader for DistilBERT
distilbert_train_loader = DataLoader(distilbert_train_dataset, batch_size=16, shuffle=True)
distilbert_val_loader = DataLoader(distilbert_val_dataset, batch_size=16, shuffle=False)

# Define and train DistilBERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=12)
distilbert_model.to(device)

# Hyperparameter tuning
epochs = 5  # Increase the number of epochs for tuning
learning_rate = 5e-5  # Initial learning rate

# Learning rate scheduler
scheduler_step_size = 2  # Adjust the step size as needed
scheduler_gamma = 0.9  # Adjust the gamma factor as needed
distilbert_optimizer = AdamW(distilbert_model.parameters(), lr=learning_rate)
distilbert_scheduler = StepLR(distilbert_optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

scaler = GradScaler()
distilbert_model.train()

for epoch in range(epochs):
    for batch in distilbert_train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        distilbert_optimizer.zero_grad()
        with autocast():
            outputs = distilbert_model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, torch.argmax(labels, dim=1))
        scaler.scale(loss).backward()
        scaler.step(distilbert_optimizer)
        scaler.update()

    # Learning rate scheduler step
    distilbert_scheduler.step()

# Validate the DistilBERT model
distilbert_model.eval()
distilbert_predictions = []
with torch.no_grad():
    for batch in distilbert_val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        distilbert_predictions.extend(predictions.tolist())



In [None]:
from sklearn.preprocessing import LabelBinarizer

# Convert the labels from multilabel-indicator format to multiclass format
lb = LabelBinarizer()
distilbert_predictions_multiclass = lb.fit_transform(distilbert_predictions)
# Calculate accuracy and other metrics
classification_rep = classification_report(val_labels, distilbert_predictions_multiclass)
# Calculate the accuracy score
distilbert_accuracy = accuracy_score(val_labels, distilbert_predictions_multiclass)
# Print the metrics
print(f'DistilBERT Accuracy: {distilbert_accuracy}')
print('Classification Report:\n', classification_rep)



Loading the distilbert and predicting labels

In [10]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np

# Load the pre-trained DistilBERT tokenizer and model
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('/content/drive/My Drive/ml/sem7_project/distilbert_model_2')

# Function to predict with DistilBERT model
def predict_sentence_distilbert(sentence):
    # Tokenize the input sentence
    inputs = distilbert_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
        logits = outputs.logits

    probabilities = torch.sigmoid(logits)
    threshold = 0.7
    predicted_labels = (probabilities > threshold).numpy()

    return predicted_labels, probabilities.numpy()

input_sentence = input()
predicted_labels, confidences = predict_sentence_distilbert(input_sentence)
predicted_label_names = label_binarizer.classes_[predicted_labels[0]]

# Print the result
print(f"Predicted Label: {predicted_label_names}")
print(f"Confidences: {confidences}")


As a machine learning student, I advise making decisions based on scientifically validated information. Rejecting the COVID-19 vaccine without credible evidence can contribute to misinformation and hinder efforts to combat the pandemic.
Predicted Label: ['ineffective' 'rushed' 'side-effect']
Confidences: [[0.12102917 0.01621871 0.95444363 0.36014846 0.10785212 0.10589144
  0.43793464 0.06599262 0.00529458 0.90601784 0.97384304 0.5222186 ]]


In [25]:
torch.cuda.empty_cache()

**CT-BERT**

In [24]:
# ct-bert
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.cuda.amp import GradScaler, autocast
from transformers import BertForPreTraining

# Load and preprocess your dataset
# Assuming you have a DataFrame df with columns 'text' and 'label'

# df['label'] = df['label'].astype(int)

# Split the dataset into training and validation sets
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     df['text'].values, yt, test_size=0.2, random_state=42
# )

# Tokenize the data for CT-BERT
ctbert_tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
ctbert_train_encodings = ctbert_tokenizer(train_texts.tolist(), truncation=True, padding=True)
ctbert_val_encodings = ctbert_tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Convert to PyTorch tensors
ctbert_train_dataset = TensorDataset(
    torch.tensor(ctbert_train_encodings['input_ids']),
    torch.tensor(ctbert_train_encodings['attention_mask']),
    torch.tensor(train_labels)
)

ctbert_val_dataset = TensorDataset(
    torch.tensor(ctbert_val_encodings['input_ids']),
    torch.tensor(ctbert_val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

# Create DataLoader for CT-BERT
ctbert_train_loader = DataLoader(ctbert_train_dataset, batch_size=16, shuffle=True)
ctbert_val_loader = DataLoader(ctbert_val_dataset, batch_size=16, shuffle=False)

# Define and train CT-BERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ctbert_model = BertForSequenceClassification.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2', num_labels=12)
ctbert_model.to(device)

# Hyperparameter tuning
epochs = 5  # Increase the number of epochs for tuning
learning_rate = 5e-5  # Initial learning rate

# Learning rate scheduler
scheduler_step_size = 2  # Adjust the step size as needed
scheduler_gamma = 0.9  # Adjust the gamma factor as needed
ctbert_optimizer = AdamW(ctbert_model.parameters(), lr=learning_rate)
ctbert_scheduler = StepLR(ctbert_optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

scaler = GradScaler()
ctbert_model.train()

for epoch in range(epochs):
    for batch in ctbert_train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        ctbert_optimizer.zero_grad()
        with autocast():
            outputs = ctbert_model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, torch.argmax(labels, dim=1))
        scaler.scale(loss).backward()
        scaler.step(ctbert_optimizer)
        scaler.update()

    # Learning rate scheduler step
    ctbert_scheduler.step()

# Validate the CT-BERT model
ctbert_model.eval()
ctbert_predictions = []
with torch.no_grad():
    for batch in ctbert_val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = ctbert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        ctbert_predictions.extend(predictions.tolist())

# Calculate accuracy and other metrics
# ctbert_accuracy = accuracy_score(val_labels, ctbert_predictions)
# classification_rep_ctbert = classification_report(val_labels, ctbert_predictions)

# Print the metrics
# print(f'CT-BERT Accuracy: {ctbert_accuracy}')
# print('Classification Report:\n', classification_rep_ctbert)

# Save the trained CT-BERT model if needed
# ctbert_model.save_pretrained('/content/drive/My Drive/ml/sem7_project/ctbert_model')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: ignored

In [23]:
ctbert_model.save_pretrained('/content/drive/My Drive/ml/sem7_project/ctbert_model')

**XLNET**

In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast

# Tokenize the data for XLNet
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_train_encodings = xlnet_tokenizer(train_texts.tolist(), truncation=True, padding=True)
xlnet_val_encodings = xlnet_tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Convert to PyTorch tensors
xlnet_train_dataset = TensorDataset(
    torch.tensor(xlnet_train_encodings['input_ids']),
    torch.tensor(xlnet_train_encodings['attention_mask']),
    torch.tensor(train_labels)
)

xlnet_val_dataset = TensorDataset(
    torch.tensor(xlnet_val_encodings['input_ids']),
    torch.tensor(xlnet_val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

# Create DataLoader for XLNet
xlnet_train_loader = DataLoader(xlnet_train_dataset, batch_size=16, shuffle=True)
xlnet_val_loader = DataLoader(xlnet_val_dataset, batch_size=16, shuffle=False)

# Define and train XLNet model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
xlnet_model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=12)
xlnet_model.to(device)

# Hyperparameter tuning for XLNet
epochs = 5  # Increase the number of epochs for tuning
learning_rate = 2e-5  # Initial learning rate for XLNet

# Learning rate scheduler for XLNet
warmup_steps = 500  # Adjust the warmup steps as needed
total_steps = len(xlnet_train_loader) * epochs
xlnet_optimizer = AdamW(xlnet_model.parameters(), lr=learning_rate)
xlnet_scheduler = get_linear_schedule_with_warmup(xlnet_optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

scaler = GradScaler()
xlnet_model.train()

for epoch in range(epochs):
    for batch in xlnet_train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        xlnet_optimizer.zero_grad()
        with autocast():
            outputs = xlnet_model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, torch.argmax(labels, dim=1))
        scaler.scale(loss).backward()
        scaler.step(xlnet_optimizer)
        scaler.update()

    # Learning rate scheduler step
    xlnet_scheduler.step()

# Validate the XLNet model
xlnet_model.eval()
xlnet_predictions = []
with torch.no_grad():
    for batch in xlnet_val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = xlnet_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        xlnet_predictions.extend(predictions.tolist())

# Convert the labels from multilabel-indicator format to multiclass format for evaluation
xlnet_accuracy = accuracy_score(val_labels, xlnet_predictions)
classification_rep_xlnet = classification_report(val_labels, xlnet_predictions)

# Print the metrics for XLNet
print(f'XLNet Accuracy: {xlnet_accuracy}')
print('Classification Report:\n', classification_rep_xlnet)
