In [1]:
from google.colab import files

uploaded_files = files.upload()  # This allows multiple file upload at once


Saving labels-levela.csv to labels-levela.csv
Saving test_a_labels_all.csv to test_a_labels_all.csv
Saving test_a_tweets_all.tsv to test_a_tweets_all.tsv
Saving testset-levela.tsv to testset-levela.tsv


In [37]:
!pip install transformers   # install Huggingface’s transformers library



In [38]:
import numpy as np
import pandas as pd
import torch
import time
import torch.nn as nn
from sklearn.model_selection import train_test_split     #test,validation,train
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast       # tokenizepunctuation,space, ids where word exist
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# paramters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    #gpu graphic processing unit #api to access gpu is cuda

In [39]:
df_olid_labels=pd.read_csv('labels-levela.csv')
df_solid_labels = pd.read_csv('test_a_labels_all.csv')
df_solid_testset = pd.read_table('test_a_tweets_all.tsv')
df_olid_testset = pd.read_table('testset-levela.tsv')
print(df_olid_labels.columns)
print(df_solid_labels.columns)
print(df_solid_testset.columns)
print(df_olid_testset.columns)#The columns in both datasets are not named properly
df_olid_labels.columns = ['id', 'label']
df_solid_labels.columns = ['id', 'label']

#check if column names are updated

print(df_olid_labels.columns)
print(df_solid_labels.columns)
print(df_solid_testset.columns)
print(df_olid_testset.columns)


Index(['15923', 'OFF'], dtype='object')
Index(['BC0', 'OFF'], dtype='object')
Index(['id', 'tweet'], dtype='object')
Index(['id', 'tweet'], dtype='object')
Index(['id', 'label'], dtype='object')
Index(['id', 'label'], dtype='object')
Index(['id', 'tweet'], dtype='object')
Index(['id', 'tweet'], dtype='object')


In [40]:
print(df_olid_labels['id'].isin(df_olid_testset['id']).value_counts())
print(df_solid_labels['id'].isin(df_solid_testset['id']).value_counts())


True    859
Name: id, dtype: int64
True    5992
Name: id, dtype: int64


**Data Preprocessing: **
This function lowercases the text, removes mentions and URLs, and removes special characters and numbers. You might need to modify this function based on our specific requirements. For example, if emojis or certain special characters are important for your task, you might want to keep them.

In [42]:
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        # Lowercase the text
        text = text.lower()

        # Remove mentions
        text = re.sub(r'@[\w]*', '', text)

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'[0-9]', '', text)
    else:
        text = ''  # If the input is not a string, return an empty string (or handle it in a way that suits your needs)
    return text


**Preparing Data For Bert Model:**
This will tokenize the tweets and add the special [CLS] and [SEP] tokens required by BERT. It also encodes the labels into a format that can be understood by our model (i.e., 0 and 1 for binary classification tasks).

In [43]:
import re
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder


# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Preprocess the tweets
df_olid_testset['tweet'] = df_olid_testset['tweet'].apply(preprocess_text)
df_solid_testset['tweet'] = df_solid_testset['tweet'].apply(preprocess_text)

# Tokenize the tweets
df_olid_testset['tweet'] = df_olid_testset['tweet'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
df_solid_testset['tweet'] = df_solid_testset['tweet'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Encode the labels
le = LabelEncoder()
df_olid_labels['label'] = le.fit_transform(df_olid_labels['label'])
df_solid_labels['label'] = le.fit_transform(df_solid_labels['label'])

**Splitting the Dataset:** This will create a training set and a validation set for both the OLID and SOLID datasets. The validation set is typically a smaller portion of the dataset that is used to evaluate the model during training.

In [44]:
# Merge the tweets and labels based on 'id'
df_olid = pd.merge(df_olid_testset, df_olid_labels, on='id')
df_solid = pd.merge(df_solid_testset, df_solid_labels, on='id')

# Split the OLID dataset
train_inputs_olid, validation_inputs_olid, train_labels_olid, validation_labels_olid = train_test_split(df_olid['tweet'].tolist(), df_olid['label'].tolist(), test_size=0.1, random_state=42)

# Split the SOLID dataset
train_inputs_solid, validation_inputs_solid, train_labels_solid, validation_labels_solid = train_test_split(df_solid['tweet'].tolist(), df_solid['label'].tolist(), test_size=0.1, random_state=42)


In [45]:
# For training inputs
train_masks_olid = [[float(i != 0.0) for i in ii] for ii in train_inputs_olid]
train_masks_solid = [[float(i != 0.0) for i in ii] for ii in train_inputs_solid]

# For validation inputs
validation_masks_olid = [[float(i != 0.0) for i in ii] for ii in validation_inputs_olid]
validation_masks_solid = [[float(i != 0.0) for i in ii] for ii in validation_inputs_solid]


Define Max length of sentences in dataset.

In [46]:
from keras.preprocessing.sequence import pad_sequences

# Define a maximum length for your sequences
MAX_LEN = 64

# Pad the sequences
train_inputs_olid = pad_sequences(train_inputs_olid, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
validation_inputs_olid = pad_sequences(validation_inputs_olid, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

train_inputs_solid = pad_sequences(train_inputs_solid, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
validation_inputs_solid = pad_sequences(validation_inputs_solid, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


**Create PyTorch DataLoaders:** These are used to efficiently load our data in batches during training.

In [47]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Convert data into torch tensors
train_inputs_olid = torch.tensor(train_inputs_olid)
validation_inputs_olid = torch.tensor(validation_inputs_olid)
train_labels_olid = torch.tensor(train_labels_olid)
validation_labels_olid = torch.tensor(validation_labels_olid)

train_inputs_solid = torch.tensor(train_inputs_solid)
validation_inputs_solid = torch.tensor(validation_inputs_solid)
train_labels_solid = torch.tensor(train_labels_solid)
validation_labels_solid = torch.tensor(validation_labels_solid)

# Create DataLoader for training data
train_data_olid = TensorDataset(train_inputs_olid, train_labels_olid)
train_sampler_olid = RandomSampler(train_data_olid)
train_dataloader_olid = DataLoader(train_data_olid, sampler=train_sampler_olid, batch_size=32)

train_data_solid = TensorDataset(train_inputs_solid, train_labels_solid)
train_sampler_solid = RandomSampler(train_data_solid)
train_dataloader_solid = DataLoader(train_data_solid, sampler=train_sampler_solid, batch_size=32)

# Create DataLoader for validation data
validation_data_olid = TensorDataset(validation_inputs_olid, validation_labels_olid)
validation_sampler_olid = SequentialSampler(validation_data_olid)
validation_dataloader_olid = DataLoader(validation_data_olid, sampler=validation_sampler_olid, batch_size=32)

validation_data_solid = TensorDataset(validation_inputs_solid, validation_labels_solid)
validation_sampler_solid = SequentialSampler(validation_data_solid)
validation_dataloader_solid = DataLoader(validation_data_solid, sampler=validation_sampler_solid, batch_size=32)


**Setup Model Optimizer**

In [48]:
from transformers import BertForSequenceClassification, AdamW

# Load the BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the base BERT model
    num_labels = 2, # The number of output labels--2 for binary classification
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU
model.cuda()

# Set up the optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  # Learning Rate - Default is 5e-5
                  eps = 1e-8 # Adam Epsilon  - Default is 1e-8.
                )


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Define Training Loop** train  model for 4 epochs. It calculates the training loss for each epoch and prints the validation accuracy after each epoch. You can adjust the number of epochs based on your requirements.

In [49]:
from transformers import get_linear_schedule_with_warmup

# Define the training parameters
epochs = 8
total_steps = len(train_dataloader_olid) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Initialize lists to store the loss values for each epoch
train_losses = []
valid_losses = []

# For each epoch...
for epoch_i in range(0, epochs):
    # Training
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader_olid):
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, labels=b_labels)
        loss = outputs[0]

        # L1 regularization
        l1_lambda = 0.0001
        l1_norm = sum(p.abs().sum() for p in model.parameters())
        loss += l1_lambda * l1_norm

        # L2 regularization
        l2_lambda = 0.0001
        l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
        loss += l2_lambda * l2_norm

        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader_olid)
    train_losses.append(avg_train_loss)
    print(f"Average training loss: {avg_train_loss:.3f}")

    # Validation
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    for batch in validation_dataloader_olid:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, labels=b_labels)
        loss, logits = outputs[:2]  # Unpack the outputs to get the loss and logits separately
        eval_loss += loss.item()  # Use the loss instead of logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    avg_valid_loss = eval_loss / len(validation_dataloader_olid)
    valid_losses.append(avg_valid_loss)
    print(f"Validation Accuracy: {eval_accuracy/nb_eval_steps:.2f}")
    print(f"Validation Loss: {avg_valid_loss:.3f}")


Average training loss: 375.853
Validation Accuracy: 0.80
Validation Loss: 0.516
Average training loss: 371.649
Validation Accuracy: 0.80
Validation Loss: 0.565
Average training loss: 368.318
Validation Accuracy: 0.80
Validation Loss: 0.514
Average training loss: 365.213
Validation Accuracy: 0.80
Validation Loss: 0.512
Average training loss: 362.599
Validation Accuracy: 0.80
Validation Loss: 0.514
Average training loss: 360.742
Validation Accuracy: 0.80
Validation Loss: 0.501
Average training loss: 359.447
Validation Accuracy: 0.80
Validation Loss: 0.501
Average training loss: 358.840
Validation Accuracy: 0.80
Validation Loss: 0.500


In [50]:
# Define a helper function for calculating accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten() if preds.ndim > 1 else preds.flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Number of training epochs
epochs = 8

# For each epoch...
for epoch_i in range(0, epochs):
    # Training
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader_olid):
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader_olid)
    print(f"Average training loss: {avg_train_loss:.3f}")

    # Validation
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    for batch in validation_dataloader_olid:
          batch = tuple(t.to(device) for t in batch)
          b_input_ids, b_labels = batch
          with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, labels=b_labels)  # Include labels to get the loss in the outputs
          loss, logits = outputs[:2]  # Unpack the outputs to get the loss and logits separately
          eval_loss += loss.item()  # Use the loss instead of logits
          logits = logits.detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()
          tmp_eval_accuracy = flat_accuracy(logits, label_ids)
          eval_accuracy += tmp_eval_accuracy
          nb_eval_steps += 1


    avg_valid_loss = eval_loss / len(validation_dataloader_olid)
    print(f"Validation Accuracy: {eval_accuracy/nb_eval_steps:.2f}")
    print(f"Validation Loss: {avg_valid_loss:.3f}")


Average training loss: 0.536
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.565
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.546
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.553
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.551
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.549
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.549
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.555
Validation Accuracy: 0.80
Validation Loss: 0.500


**Final Evaluation of Model.**

In [None]:
# Number of training epochs
epochs = 6

# Dataloaders for both datasets
dataloaders = [(train_dataloader_olid, validation_dataloader_olid), (train_dataloader_solid, validation_dataloader_solid)]

# For each dataset...
for train_dataloader, validation_dataloader in dataloaders:
    # For each epoch...
    for epoch_i in range(0, epochs):
        # Training
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)
            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.3f}")

        # Validation
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps = 0
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, labels=b_labels)
            loss, logits = outputs[:2]
            eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        avg_valid_loss = eval_loss / len(validation_dataloader)
        print(f"Validation Accuracy: {eval_accuracy/nb_eval_steps:.2f}")
        print(f"Validation Loss: {avg_valid_loss:.3f}")


Average training loss: 0.560
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.546
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.548
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.550
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.552
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.550
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.561
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.553
Validation Accuracy: 0.80
Validation Loss: 0.500
Average training loss: 0.704
Validation Accuracy: 0.50
Validation Loss: 0.710
Average training loss: 0.705
Validation Accuracy: 0.50
Validation Loss: 0.710
Average training loss: 0.702
Validation Accuracy: 0.50
Validation Loss: 0.710
