In [1]:
import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
from IPython.display import clear_output
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

from torch import cuda
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
device = 'cuda' if cuda.is_available() else 'cpu'
# Bare expression so the notebook displays which device was selected
device

'cuda'

In [2]:
# Preparing data: load the labelled training sheet and preview the first rows.
# The preview shows that 'concerns' holds one or more space-separated labels per tweet.

df_org = pd.read_excel("train.xlsx")
df_org.head()

Unnamed: 0,id,tweet_id,content,concerns
0,0,1361368042832289794t,@imaerell @BradfatherSpeak @ALEXNEWMAN_JOU New...,side-effect unnecessary
1,1,1326493920935686145t,"@theousherwood @LBC I’m not anti vaccine, but ...",pharma
2,2,1255282049486815235t,@BorisJohnson I won’t be taking any vaccine ev...,none
3,3,1354819682293833732t,@LPerrins They have set this up that nothing w...,ineffective
4,4,1328103483812302855t,@AngelaDeAngelo I believe I read it on one of ...,rushed


In [3]:
# Expand each tweet into one row per concern.
# The space-separated 'concerns' string is split and exploded in one vectorised
# pass instead of growing a DataFrame with pd.concat inside a nested loop
# (which copies the whole frame on every appended row).
# The guard makes the cell safe to re-run: once 'concerns' has been replaced by
# 'concern_split' the expansion is skipped instead of raising a KeyError.
if 'concerns' in df_org.columns:
    df2 = (
        df_org
        .assign(concern_split=df_org['concerns'].str.split(' '))
        .explode('concern_split')
        # Rows without any concern value carry no label and are discarded
        .dropna(subset=['concern_split'])
        .drop(columns=['concerns'])
        .reset_index(drop=True)
    )
    df_org = df2
elif 'concern_split' not in df_org.columns:
    raise KeyError("df_org has neither a 'concerns' nor a 'concern_split' column")
df_org.head()

Unnamed: 0,id,tweet_id,content,concern_split
0,0,1361368042832289794t,@imaerell @BradfatherSpeak @ALEXNEWMAN_JOU New...,side-effect
1,0,1361368042832289794t,@imaerell @BradfatherSpeak @ALEXNEWMAN_JOU New...,unnecessary
2,1,1326493920935686145t,"@theousherwood @LBC I’m not anti vaccine, but ...",pharma
3,2,1255282049486815235t,@BorisJohnson I won’t be taking any vaccine ev...,none
4,3,1354819682293833732t,@LPerrins They have set this up that nothing w...,ineffective


In [4]:
# Distinct concern names in order of first appearance, whitespace-trimmed
labels = [str(name).strip() for name in df_org['concern_split'].unique().tolist()]

NUM_LABELS = len(labels)
# Two-way lookup between integer class ids and concern names
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

# Integer class id for every row, looked up from the trimmed concern name
df_org["labels"] = df_org.concern_split.map(lambda concern: label2id[concern.strip()])
df_org.head()

Unnamed: 0,id,tweet_id,content,concern_split,labels
0,0,1361368042832289794t,@imaerell @BradfatherSpeak @ALEXNEWMAN_JOU New...,side-effect,0
1,0,1361368042832289794t,@imaerell @BradfatherSpeak @ALEXNEWMAN_JOU New...,unnecessary,1
2,1,1326493920935686145t,"@theousherwood @LBC I’m not anti vaccine, but ...",pharma,2
3,2,1255282049486815235t,@BorisJohnson I won’t be taking any vaccine ev...,none,3
4,3,1354819682293833732t,@LPerrins They have set this up that nothing w...,ineffective,4


In [5]:
#Using BERT model to make prediction

# Single source for the pretrained checkpoint name used by tokenizer and model
BASE_CHECKPOINT = "bert-base-uncased"

# NOTE(review): max_length is handed to from_pretrained here; confirm it has the
# intended effect, since the tokenizer calls in this notebook rely on truncation=True.
tokenizer = BertTokenizerFast.from_pretrained(BASE_CHECKPOINT, max_length=512)
# Classification head sized to the number of concerns, with the label maps
# stored in the model configuration
model = BertForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
SIZE = df_org.shape[0]

# Sequential (unshuffled) partition: first half for training, the next quarter
# for validation, the final quarter for testing
train_end = SIZE // 2
val_end = (3 * SIZE) // 4
split_slices = (slice(None, train_end), slice(train_end, val_end), slice(val_end, None))

train_texts, val_texts, test_texts = (list(df_org.content[part]) for part in split_slices)
train_labels, val_labels, test_labels = (list(df_org.labels[part]) for part in split_slices)

len(train_texts), len(val_texts), len(test_texts)

(4125, 2062, 2063)

In [7]:
class DataLoader(Dataset):
    """Dataset pairing tokenizer encodings with their integer labels.

    Despite its name this is a ``Dataset`` subclass: it serves one example at a
    time and does no batching itself.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        Every field of the encodings is indexed at ``idx`` and converted to a
        tensor; the matching label is added under the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [8]:
def _encode(texts):
    """Tokenize a list of texts with truncation and padding enabled."""
    return tokenizer(texts, truncation=True, padding=True)


# Each split is tokenized independently
train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

# Wrap encodings and labels in the Dataset subclass defined above
train_dataloader = DataLoader(train_encodings, train_labels)
val_dataloader = DataLoader(val_encodings, val_labels)
test_dataset = DataLoader(test_encodings, test_labels)

In [9]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'Accuracy'``, ``'F1'``, ``'Precision'`` and ``'Recall'``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the highest score along the last axis
    y_pred = pred.predictions.argmax(-1)

    # Macro averaging; the fourth returned value (support) is not used
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )

    metrics = {'Accuracy': accuracy_score(y_true, y_pred)}
    metrics['F1'] = macro_f1
    metrics['Precision'] = macro_precision
    metrics['Recall'] = macro_recall
    return metrics

In [10]:
training_args = TrainingArguments(
    # Where model predictions and checkpoints are written
    output_dir='./Bert-covid',
    do_train=True,
    do_eval=True,

    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,      # steps of linear warmup
    weight_decay=0.01,
    fp16=True,

    # --- logging ---
    logging_strategy='steps',
    logging_steps=50,
    logging_dir='./multi-class-logs',   # TensorBoard log directory

    # --- evaluation / checkpointing ---
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,                      # pre-trained model to fine-tune
    args=training_args,               # arguments defined above
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics,
)

In [12]:
# Fine-tune the model, then save the trained model and the tokenizer into the
# same directory so both can later be reloaded from model_path
trainer.train()
model_path = "bert-covid-V2"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,2.2707,2.120111,0.306984,0.039161,0.025607,0.083202
100,2.0331,1.887048,0.36324,0.084111,0.106533,0.12284
150,1.8637,1.723808,0.417556,0.172132,0.207362,0.199474
200,1.7322,1.652264,0.487876,0.235242,0.304694,0.254723
250,1.7338,1.47026,0.536857,0.289266,0.29339,0.305914
300,1.3308,1.393724,0.556256,0.345493,0.392762,0.353216
350,1.3467,1.403833,0.548982,0.338389,0.399077,0.334981
400,1.3597,1.314953,0.57323,0.391228,0.565147,0.380915
450,1.3295,1.27603,0.580504,0.42423,0.528127,0.400359
500,1.2621,1.247529,0.588749,0.443708,0.541995,0.425116


Checkpoint destination directory ./Bert-covid\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


('bert-covid-V2\\tokenizer_config.json',
 'bert-covid-V2\\special_tokens_map.json',
 'bert-covid-V2\\vocab.txt',
 'bert-covid-V2\\added_tokens.json',
 'bert-covid-V2\\tokenizer.json')

In [17]:
# Reload the fine-tuned model and its tokenizer from the directory they were saved to
model_path = "bert-covid-V2"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Move the model to the target device BEFORE building the pipeline, and give the
# pipeline that same device explicitly. Building it first and moving the model
# afterwards can leave the pipeline placing its inputs on a different device
# than the model weights.
model = model.to(device)
# Integer form of the device (GPU ordinal 0, or -1 for CPU)
pipeline_device = 0 if device == 'cuda' else -1
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=pipeline_device)

# Bare expression so the notebook still displays the loaded architecture
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
def check_accuracy(data_check, data_test):
    """Score the module-level ``nlp`` pipeline against the reference concerns.

    Each row of ``data_test`` is classified with ``nlp``. The predicted label is
    compared with every space-separated token of the ``'concerns'`` value(s)
    that ``data_check`` holds for the same ``'id'``, and each equal token adds
    one to the hit count.

    Args:
        data_check: DataFrame with ``'id'`` and ``'concerns'`` columns
            (``'concerns'`` being a space-separated string of labels).
        data_test: DataFrame with ``'id'`` and ``'content'`` columns. Any index
            is accepted, since rows are iterated rather than looked up by position.

    Returns:
        Hit count as a percentage of ``len(data_test)``; ``0.0`` when
        ``data_test`` is empty (instead of dividing by zero).
    """
    # Group the reference 'concerns' values by id once, so each test row costs
    # one dictionary lookup instead of a full scan of data_check.
    concerns_by_id = {}
    for ref_id, ref_concerns in zip(data_check['id'], data_check['concerns']):
        concerns_by_id.setdefault(ref_id, []).append(ref_concerns)

    total = len(data_test)
    acc_correct = 0
    for check_id, check_content in zip(data_test['id'], data_test['content']):
        # The pipeline returns a list of predictions; the first one's label is used
        check_string = nlp(check_content)[0]['label']
        # Split lazily so only reference rows that are actually matched get parsed
        for raw_concerns in concerns_by_id.get(check_id, ()):
            acc_correct += raw_concerns.split(' ').count(check_string)
        # Overwrite the previous progress line with the id just processed
        clear_output(wait=True)
        print(check_id)
    clear_output(wait=True)
    print("Completed!!")

    if total == 0:
        return 0.0
    return acc_correct * 100 / total

In [13]:
#Checking Accuracy

# Fresh copy of the training sheet, still carrying the original 'concerns' strings
df_check = pd.read_excel("train.xlsx")

# Row boundaries: first half / next quarter / last quarter of df_org
half_mark = SIZE // 2
three_quarter_mark = (3 * SIZE) // 4

# reset_index() renumbers each part from 0 and keeps the old position in an 'index' column
df_train = df_org[:half_mark].reset_index()
df_val = df_org[half_mark:three_quarter_mark].reset_index()
df_test = df_org[three_quarter_mark:].reset_index()

In [None]:
# Score the train, validation and test partitions, in that order
acc_train, acc_val, acc_test = (
    check_accuracy(df_check, partition) for partition in (df_train, df_val, df_test)
)

87


In [19]:
# Report the percentage returned by check_accuracy for each partition
print(f"Train Dataset Accuracy: {acc_train}%")
print(f"Validate Dataset Accuracy: {acc_val}%")
print(f"Test Dataset Accuracy: {acc_test}%")

Train Dataset Accuracy: 87.36969696969697%
Validate Dataset Accuracy: 73.66634335596508%
Test Dataset Accuracy: 73.04895782840524%


In [27]:
# Load the test sheet to be labelled with a single concern per tweet and preview it
data_sl = pd.read_excel("test.xlsx")
data_sl.head()

Unnamed: 0,id,tweet_id,content
0,0,1365819211206057993t,No thanks. Wonder if it will be as good as the...
1,1,1336397084891652097t,"@DrEricDing This vaccine is a farce, it’s stor..."
2,2,1333458562815844352t,@StefMylesTennis @disclosetv @CookieFreshPimp ...
3,3,1327908582403280896t,@DL7010 You obviously do not see it as a probl...
4,4,1374806491065155596t,@Potso_Sego Maybe good news. Just read an arti...


In [None]:
# Single-label inference: store the label of the pipeline's first prediction
# for every test tweet in a 'concerns' column
len_data = len(data_sl)
for row_idx in range(len_data):
    top_prediction = nlp(data_sl.loc[row_idx, 'content'])[0]
    data_sl.loc[row_idx, 'concerns'] = top_prediction['label']
    # Overwrite the previous progress line with the current row number
    clear_output(wait=True)
    print(row_idx)
clear_output(wait=True)
print("Completed!!")

In [29]:
# Preview the frame after single-label inference.
# NOTE(review): the recorded output for this cell shows no 'concerns' column, which
# suggests it was executed before the prediction loop - re-run to refresh it.
data_sl.head()

Unnamed: 0,id,tweet_id,content
0,0,1365819211206057993t,No thanks. Wonder if it will be as good as the...
1,1,1336397084891652097t,"@DrEricDing This vaccine is a farce, it’s stor..."
2,2,1333458562815844352t,@StefMylesTennis @disclosetv @CookieFreshPimp ...
3,3,1327908582403280896t,@DL7010 You obviously do not see it as a probl...
4,4,1374806491065155596t,@Potso_Sego Maybe good news. Just read an arti...


In [21]:
# Write the single-label predictions to CSV, omitting the DataFrame index
data_sl.to_csv("output_singlelabel.csv", index=False)

In [22]:
def predict(text):
    """Return the class-probability tensor for ``text``.

    Uses the module-level ``tokenizer`` and ``model``. The tokenized inputs are
    moved to the device the model currently lives on (rather than a hard-coded
    ``"cuda"``), so the function also works on CPU-only machines.

    Args:
        text: input passed straight to the tokenizer.

    Returns:
        The first model output with softmax applied along dimension 1.
        Presumably shaped (batch, num_labels) - callers index dimension 1 as the
        class axis. The tensor is detached from autograd.
    """
    # Tokenize and place the tensors on the same device as the model
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors="pt"
    ).to(model.device)

    # Inference only: skip gradient tracking to save memory and time
    with torch.no_grad():
        outputs = model(**inputs)
        # outputs[0] holds the logits; softmax over dimension 1 gives probabilities
        probs = outputs[0].softmax(1)

    return probs

In [23]:
# Minimum softmax probability a concern needs in order to be reported for a tweet
CONCERN_THRESHOLD = 0.200

data_ml = pd.read_excel("test.xlsx")
len_data = len(data_ml)

for y in range(0, len_data):
    out_tensor = predict(data_ml.loc[y, 'content'])
    # Column indices (class ids) whose probability exceeds the threshold
    indices_list = torch.where(out_tensor > CONCERN_THRESHOLD)[1].tolist()
    if not indices_list:
        # Nothing cleared the threshold: fall back to the single most likely
        # class so no tweet is written out with an empty 'concerns' value
        indices_list = [out_tensor.argmax().item()]
    # Translate class ids to concern names, skipping ids missing from id2label
    str_indices = [id2label[key] for key in indices_list if key in id2label]
    label_string = " ".join(str_indices)
    data_ml.loc[y, 'concerns'] = label_string
    # Overwrite the previous progress line with the current row number
    clear_output(wait=True)
    print(y)
clear_output(wait=True)
print("Completed!!")

Completed!!


In [25]:
# Preview the multi-label predictions ('concerns' holds space-separated labels)
data_ml.head()

Unnamed: 0,id,tweet_id,content,concerns
0,0,1365819211206057993t,No thanks. Wonder if it will be as good as the...,ineffective rushed
1,1,1336397084891652097t,"@DrEricDing This vaccine is a farce, it’s stor...",ineffective
2,2,1333458562815844352t,@StefMylesTennis @disclosetv @CookieFreshPimp ...,ineffective
3,3,1327908582403280896t,@DL7010 You obviously do not see it as a probl...,side-effect rushed ingredients
4,4,1374806491065155596t,@Potso_Sego Maybe good news. Just read an arti...,side-effect


In [26]:
# Write the multi-label predictions to CSV, omitting the DataFrame index
data_ml.to_csv("output_multilabel.csv", index=False)