# Import Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
test_path = '/content/drive/MyDrive/NNDL_HW5/Test.csv'
train_path = '/content/drive/MyDrive/NNDL_HW5/Train.csv'
val_path = '/content/drive/MyDrive/NNDL_HW5/Val.csv'

import pandas as pd

test_df = pd.read_csv(test_path)
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)


# train_df.head()
# val_df.head()
# test_df.sample(20)

In [3]:
train_df['label'] = train_df['label'].map({"real" : 1 , "fake" : 0})
val_df['label'] = val_df['label'].map({"real" : 1 , "fake" : 0})

train_df

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,1
1,2,States reported 1121 deaths a small rise from ...,1
2,3,Politically Correct Woman (Almost) Uses Pandem...,0
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,5,Populous states can generate large case counts...,1
...,...,...,...
6415,6416,A tiger tested positive for COVID-19 please st...,0
6416,6417,???Autopsies prove that COVID-19 is??� a blood...,0
6417,6418,_A post claims a COVID-19 vaccine has already ...,0
6418,6419,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,0


# Preprocess Data

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as functional
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, AdamW, BertConfig
import gc
from transformers import BertModel
from sklearn.metrics import roc_auc_score,f1_score
import time
import datetime

In [5]:
data = pd.concat([train_df , val_df], axis=0, ignore_index=True).drop(["id"], axis=1)
data

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,1
1,States reported 1121 deaths a small rise from ...,1
2,Politically Correct Woman (Almost) Uses Pandem...,0
3,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,Populous states can generate large case counts...,1
...,...,...
8555,Donald Trump wrongly claimed that New Zealand ...,0
8556,Current understanding is #COVID19 spreads most...,1
8557,Nothing screams “I am sat around doing fuck al...,0
8558,Birx says COVID-19 outbreak not under control ...,0


In [6]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
import re
from string import punctuation

def preprocess(data):
    #remove url and hashtag
    for i in range(data.shape[0]):
        text=data[i].lower()
        text1=''.join([word+" " for word in text.split()])
        data[i]=text1
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    space_pattern = '\s+'

    for i in range(data.shape[0]):
        text_string = data[i]
        parsed_text = re.sub(hashtag_regex, '', text_string)
        parsed_text = re.sub(giant_url_regex, '', parsed_text)
        parsed_text = re.sub(mention_regex, '', parsed_text)
        #remove punctuation
        parsed_text = re.sub(r"[{}]+".format(punctuation), '', parsed_text)
        parsed_text = re.sub(space_pattern, ' ', parsed_text)
        data[i] = parsed_text
    return data

tweets = data.tweet.values
tweets = preprocess(tweets)
print(tweets)

['the cdc currently reports 99031 deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly 100000 people today '
 'states reported 1121 deaths a small rise from last tuesday southern states reported 640 of those deaths '
 'politically correct woman almost uses pandemic as excuse not to reuse plastic bag '
 ...
 'nothing screams “i am sat around doing fuck all during lockdown” quite like confident assumption that other people are sat around doing fuck all during lockdown '
 'birx says covid19 outbreak not under control because ‘people are on the move’ '
 'another 4422 new coronavirus cases have been confirmed in the uk the highest daily number since 8 may its up from 4322 new cases reported on friday and the overall total nationwide now stands at 385936 read the latest here ']


In [8]:
tweets = data.tweet.values
labels = data.label.values

In [9]:
input_ids = []
attention_masks = []
for tweet in tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        # Sentence to encode.
                        add_special_tokens = True,
                        # Add '[CLS]' and '[SEP]'
                        max_length = 512,
                        # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        # Construct attn. masks.
                        return_tensors = 'pt',
                        # Return pytorch tensors.
                   )

    input_ids.append(encoded_dict['input_ids'])

    attention_masks.append(encoded_dict['attention_mask'])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)


print('Original: ', tweets[10])
print('Token IDs:', input_ids[10])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  take simple daily precautions to help prevent the spread of respiratory illnesses like learn how to protect yourself from coronavirus covid19 
Token IDs: tensor([  101,  2202,  3722,  3679, 29361,  2000,  2393,  4652,  1996,  3659,
         1997, 16464, 24757,  2066,  4553,  2129,  2000,  4047,  4426,  2013,
        21887, 23350,  2522, 17258, 16147,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0

# Train and Test

In [10]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size],generator=torch.Generator().manual_seed(42))

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

7,704 training samples
  856 validation samples


In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
            train_dataset,
            shuffle = True,
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            shuffle = False,
            batch_size = batch_size
        )

In [12]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-uncased",
#     num_labels = 2,
#     output_attentions = False,
#     output_hidden_states = False,
# )

from transformers import AutoTokenizer, \
BertTokenizer , AutoModel


tokenizer = AutoTokenizer.\
from_pretrained("digitalepidemiologylab/covid-twitter-bert" ,
num_labels = 2
)

model = AutoModel.\
from_pretrained("digitalepidemiologylab/covid-twitter-bert" ,
                  num_labels = 2,
                  output_attentions = False,
                  output_hidden_states = False
                  )

model.to(device)



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [14]:
optimizer = AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-8
                )
epochs = 4
criterion = nn.CrossEntropyLoss()



# CTBert

In [15]:
class CTBertGRUClassifier(nn.Module):
    def __init__(self, model_tune):
        super().__init__()
        self.ctbert = model_tune
        self.gru = nn.GRU(input_size = 768,
                            hidden_size = 768,
                            num_layers = 1,
                            batch_first = True,
                            bidirectional = True)
        self.classifier = nn.Linear(768 * 2, 2)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask,
                token_type_ids ):
        bert_output = self.ctbert(input_ids = input_ids,
                                  attention_mask = attention_mask,
                                  token_type_ids = token_type_ids)

        out, _ = self.gru(bert_output[0])
        logits = self.classifier(out[:, 1, :])
        return self.softmax(logits)

In [16]:
model2 = CTBertGRUClassifier(model)

In [None]:
import random
import numpy as np

seed_val = 42

random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()
best_accuracy = 0


for epoch_i in range(0, epochs):
    #Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model2.train()

    for step, batch in enumerate(train_dataloader):
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        print(f"Input size: {input_ids.size(-1)}")  # Should print 768


        model2.zero_grad()
        out = model2(input_ids,
                    attention_mask = input_mask ,
                     token_type_ids = None
                     )

        loss = criterion(out , labels)
        logits = out[1]
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(out , dim = 1)
        total_train_accuracy +=  torch.sum(pred == labels).item()

    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader.dataset)
    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))


    # Validation
    print("")
    print("Validation...")
    model2.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            out = model2(input_ids, token_type_ids=None, attention_mask=input_mask,labels=labels)
            loss = out[0]
            logits = out[1]

        total_eval_loss += loss.item()
        pred = torch.argmax(logits, dim = 1)
        total_eval_accuracy += torch.sum(pred == labels).item()
        y_true.append(labels.flatten())
        y_pred.append(pred.flatten())


    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader.dataset)
    print("  Validation loss: {}".format(avg_val_loss))
    training_time = format_time(time.time() - t0)
    print()

    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    print("This epoch took: {:}".format(training_time))
    print('roc_auc score: ', roc_auc_score(y_true,y_pred))
    print('F1 score:',f1_score(y_true, y_pred))
    print()

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Train Accur.': avg_train_accuracy,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
        }
    )

    print()

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        best_model = model

print()
print("="*10)
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Epoch 1 / 4
Training...
Input size: 512


In [None]:
# import torch
# from transformers import CTBertForSequenceClassification, CTBertTokenizer
# from torch.utils.data import DataLoader, Dataset
# from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
# import matplotlib.pyplot as plt
# import numpy as np

# # Define the CTBert model
# model = CTBertForSequenceClassification.from_pretrained('ctbert-base')
# tokenizer = CTBertTokenizer.from_pretrained('ctbert-base')



# # Training loop
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# loss_fn = torch.nn.CrossEntropyLoss()

# num_epochs = 5
# train_losses = []
# valid_losses = []
# accuracies = []

# for epoch in range(num_epochs):
#     model.train()
#     for step , batch in enumerate(train_dataloader):
#         optimizer.zero_grad()

#         input_ids = batch[0].to(device)
#         input_mask = batch[1].to(device)
#         labels = batch[2].to(device)

#         # input_ids = tokenizer(batch['tweet'], padding=True, truncation=True, return_tensors='pt').input_ids
#         # labels = torch.tensor(batch['label'])

#         outputs = model(input_ids, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         train_losses.append(loss.item())

#     model.eval()
#     with torch.no_grad():
#         valid_loss = 0.0
#         predictions = []
#         true_labels = []
#         for step , batch in enumerate(validation_dataloader):
#             # input_ids = tokenizer(batch['tweet'], padding=True, truncation=True, return_tensors='pt').input_ids
#             # labels = torch.tensor(batch['label'])

#             input_ids = batch[0].to(device)
#             input_mask = batch[1].to(device)

#             outputs = model(input_ids, labels=labels)
#             valid_loss += outputs.loss.item()
#             predictions.extend(outputs.logits.argmax(dim=-1).tolist())
#             true_labels.extend(labels.tolist())

#         valid_losses.append(valid_loss / len(validation_dataloader))
#         accuracy = accuracy_score(true_labels, predictions)
#         accuracies.append(accuracy)

# # Plot accuracy and loss based on epoch
# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2, 1)
# plt.plot(range(1, num_epochs + 1), accuracies)
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.title('Accuracy vs. Epoch')

# plt.subplot(1, 2, 2)
# plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
# plt.plot(range(1, num_epochs + 1), valid_losses, label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.title('Loss vs. Epoch')

# plt.show()

# # Test on test dataset
# # test_dataset = CustomDataset(test_data)
# test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# model.eval()
# predictions = []
# true_labels = []
# with torch.no_grad():
#     for batch in test_loader:
#         input_ids = tokenizer(batch['tweet'], padding=True, truncation=True, return_tensors='pt').input_ids
#         labels = torch.tensor(batch['label'])
#         outputs = model(input_ids)
#         predictions.extend(outputs.logits.argmax(dim=-1).tolist())
#         true_labels.extend(labels.tolist())

# # Confusion matrix
# conf_matrix = confusion_matrix(true_labels, predictions)
# print('Confusion Matrix:')
# print(conf_matrix)

# # Precision and Recall
# precision = precision_score(true_labels, predictions)
# recall = recall_score(true_labels, predictions)

# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
