In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import string
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_recall_fscore_support
import time
from datetime import datetime

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


#### reading data

In [None]:
path = "drive/MyDrive/Colab/NLP_project/"
training_data = [json.loads(event) for event in open(path+'project-data/train.data.jsonl', "r").readlines()]
dev_data = [json.loads(event) for event in open(path+'project-data/dev.data.jsonl', "r").readlines()]
test_data = [json.loads(event) for event in open(path+'project-data/test.data.jsonl', "r").readlines()]

train_labels = json.load(open(path+'project-data/train.label.json', "r"))
dev_labels = json.load(open(path+'project-data/dev.label.json', "r"))

train_labels = np.array([(1 if train_labels[id_str]=='rumour' else 0) for id_str in train_labels])
dev_labels = np.array([(1 if dev_labels[id_str]=='rumour' else 0) for id_str in dev_labels])

test_ids = [test_data[i][0]["id_str"] for i in range(len(test_data))]

#### preprocessing

##### sort by date for each event

In [None]:
def to_date(date_str):
    return datetime.strftime(datetime.strptime(date_str,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S')

training_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in training_data]
dev_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in dev_data]
test_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in test_data]

##### extract specific attributes(default:text) from twitter dataset

In [None]:
def extract_info(data, info="text"):
    res = []
    for i in range(len(data)):
        event = data[i]
        event_info = []
        for tw in event:
            event_info.append(tw[info])
        res.append(event_info)
    return res

training_sents = extract_info(training_data)     # {event}  where event={source,apply1,apply2,...}
dev_sents = extract_info(dev_data)
test_sents = extract_info(test_data)

In [None]:
training_sources0 = [event[0] for event in training_sents]  # [event]  where event={source}
dev_sources0 = [event[0] for event in dev_sents]
test_sources0 = [event[0] for event in test_sents]

##### filter url

In [None]:
# remove url
def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

training_sources00 = [remove_urls(s) for s in training_sources0]
dev_sources00 = [remove_urls(s) for s in dev_sources0]
test_sources00 = [remove_urls(s) for s in test_sources0]

##### word tokenization

In [None]:
training_sources0 = [a+b for a,b in training_all0]
dev_sources0 = [a+b for a,b in dev_all0]
test_sources0 = [a+b for a,b in test_all0]

In [None]:
# word tokenization
def tokenization(sent):
    return [w for w in word_tokenize(sent) if w not in string.punctuation]

training_sources1 = [tokenization(s) for s in training_sources0]             
dev_sources1 = [tokenization(s) for s in dev_sources0]
test_sources1 = [tokenization(s) for s in test_sources0]

##### stopword removal + lowercase

In [None]:
# stopwords removal + lowercase
def stop_words_removal(sent):
    stop_words = stopwords.words('english')
    new_sent = []
    for w in sent:
        w = w.lower()
        if w not in stop_words:
            new_sent.append(w)
    return new_sent

training_sources2 = [stop_words_removal(s) for s in training_sources1]
dev_sources2 = [stop_words_removal(s) for s in dev_sources1]
test_sources2 = [stop_words_removal(s) for s in test_sources1]

##### lemmatization

In [None]:
# lemmatization
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatization(sent):
    return [lemmatizer.lemmatize(word) for word in sent]

training_sources3 = [lemmatization(s) for s in training_sources2]
dev_sources3 = [lemmatization(s) for s in dev_sources2]
test_sources3 = [lemmatization(s) for s in test_sources2]

#### model

##### tf-idf + logistic regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

training_sources4 = [" ".join(words) for words in training_sources3]
dev_sources4 = [" ".join(words) for words in dev_sources3]
test_sources4 = [" ".join(words) for words in test_sources3]

# tf-idf 
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(training_sources4)

tfidf_training_sources = tfidf_vectorizer.transform(training_sources4)
tfidf_dev_sources = tfidf_vectorizer.transform(dev_sources4)
tfidf_test_sources = tfidf_vectorizer.transform(test_sources4)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import precision_recall_fscore_support
# tf_clf = RandomForestClassifier()
# tf_clf.fit(tfidf_training_sources, train_labels)
# preds = tf_clf.predict(tfidf_dev_sources)
# precision_recall_fscore_support(dev_labels, preds)

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(tfidf_training_sources, train_labels)
preds = log_reg.predict(tfidf_dev_sources)
precision_recall_fscore_support(dev_labels, preds)

(array([0.81758242, 0.832     ]),
 array([0.94656489, 0.55614973]),
 array([0.87735849, 0.66666667]),
 array([393, 187]))

In [None]:
# train set
preds = log_reg.predict(tfidf_training_sources)
preds = [("rumour" if pred==1 else "non-rumour") for pred in preds]
preds_dict = dict(zip(train_label.keys(), preds))

with open("preds/lr_train.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


In [None]:
# dev set
preds = log_reg.predict(tfidf_dev_sources)
preds = [("rumour" if pred==1 else "non-rumour") for pred in preds]
preds_dict = dict(zip(dev_label.keys(), preds))

with open("preds/lr_dev.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


##### tfidf + svc

In [None]:
from sklearn.svm import SVC
# svc = SVC(C=1.5, tol=1.5, kernel="poly", degree=2)   # 0.85
svc = SVC(C=3, tol=1.6)                              # 0.847
svc.fit(tfidf_training_sources, train_labels)
preds = svc.predict(tfidf_dev_sources)
precision_recall_fscore_support(dev_labels, preds)

(array([0.87626263, 0.75      ]),
 array([0.88295165, 0.73796791]),
 array([0.87959442, 0.74393531]),
 array([393, 187]))

In [None]:
# on dev set
preds = svc.predict(tfidf_dev_sources)
preds = [("rumour" if pred==1 else "non-rumour") for pred in preds]
preds_dict = dict(zip(test_ids, preds))

with open("svc_dev.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


In [None]:
# on training set
preds = svc.predict(tfidf_training_sources)
preds = [("rumour" if pred==1 else "non-rumour") for pred in preds]
preds_dict = dict(zip(train_label.keys(), preds))

with open("preds/svc_train.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


In [None]:
# on test set
preds = svc.predict(tfidf_test_sources)
preds = [("rumour" if pred==1 else "non-rumour") for pred in preds]
preds_dict = dict(zip(test_ids, preds))

with open("svc_test.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


In [None]:
preds = svc.predict(tfidf_dev_sources)
preds = [(1 if pred==1 else 0) for pred in preds]
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(dev_labels, preds)

(array([0.92269327, 0.87150838]),
 array([0.94147583, 0.8342246 ]),
 array([0.93198992, 0.85245902]),
 array([393, 187]))

In [1]:
# grid search
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[60,55,50,10,5], 'tol':[1e-2,1e-1,1]}
svc = SVC()
clf = GridSearchCV(svc, param_grid=param_grid,  scoring="f1", n_jobs=-1)
clf.fit(tfidf_training_sources, train_labels)

###### cv_results

In [None]:
print(clf.best_params_)
pd.DataFrame(clf.cv_results_)

{'C': 50, 'tol': 1}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.310851,0.15828,0.930134,0.093922,50.0,0.001,"{'C': 50, 'tol': 0.001}",0.783471,0.792321,0.784854,0.783362,0.775221,0.783846,0.005433,7
1,3.741047,0.104816,0.95338,0.048291,50.0,0.1,"{'C': 50, 'tol': 0.1}",0.785479,0.794425,0.788296,0.785467,0.777385,0.78621,0.005492,4
2,2.000468,0.109522,0.565687,0.028922,50.0,1.0,"{'C': 50, 'tol': 1}",0.802528,0.794658,0.787479,0.806612,0.782462,0.794748,0.009001,1
3,4.084618,0.156478,0.946514,0.046767,10.0,0.001,"{'C': 10, 'tol': 0.001}",0.783471,0.792321,0.784854,0.783362,0.775221,0.783846,0.005433,7
4,3.707623,0.146584,0.903296,0.086834,10.0,0.1,"{'C': 10, 'tol': 0.1}",0.785479,0.794425,0.788296,0.785467,0.777385,0.78621,0.005492,4
5,2.092335,0.100873,0.52412,0.070022,10.0,1.0,"{'C': 10, 'tol': 1}",0.802528,0.794658,0.787479,0.806612,0.782462,0.794748,0.009001,1
6,4.184953,0.330972,0.956286,0.162735,5.0,0.001,"{'C': 5, 'tol': 0.001}",0.783471,0.792321,0.784854,0.783362,0.775221,0.783846,0.005433,7
7,3.298766,0.408519,0.84196,0.105925,5.0,0.1,"{'C': 5, 'tol': 0.1}",0.785479,0.794425,0.786207,0.785467,0.777385,0.785793,0.005396,6
8,1.981279,0.180946,0.527091,0.042,5.0,1.0,"{'C': 5, 'tol': 1}",0.802528,0.794658,0.787479,0.806612,0.782462,0.794748,0.009001,1
9,3.825034,0.213924,0.902639,0.081434,1.0,0.001,"{'C': 1, 'tol': 0.001}",0.769748,0.75485,0.748663,0.745098,0.753153,0.754302,0.008446,11


##### LSTM

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM
from keras import metrics
from keras.callbacks import EarlyStopping
import keras.backend as K

In [None]:
# tokenization to sequences (order is perserved)
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(training_sources0)

training_sources4 = tokenizer.texts_to_sequences(training_sources0)
dev_sources4 = tokenizer.texts_to_sequences(dev_sources0)
test_sources4 = tokenizer.texts_to_sequences(test_sources0)

In [None]:
# lengths = [len(s) for s in training_sources4]
# print(max(lengths))
# print(min(lengths))
# pd.Series(lengths).hist()

In [None]:
# padding => each sent has the same length
maxlen = 25
training_sources5 = pad_sequences(training_sources4, padding="post", maxlen=maxlen)
dev_sources5 = pad_sequences(dev_sources4, padding="post", maxlen=maxlen)
test_sources5 = pad_sequences(test_sources4, padding="post", maxlen=maxlen)

In [None]:
training_sources5.max()

11273

In [None]:
vocab_size = len(tokenizer.word_counts)+2
embedding_dim = 128

In [None]:
#taken from old keras source code
def get_f1(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
lstm_nn = Sequential(name="lstm")
lstm_nn.add(layers.Embedding(input_dim = vocab_size,
                            output_dim=embedding_dim,
                            input_length=maxlen))
lstm_nn.add(LSTM(128))
lstm_nn.add(layers.Dense(128, activation="relu", name="FC1"))
lstm_nn.add(layers.Dropout(0.5))
lstm_nn.add(layers.Dense(1, activation='sigmoid', name="FC2"))
lstm_nn.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=[metrics.AUC(), "acc"],)

In [None]:
lstm_nn.fit(training_sources5, 
            train_labels, 
            shuffle=True,
            epochs=10, 
            verbose=True, 
            validation_data=(dev_sources5, dev_labels),
            batch_size = 128,
            callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.01)] 
            )

Epoch 1/10
Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x7f3993046650>

In [None]:
preds = lstm_nn.predict(test_sources5)
preds = [("rumour" if pred>0.5 else "non-rumour") for pred in preds]
preds_dict = dict(zip(test_ids, preds))

with open("sample_data/lstm_test.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


In [None]:
lstm_nn.summary()

Model: "lstm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 25, 128)           1443072   
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               131584    
_________________________________________________________________
FC1 (Dense)                  (None, 128)               16512     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
FC2 (Dense)                  (None, 1)                 129       
Total params: 1,591,297
Trainable params: 1,591,297
Non-trainable params: 0
_________________________________________________________________


##### BERT

In [None]:
!pip install torch torchvision transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 18.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 42.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 48.5MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertModel
from transformers import BertTokenizer

In [None]:
# define the dataset class
class TwitrerDataset(Dataset):
  def __init__(self, X, y, maxlen):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.maxlen = maxlen
    self.X = X
    self.y = y 

  def __len__(self):
    return len(self.y)

  def __getitem__(self, index):
    # selecting the sentence and label at the specific index
    sent = self.X[index]
    label = self.y[index]

    # preprocessing the text to be suitable for BERT
    tokens = self.tokenizer.tokenize(sent)
    tokens = ['[CLS]'] + tokens + ['[SEP']       # insert CLS and SEP token
    if len(tokens) < self.maxlen:                # keep the same length of each sentence
      tokens = tokens + ['[PAD]' for _ in range(self.maxlen-len(tokens))]
    else:
      tokens = tokens[:self.maxlen-1] + ['SEP']

    tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) # obtaining the indices of tokens in vocab
    tokens_ids_tensor = torch.tensor(tokens_ids)   

    attn_mask = (tokens_ids_tensor != 0).long()        # attention mask (identity where is padded)
    
    return tokens_ids_tensor, attn_mask, label

In [None]:
# hyperparameters
batch_size = 32
num_worders = 2
lr = 2e-5
maxlen = 30

In [None]:
# creating instances of training and dev set
train_set = TwitrerDataset(training_sources0, train_labels, maxlen=maxlen)
dev_set = TwitrerDataset(dev_sources0, dev_labels, maxlen=maxlen)

# creating dataset loader
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_worders)
dev_loader = DataLoader(dev_set, batch_size=batch_size, num_workers=num_worders)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [None]:
class RumourClassifier(nn.Module):

  def __init__(self):
    super(RumourClassifier, self).__init__()
    self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
    self.cls_layer = nn.Linear(768, 1)

  def forward(self, seq, attn_masks):
    '''
    Inputs:
      -seq: Tensor of shape [B, T] containing token ids of sequences
      -attn_masks: Tensor of shape [B, T] containing attention masks to be used
    '''

    # feed the input to bert model to obtain contextualized representation
    outputs = self.bert_layer(seq, attention_mask=attn_masks)
    cont_reps = outputs.last_hidden_state

    # obtaining the representation of [CLS] head
    cls_rep = cont_reps[:, 0]

    # feeding cls_rep into the classifier layer
    logits = self.cls_layer(cls_rep)
    
    return logits

In [None]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

def get_precision_from_logits(logits, labels):
    probs = torch.sigmoid(logits)
    soft_probs = (probs > 0.5).long()
    # de = 0
    # nu = 0
    # for i in range(len(soft_probs)):
    #     if soft_probs[i] == 1:
    #         de += 1
    #         if labels[i] == 1:
    #           nu += 1
    # return nu / de
    labels_cpu = labels.cpu()
    soft_probs_cpu = soft_probs.cpu()
    return precision_recall_fscore_support(labels_cpu, soft_probs_cpu)[0][1]


def get_recall_from_logits(logits, labels):
    probs = torch.sigmoid(logits)
    soft_probs = (probs > 0.5).long()
    # de = 0
    # nu = 0
    # for i in range(len(labels)):
    #     if labels[i] == 1:
    #         de += 1
    #         if soft_probs[i] == 1:
    #           nu += 1
    # return nu / de
    labels_cpu = labels.cpu()
    soft_probs_cpu = soft_probs.cpu()
    return precision_recall_fscore_support(labels_cpu, soft_probs_cpu)[1][1]


def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss, mean_precision, mean_recall = 0, 0, 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            bs = labels.shape[0]
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()*bs
            mean_acc += get_accuracy_from_logits(logits, labels)*bs
            mean_precision += get_precision_from_logits(logits, labels)*bs
            mean_recall += get_recall_from_logits(logits, labels)*bs
            count += bs

    return mean_acc / count, mean_precision / count, mean_recall / count,mean_loss / count

In [None]:
def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                precision = get_precision_from_logits(logits, labels)
                recall = get_recall_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Precision: {}; Recall: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, precision, recall,(time.time()-st)))
                st = time.time()

        
        dev_acc, dev_precision, dev_recall,dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Precision: {}; Development Recall: {}; Development Loss: {}".format(ep, dev_acc, dev_precision, dev_recall, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
gpu = 0  # GPU id

bert_net = RumourClassifier()
bert_net.cuda(gpu) 
print("creating the rumour classifier: Done!") 

creating the rumour classifier: Done!


In [None]:
num_epoch = 1
criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(bert_net.parameters(), lr=lr)

# fine-tune the bert network
train(bert_net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.7238537073135376; Accuracy: 0.40625; Precision: 0.3333333333333333; Recall: 0.7272727272727273; Time taken (s): 0.4756500720977783
Iteration 100 of epoch 0 complete. Loss: 0.432309091091156; Accuracy: 0.875; Precision: 1.0; Recall: 0.6923076923076923; Time taken (s): 18.85442590713501
Epoch 0 complete! Development Accuracy: 0.8379310369491577; Development Precision: 0.6819441055012743; Development Recall: 0.9149200225062293; Development Loss: 0.35003519798147265
Best development accuracy improved from 0 to 0.8379310369491577, saving model...


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [2]:
def predict(sent, maxlen=maxlen):
  tokens = tokenizer.tokenize(sent)
  tokens = tokens = ['[CLS]'] + tokens + ['[SEP']
  if len(tokens) < maxlen:                # keep the same length of each sentence
    tokens = tokens + ['[PAD]' for _ in range(maxlen-len(tokens))]
  else:
    tokens = tokens[:maxlen-1] + ['SEP']
  tokens_ids = tokenizer.convert_tokens_to_ids(tokens) # obtaining the indices of tokens in vocab
  tokens_ids_tensor = torch.tensor(tokens_ids).unsqueeze(0).cuda(gpu)

  attn_mask = (tokens_ids_tensor != 0).long().cuda(gpu)        # attention mask (identity where is padded)
  with torch.no_grad():
    prediction = bert_net(tokens_ids_tensor, attn_mask)
  return prediction

In [None]:
predictions = [predict(sent) for sent in test_sources0]

In [None]:
preds1 = [("rumour" if pred > 0.0 else "non-rumour") for pred in predictions]

In [None]:
preds_dict = dict(zip(test_ids, preds1))

with open("sample_data/bert.v2_test_epoch10.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


In [None]:
preds_dev = [(1 if p >0.0 else 0) for p in [predict(sent) for sent in dev_sources0]]
precision_recall_fscore_support(preds_dev, dev_labels)

(array([0.80152672, 0.9144385 ]),
 array([0.95166163, 0.68674699]),
 array([0.87016575, 0.78440367]),
 array([331, 249]))

In [None]:
get_recall_from_logits(torch.tensor(preds_dev), torch.tensor(dev_labels))

0.8074866310160428

In [None]:
count = 0
mean_acc, mean_precision, mean_recall = 0,0,0
for seq, attn_masks, labels in dev_loader:
    bs = (labels.shape[0])
    seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
    logits = bert_net(seq, attn_masks)
    mean_acc += get_accuracy_from_logits(logits, labels) * bs
    mean_precision += get_precision_from_logits(logits, labels) * bs
    mean_recall += get_recall_from_logits(logits, labels) * bs
    count += bs
print(mean_precision / count)
print(mean_recall / count)

0.759852943990875
0.8160694477935857


In [None]:
preds_train = [(1 if p >0.0 else 0) for p in [predict(sent) for sent in training_sources0]]
precision_recall_fscore_support(preds_train, train_labels)

(array([0.99444081, 0.99747315]),
 array([0.99868637, 0.98934837]),
 array([0.99655907, 0.99339415]),
 array([3045, 1596]))

##### Bert.v2

In [None]:
!pip install torch torchvision transformers

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

In [None]:
def to_date(date_str):
    return datetime.strftime(datetime.strptime(date_str,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S')

training_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in training_data]
dev_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in dev_data]
test_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in test_data]

In [None]:
# remove url
def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

def extract_info(data, info="text"):
    res = []
    for i in range(len(data)):
        event = data[i]
        event_info = []
        for tw in event:
            event_info.append(remove_urls(tw[info]))
        res.append(event_info)
    return res

training_sents = extract_info(training_data)     # {event}  where event={source,apply1,apply2,...}
dev_sents = extract_info(dev_data)
test_sents = extract_info(test_data)

In [None]:
def combine_replies(replies):
    res = ""
    for r in replies:
        res += r
    return res

training_all0 = [[event[0], combine_replies(event[1:])] for event in training_sents]
dev_all0 = [[event[0], combine_replies(event[1:])] for event in dev_sents]
test_all0 = [[event[0], combine_replies(event[1:])] for event in test_sents]

In [None]:
# define the dataset class
class TwitrerDataset4(Dataset):
  def __init__(self, X, y, source_maxlen, reply_maxlen):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.source_maxlen = source_maxlen
    self.reply_maxlen = reply_maxlen
    self.X = X
    self.y = y 

  def __len__(self):
    return len(self.y)

  def __getitem__(self, index):
    # selecting the sentence and label at the specific index
    # sent = self.X[index]
    source, replies = self.X[index]
    label = self.y[index]

    # preprocessing the text to be suitable for BERT
    s_tokens = self.tokenizer.tokenize(source)
    s_tokens = ['[CLS]'] + s_tokens + ['[SEP']       # insert CLS and SEP token
    if len(s_tokens) < self.source_maxlen:                # keep the same length of each sentence
      s_tokens = s_tokens + ['[PAD]' for _ in range(self.source_maxlen-len(s_tokens))]
    else:
      s_tokens = s_tokens[:self.source_maxlen-1] + ['SEP']

    r_tokens = self.tokenizer.tokenize(replies)
    r_tokens = r_tokens + ['[SEP']
    if len(r_tokens) < self.reply_maxlen:                # keep the same length of each sentence
      r_tokens = r_tokens + ['[PAD]' for _ in range(self.reply_maxlen-len(r_tokens))]
    else:
      r_tokens = r_tokens[:self.reply_maxlen-1] + ['SEP']


    tokens_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + self.tokenizer.convert_tokens_to_ids(r_tokens) # obtaining the indices of tokens in vocab
    tokens_ids_tensor = torch.tensor(tokens_ids)   

    attn_mask = (tokens_ids_tensor != 0).long()        # attention mask (identity where is padded)

    token_type_ids = torch.tensor([0 for _ in range(source_maxlen)]+[1 for _ in range(reply_maxlen)])
    
    return tokens_ids_tensor, attn_mask, token_type_ids, label


In [None]:
# hyperparameters
batch_size = 128
num_worders = 2
lr = 2e-5
source_maxlen, reply_maxlen = 30, 30

# creating instances of training and dev set
train_set = TwitrerDataset4(training_all0, train_labels, source_maxlen=source_maxlen, reply_maxlen=reply_maxlen)
dev_set = TwitrerDataset4(dev_all0, dev_labels, source_maxlen=source_maxlen, reply_maxlen=reply_maxlen)

# creating dataset loader
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_worders)
dev_loader = DataLoader(dev_set, batch_size=batch_size, num_workers=num_worders)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [None]:
class RumourClassifier4(nn.Module):

  def __init__(self):
    super(RumourClassifier4, self).__init__()
    self.bert_layer = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    self.cls_layer = nn.Linear(768, 1)

  def forward(self, seq, attn_masks, token_type_ids):
    '''
    Inputs:
      -seq: Tensor of shape [B, T] containing token ids of sequences
      -attn_masks: Tensor of shape [B, T] containing attention masks to be used
    '''
    # feed the input to bert model to obtain contextualized representation
    outputs = self.bert_layer(seq, attention_mask=attn_masks, token_type_ids=token_type_ids)
    logits = outputs.logits
    return logits

In [None]:
def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, token_type_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), token_type_ids.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks, token_type_ids)
            #Computing loss
            loss = criterion(logits.view(-1,2), labels.long().view(-1))

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 10 == 0:
                print("Iteration {} of epoch {} complete. Loss: {};  Time taken (s): {}".format(it, ep, loss.item(), (time.time()-st)))
                st = time.time()

        
        dev_precision, dev_recall,dev_loss = evaluate(net, criterion, dev_loader, gpu)
        dev_f1 = (2*dev_precision*dev_recall/(dev_precision+dev_recall))
        print("*****Epoch {} complete! Development f1-score: {}; Development Precision: {}; Development Recall: {}; Development Loss: {}".format(ep, dev_f1, dev_precision, dev_recall, dev_loss))
        if dev_f1 > best_f1:
            print("Best development f1-score improved from {} to {}, saving model...".format(best_f1, dev_f1))
            best_f1 = dev_f1
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).long().mean()
    return acc



def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss, mean_precision, mean_recall = 0, 0, 0, 0
    count = 0
    tn, fp, fn, tp = 0, 0, 0, 0
    with torch.no_grad():
        for seq, attn_masks, token_type_ids, labels in dataloader:
            bs = labels.shape[0]
            seq, attn_masks, token_type_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), token_type_ids.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.long()).item()*bs
            # mean_acc += get_accuracy_from_logits(logits, labels)*bs

            probs = torch.sigmoid(logits)
            # soft_probs = (probs > 0.5).long()
            soft_probs = [(1 if a < b else 0) for a,b in probs]
            labels_cpu, soft_probs_cpu = labels.cpu(), soft_probs.cpu()
            tn_, fp_, fn_, tp_ = confusion_matrix(labels_cpu, soft_probs_cpu).ravel()
            tn += tn_
            fp += fp_
            fn += fn_
            tp += tp_
            count += bs
    return  tp / (tp+fp), tp / (tp+fn), mean_loss / count

In [None]:
gpu = 0  # GPU id

bert_net4 = RumourClassifier4()
bert_net4.cuda(gpu) 
print("creating the rumour classifier4: Done!") 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

creating the rumour classifier4: Done!


In [None]:
num_epoch = 1
criterion = nn.CrossEntropyLoss()
opti = optim.Adam(bert_net4.parameters(), lr=lr)

# fine-tune the bert network
train(bert_net4, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

RuntimeError: ignored

In [None]:
# def predict2(sent, maxlen=25):
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   tokens = tokenizer.tokenize(sent)
#   tokens = tokens = ['[CLS]'] + tokens + ['[SEP']
#   if len(tokens) < maxlen:                # keep the same length of each sentence
#     tokens = tokens + ['[PAD]' for _ in range(maxlen-len(tokens))]
#   else:
#     tokens = tokens[:maxlen-1] + ['SEP']
#   tokens_ids = tokenizer.convert_tokens_to_ids(tokens) # obtaining the indices of tokens in vocab
#   tokens_ids_tensor = torch.tensor(tokens_ids).unsqueeze(0).cuda(gpu)

#   attn_mask = (tokens_ids_tensor != 0).long().cuda(gpu)        # attention mask (identity where is padded)
#   with torch.no_grad():
#     prediction = bert_net2(tokens_ids_tensor, attn_mask)
#   return prediction.view(2)

In [None]:
def predict3(model, X, source_maxlen=source_maxlen, reply_maxlen=reply_maxlen):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  source, replies = X
  s_tokens = tokenizer.tokenize(source)
  s_tokens = ['[CLS]'] + s_tokens + ['[SEP']       # insert CLS and SEP token
  if len(s_tokens) < source_maxlen:                # keep the same length of each sentence
    s_tokens = s_tokens + ['[PAD]' for _ in range(source_maxlen-len(s_tokens))]
  else:
    s_tokens = s_tokens[:source_maxlen-1] + ['SEP']

  r_tokens = tokenizer.tokenize(replies)
  r_tokens = ['[CLS]'] + r_tokens + ['[SEP']
  if len(r_tokens) < reply_maxlen:                # keep the same length of each sentence
    r_tokens = r_tokens + ['[PAD]' for _ in range(reply_maxlen-len(r_tokens))]
  else:
    r_tokens = r_tokens[:reply_maxlen-1] + ['SEP']

  tokens_ids = tokenizer.convert_tokens_to_ids(s_tokens) + tokenizer.convert_tokens_to_ids(r_tokens) # obtaining the indices of tokens in vocab
  tokens_ids_tensor = torch.tensor(tokens_ids).unsqueeze(0)  

  attn_mask = (tokens_ids_tensor != 0).long()        # attention mask (identity where is padded)
  
  token_type_ids = torch.tensor([0 for _ in range(source_maxlen)]+[1 for _ in range(reply_maxlen)])
  with torch.no_grad():
    tokens_ids_tensor, attn_mask, token_type_ids = tokens_ids_tensor.cuda(gpu), attn_mask.cuda(gpu), token_type_ids.cuda(gpu)
    prediction = model(tokens_ids_tensor, attn_mask, token_type_ids)
  return prediction.view(2)

In [None]:
# preds_dev = [(1 if p >0.0 else 0) for p in [predict2(sent) for sent in dev_sources0]]

print([predict2(dev_sources0[i]) for i in range(10)])


[(1 if b>a else 0) for a,b in [predict2(dev_sources0[i]) for i in range(10)]]

[1, 0, 0, 1, 0, 0, 0, 0, 1, 0]

In [None]:
# dev_preds = []

# for i in range(len(dev_sources0)):
#     print(i)
#     p0, p1 = predict2(dev_sources0[i])
#     if p1 > p0:
#         dev_preds.append(1)
#     else:
#         dev_preds.append(0)

In [None]:
# test_preds = []

# for i in range(len(dev_sources0)):
#     print(i)
#     p0, p1 = predict2(test_sources0[i])
#     if p1 > p0:
#         test_preds.append(1)
#     else:
#         test_preds.append(0)

In [None]:
preds_dict = dict(zip(test_ids, test_preds))

with open("sample_data/bert.v2_test.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

storing file finish


#### source + reply (bert)

In [None]:
path = "drive/MyDrive/Colab/"
training_data = [json.loads(event) for event in open(path+'project-data/train.data.jsonl', "r").readlines()]
dev_data = [json.loads(event) for event in open(path+'project-data/dev.data.jsonl', "r").readlines()]
test_data = [json.loads(event) for event in open(path+'project-data/test.data.jsonl', "r").readlines()]

train_labels = json.load(open(path+'project-data/train.label.json', "r"))
dev_labels = json.load(open(path+'project-data/dev.label.json', "r"))

train_labels = np.array([(1 if train_labels[id_str]=='rumour' else 0) for id_str in train_labels])
dev_labels = np.array([(1 if dev_labels[id_str]=='rumour' else 0) for id_str in dev_labels])

test_ids = [test_data[i][0]["id_str"] for i in range(len(test_data))]

In [None]:
# sort by date for each event
def to_date(date_str):
    return datetime.strftime(datetime.strptime(date_str,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S')

training_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in training_data]
dev_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in dev_data]
test_data_sort = [sorted(event, key=lambda x : to_date(x["created_at"])) for event in test_data]

In [None]:
# sort by user's followers_count
training_data_sort1 = [training_data[0]] + [sorted(event, key=lambda x : x['user']['followers_count'], reverse=True) for event in training_data[1:]]
dev_data_sort1 = [dev_data[0]] + [sorted(event, key=lambda x : x['user']['followers_count'], reverse=True) for event in dev_data[1:]]
test_data_sort1 = [test_data[0]] + [sorted(event, key=lambda x : x['user']['followers_count'], reverse=True) for event in test_data[1:]]

In [None]:
# remove url
def remove_urls(vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

def remove_ats(vTEXT):
    vTEXT = re.sub(r'@[^\s]* ', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

def extract_info(data, info="text"):
    res = []
    for i in range(len(data)):
        event = data[i]
        event_info = []
        for tw in event:
            event_info.append(remove_ats(tw[info]))
            # event_info.append(remove_ats(remove_urls(tw[info])))
        res.append(event_info)
    return res

training_sents = extract_info(training_data_sort)     # {event}  where event={source,apply1,apply2,...}
dev_sents = extract_info(dev_data_sort)
test_sents = extract_info(test_data_sort)

In [None]:
def combine_replies(replies):
    res = ""
    for r in replies:
        res += r
    return res

training_all0 = [[event[0], combine_replies(event[1:])] for event in training_sents]
dev_all0 = [[event[0], combine_replies(event[1:])] for event in dev_sents]
test_all0 = [[event[0], combine_replies(event[1:])] for event in test_sents]

In [None]:
!pip install torch torchvision transformers

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

In [None]:
# define the dataset class
class TwitrerDataset3(Dataset):
  def __init__(self, X, y, source_maxlen, reply_maxlen):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.source_maxlen = source_maxlen
    self.reply_maxlen = reply_maxlen
    self.X = X
    self.y = y 

  def __len__(self):
    return len(self.y)

  def __getitem__(self, index):
    # selecting the sentence and label at the specific index
    # sent = self.X[index]
    source, replies = self.X[index]
    label = self.y[index]

    # preprocessing the text to be suitable for BERT
    s_tokens = self.tokenizer.tokenize(source)
    s_tokens = ['[CLS]'] + s_tokens + ['[SEP']       # insert CLS and SEP token
    if len(s_tokens) < self.source_maxlen:                # keep the same length of each sentence
      s_tokens = s_tokens + ['[PAD]' for _ in range(self.source_maxlen-len(s_tokens))]
    else:
      s_tokens = s_tokens[:self.source_maxlen-1] + ['SEP']

    r_tokens = self.tokenizer.tokenize(replies)
    r_tokens = r_tokens + ['[SEP']
    if len(r_tokens) < self.reply_maxlen:                # keep the same length of each sentence
      r_tokens = r_tokens + ['[PAD]' for _ in range(self.reply_maxlen-len(r_tokens))]
    else:
      r_tokens = r_tokens[:self.reply_maxlen-1] + ['SEP']


    tokens_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + self.tokenizer.convert_tokens_to_ids(r_tokens) # obtaining the indices of tokens in vocab
    tokens_ids_tensor = torch.tensor(tokens_ids)   

    attn_mask = (tokens_ids_tensor != 0).long()        # attention mask (identity where is padded)

    token_type_ids = torch.tensor([0 for _ in range(source_maxlen)]+[1 for _ in range(reply_maxlen)])
    
    return tokens_ids_tensor, attn_mask, token_type_ids, label

In [None]:
# hyperparameters
batch_size = 64
num_worders = 2
lr = 2e-5
source_maxlen, reply_maxlen = 30, 30

In [None]:
# creating instances of training and dev set
train_set = TwitrerDataset3(training_all0, train_labels, source_maxlen=source_maxlen, reply_maxlen=reply_maxlen)
dev_set = TwitrerDataset3(dev_all0, dev_labels, source_maxlen=source_maxlen, reply_maxlen=reply_maxlen)

# creating dataset loader
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_worders)
dev_loader = DataLoader(dev_set, batch_size=batch_size, num_workers=num_worders)

print("Done preprocessing training and development data.")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Done preprocessing training and development data.


In [None]:
class RumourClassifier3(nn.Module):

  def __init__(self):
    super(RumourClassifier3, self).__init__()
    self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
    self.cls_layer = nn.Linear(768, 1)
    self.dropout = nn.Dropout(0.9)

  def forward(self, seq, attn_masks, token_type_ids):
    '''
    Inputs:
      -seq: Tensor of shape [B, T] containing token ids of sequences
      -attn_masks: Tensor of shape [B, T] containing attention masks to be used
    '''
    # feed the input to bert model to obtain contextualized representation
    outputs = self.bert_layer(seq, attention_mask=attn_masks, token_type_ids=token_type_ids)
    cont_reps = outputs.last_hidden_state

    # obtaining the representation of [CLS] head
    cls_rep = cont_reps[:, 0]

    # pooled_output = self.dropout(cls_rep)

    # feeding cls_rep into the classifier layer
    logits = self.cls_layer(cls_rep)
    
    return logits

In [None]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

# def get_precision_from_logits(logits, labels):
#     probs = torch.sigmoid(logits)
#     soft_probs = (probs > 0.5).long()
#     labels_cpu = labels.cpu()
#     soft_probs_cpu = soft_probs.cpu()
#     return precision_recall_fscore_support(labels_cpu, soft_probs_cpu)[0][1]


# def get_recall_from_logits(logits, labels):
#     probs = torch.sigmoid(logits)
#     soft_probs = (probs > 0.5).long()
#     labels_cpu = labels.cpu()
#     soft_probs_cpu = soft_probs.cpu()
#     return precision_recall_fscore_support(labels_cpu, soft_probs_cpu)[1][1]


def evaluate(net, criterion, dataloader, gpu):
    net.eval()
    mean_acc, mean_loss, mean_precision, mean_recall = 0, 0, 0, 0
    count = 0
    tn, fp, fn, tp = 0, 0, 0, 0
    with torch.no_grad():
        for seq, attn_masks, token_type_ids, labels in dataloader:
            bs = labels.shape[0]
            seq, attn_masks, token_type_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), token_type_ids.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()*bs
            mean_acc += get_accuracy_from_logits(logits, labels)*bs

            probs = torch.sigmoid(logits)
            soft_probs = (probs > 0.5).long()
            labels_cpu, soft_probs_cpu = labels.cpu(), soft_probs.cpu()
            tn_, fp_, fn_, tp_ = confusion_matrix(labels_cpu, soft_probs_cpu).ravel()
            tn += tn_
            fp += fp_
            fn += fn_
            tp += tp_
            count += bs

    return mean_acc / count, tp / (tp+fp), tp / (tp+fn), mean_loss / count

In [None]:
def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):

    best_f1 = 0
    st = time.time()
    for ep in range(max_eps):
        print()
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, token_type_ids, labels = seq.cuda(gpu), attn_masks.cuda(gpu), token_type_ids.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks, token_type_ids)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 100 == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; acc: {}; Time taken (s): {}".format(it, ep, loss.item(), acc,(time.time()-st)))
                st = time.time()
        
        dev_acc, dev_precision, dev_recall,dev_loss = evaluate(net, criterion, dev_loader, gpu)
        dev_f1 = (2*dev_precision*dev_recall/(dev_precision+dev_recall))
        print("*****Epoch {} complete! Development f1-score: {}; Development Precision: {}; Development Recall: {}; Development Loss: {}".format(ep, dev_f1, dev_precision, dev_recall, dev_loss))
        if dev_f1 > best_f1:
            print("Best development f1-score improved from {} to {}, saving model...".format(best_f1, dev_f1))
            best_f1 = dev_f1
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
gpu = 0  # GPU id

bert_net3 = RumourClassifier3()
bert_net3.cuda(gpu) 
print("creating the rumour classifier: Done!") 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


creating the rumour classifier: Done!


In [None]:
# sort by date, reply_max = 30, pool, just remove ats
num_epoch = 8
criterion = nn.BCEWithLogitsLoss()
# opti = optim.Adam(bert_net3.parameters(), lr=lr)
opti = AdamW(bert_net3.parameters(), lr=lr, weight_decay=0.01)

# fine-tune the bert network
train(bert_net3, criterion, opti, train_loader, dev_loader, num_epoch, gpu)


Iteration 0 of epoch 0 complete. Loss: 0.6819823980331421; acc: 0.59375; Time taken (s): 1.6413025856018066
*****Epoch 0 complete! Development f1-score: 0.7806122448979593; Development Precision: 0.7463414634146341; Development Recall: 0.8181818181818182; Development Loss: 0.34832867055103695
Best development f1-score improved from 0 to 0.7806122448979593, saving model...

Iteration 0 of epoch 1 complete. Loss: 0.3537675738334656; acc: 0.8125; Time taken (s): 48.426135778427124
*****Epoch 1 complete! Development f1-score: 0.7701149425287356; Development Precision: 0.8322981366459627; Development Recall: 0.7165775401069518; Development Loss: 0.32124878332532686

Iteration 0 of epoch 2 complete. Loss: 0.22718462347984314; acc: 0.890625; Time taken (s): 49.46395969390869
*****Epoch 2 complete! Development f1-score: 0.8048192771084337; Development Precision: 0.7324561403508771; Development Recall: 0.893048128342246; Development Loss: 0.3357803616030463
Best development f1-score improved f

In [None]:
# sort by date, reply_max = 50
num_epoch = 10
criterion = nn.BCEWithLogitsLoss()
# opti = optim.Adam(bert_net3.parameters(), lr=lr)
opti = AdamW(bert_net3.parameters(), lr=lr, weight_decay=0.01)

# fine-tune the bert network
train(bert_net3, criterion, opti, train_loader, dev_loader, num_epoch, gpu)


Iteration 0 of epoch 0 complete. Loss: 0.7804609537124634; acc: 0.375; Time taken (s): 1.801375389099121
*****Epoch 0 complete! Development f1-score: 0.7788944723618091; Development Precision: 0.7345971563981043; Development Recall: 0.8288770053475936; Development Loss: 0.34324746707390097
Best development f1-score improved from 0 to 0.7788944723618091, saving model...

Iteration 0 of epoch 1 complete. Loss: 0.36333340406417847; acc: 0.8125; Time taken (s): 74.36376881599426
*****Epoch 1 complete! Development f1-score: 0.7851002865329513; Development Precision: 0.845679012345679; Development Recall: 0.732620320855615; Development Loss: 0.3160153450637028
Best development f1-score improved from 0.7788944723618091 to 0.7851002865329513, saving model...

Iteration 0 of epoch 2 complete. Loss: 0.22815145552158356; acc: 0.875; Time taken (s): 74.3490617275238
*****Epoch 2 complete! Development f1-score: 0.8040712468193385; Development Precision: 0.7669902912621359; Development Recall: 0.84

In [None]:
# sort by date, reply_max = 100
num_epoch = 10
criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(bert_net3.parameters(), lr=lr)

# fine-tune the bert network
train(bert_net3, criterion, opti, train_loader, dev_loader, num_epoch, gpu)


Iteration 0 of epoch 0 complete. Loss: 0.6802589893341064; acc: 0.640625; Time taken (s): 2.4174270629882812
*****Epoch 0 complete! Development f1-score: 0.772020725388601; Development Precision: 0.7487437185929648; Development Recall: 0.7967914438502673; Development Loss: 0.3357150692364265
Best development f1-score improved from 0 to 0.772020725388601, saving model...

Iteration 0 of epoch 1 complete. Loss: 0.3574814796447754; acc: 0.828125; Time taken (s): 120.77228260040283
*****Epoch 1 complete! Development f1-score: 0.7684964200477327; Development Precision: 0.6939655172413793; Development Recall: 0.8609625668449198; Development Loss: 0.3547134128110162

Iteration 0 of epoch 2 complete. Loss: 0.18147578835487366; acc: 0.921875; Time taken (s): 118.39115238189697
*****Epoch 2 complete! Development f1-score: 0.7133757961783439; Development Precision: 0.8818897637795275; Development Recall: 0.5989304812834224; Development Loss: 0.4090502911600573

Iteration 0 of epoch 3 complete. L

In [None]:
# sort by followers
num_epoch = 10
criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(bert_net3.parameters(), lr=lr)

# fine-tune the bert network
train(bert_net3, criterion, opti, train_loader, dev_loader, num_epoch, gpu)


Iteration 0 of epoch 0 complete. Loss: 0.6820484399795532; f1: 0.625; Precision: 0.5; Recall: 0.2916666666666667; Time taken (s): 1.787294626235962
*****Epoch 0 complete! Development f1-score: 0.7298850574712643; Development Precision: 0.7888198757763976; Development Recall: 0.679144385026738; Development Loss: 0.3668898031629365
Best development f1-score improved from 0 to 0.7298850574712643, saving model...

Iteration 0 of epoch 1 complete. Loss: 0.3754526972770691; f1: 0.828125; Precision: 0.782608695652174; Recall: 0.75; Time taken (s): 74.44451594352722
*****Epoch 1 complete! Development f1-score: 0.7783505154639175; Development Precision: 0.7512437810945274; Development Recall: 0.8074866310160428; Development Loss: 0.34727517983009076
Best development f1-score improved from 0.7298850574712643 to 0.7783505154639175, saving model...

Iteration 0 of epoch 2 complete. Loss: 0.2469254434108734; f1: 0.875; Precision: 0.7857142857142857; Recall: 0.9166666666666666; Time taken (s): 74.5

In [None]:
def predict3(model, X, source_maxlen=source_maxlen, reply_maxlen=reply_maxlen):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  source, replies = X
  s_tokens = tokenizer.tokenize(source)
  s_tokens = ['[CLS]'] + s_tokens + ['[SEP']       # insert CLS and SEP token
  if len(s_tokens) < source_maxlen:                # keep the same length of each sentence
    s_tokens = s_tokens + ['[PAD]' for _ in range(source_maxlen-len(s_tokens))]
  else:
    s_tokens = s_tokens[:source_maxlen-1] + ['SEP']

  r_tokens = tokenizer.tokenize(replies)
  r_tokens = ['[CLS]'] + r_tokens + ['[SEP']
  if len(r_tokens) < reply_maxlen:                # keep the same length of each sentence
    r_tokens = r_tokens + ['[PAD]' for _ in range(reply_maxlen-len(r_tokens))]
  else:
    r_tokens = r_tokens[:reply_maxlen-1] + ['SEP']

  tokens_ids = tokenizer.convert_tokens_to_ids(s_tokens) + tokenizer.convert_tokens_to_ids(r_tokens) # obtaining the indices of tokens in vocab
  tokens_ids_tensor = torch.tensor(tokens_ids).unsqueeze(0)  

  attn_mask = (tokens_ids_tensor != 0).long()        # attention mask (identity where is padded)
  
  token_type_ids = torch.tensor([0 for _ in range(source_maxlen)]+[1 for _ in range(reply_maxlen)])
  with torch.no_grad():
    tokens_ids_tensor, attn_mask, token_type_ids = tokens_ids_tensor.cuda(gpu), attn_mask.cuda(gpu), token_type_ids.cuda(gpu)
    prediction = model(tokens_ids_tensor, attn_mask, token_type_ids)
  return prediction

In [None]:
# # preds_dev = [(1 if p >0.0 else 0) for p in [predict3(bert_net3, source_reply) for source_reply in dev_all0]]
# preds_dev = []
# for i in range(len(dev_all0)):
#   if i%100==0:
#     print(i)
#   pred = predict3(bert_net3, dev_all0[i])
#   pred_label = 1 if pred>0 else 0
#   preds_dev.append(pred_label)

# precision_recall_fscore_support(preds_dev, dev_labels)

In [None]:
evaluate(bert_net3, criterion, dev_loader, gpu)

(tensor(0.8862, device='cuda:0'),
 0.8457142857142858,
 0.7914438502673797,
 0.5531535880319003)

In [None]:
bert_net3.load_state_dict(torch.load("sstcls_4.dat"))
evaluate(bert_net3, criterion, dev_loader, gpu)

(tensor(0.8897, device='cuda:0'),
 0.8219895287958116,
 0.839572192513369,
 0.42821776949126145)

In [None]:
preds_dev = [(1 if p >0.0 else 0) for p in [predict3(bert_net3, source_reply) for source_reply in dev_all0]]
precision_recall_fscore_support(preds_dev, dev_labels)

(array([0.92875318, 0.81818182]),
 array([0.91478697, 0.84530387]),
 array([0.92171717, 0.83152174]),
 array([399, 181]))

In [None]:
evaluate(bert_net3, criterion, dev_loader, gpu)

(tensor(0.8897, device='cuda:0'),
 0.8219895287958116,
 0.839572192513369,
 0.42821776949126145)

In [None]:
def f1(p, r):
  return 2*p*r/(p+r)

print(f1(0.8219895287958116,0.839572192513369,))

# date, 30 30, only remove ats: 0.83069
# date 50: 0.81517
# date 100: 0. 8179
# date 50 adam: 0.8189415

0.8306878306878307


In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

3516

In [None]:
predictions = [predict3(bert_net3, sent) for sent in test_all0]
preds1 = [("rumour" if pred > 0.0 else "non-rumour") for pred in predictions]
preds_dict = dict(zip(test_ids, preds1))

with open("sample_data/bert.v1_test_adamw_30_sourcereply.predict.json","w") as f:
    json.dump(preds_dict,f)
    print("storing file finish")

In [None]:
preds1 = [(1 if pred > 0.0 else 0) for pred in predictions]
precision_recall_fscore_support(dev_labels, preds1)

(array([0.91478697, 0.84530387]),
 array([0.92875318, 0.81818182]),
 array([0.92171717, 0.83152174]),
 array([393, 187]))