In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import preprocessor as p 
import re
import json
import wordninja
import random
import csv
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoModel, BertForMaskedLM, AdamW
from transformers import BertTokenizer, BertModel, AutoTokenizer, BertweetTokenizer



In [2]:
# Data Loading

def load_data(filename):

    filename = [filename]
    concat_text = pd.DataFrame()
    raw_text = pd.read_csv(filename[0],usecols=[0], encoding='ISO-8859-1')
    raw_label = pd.read_csv(filename[0],usecols=[2], encoding='ISO-8859-1')
    raw_target = pd.read_csv(filename[0],usecols=[1], encoding='ISO-8859-1')
    label = pd.DataFrame.replace(raw_label,['FAVOR','NONE','AGAINST'], [1,2,0])
    concat_text = pd.concat([raw_text, label, raw_target], axis=1)
    concat_text = concat_text[concat_text.Stance != 2]
    
    return(concat_text)

In [3]:
# Data Cleaning

def data_clean(strings, norm_dict):
    
    p.set_options(p.OPT.URL,p.OPT.EMOJI,p.OPT.RESERVED)
    clean_data = p.clean(strings)  # using lib to clean URL, emoji...
    clean_data = re.sub(r"#SemST", "", clean_data)
    clean_data = re.findall(r"[A-Za-z#@]+|[,.!?&/\<>=$]|[0-9]+",clean_data)
    clean_data = [[x.lower()] for x in clean_data]
    
    for i in range(len(clean_data)):
        if clean_data[i][0] in norm_dict.keys():
            clean_data[i][0] = norm_dict[clean_data[i][0]]
            continue
        if clean_data[i][0].startswith("#") or clean_data[i][0].startswith("@"):
            clean_data[i] = wordninja.split(clean_data[i][0]) # split compound hashtags
    clean_data = [j for i in clean_data for j in i]

    return clean_data


In [4]:
# Clean All Data

def clean_all(filename, norm_dict):
    
    concat_text = load_data(filename)
    raw_data = concat_text['Tweet'].values.tolist() 
    label = concat_text['Stance'].values.tolist()
    x_target = concat_text['Target'].values.tolist()
    clean_data = [None for _ in range(len(raw_data))]
    
    for i in range(len(raw_data)):
        clean_data[i] = data_clean(raw_data[i], norm_dict)
        x_target[i] = data_clean(x_target[i], norm_dict)
    
    return clean_data,label,x_target

In [19]:
# Tokenization

def convert_data_to_ids(tokenizer, target, text, window = 512, stride = 256):
    
    input_ids, seg_ids, attention_masks, sent_len, sample_map = [], [], [], [], []
    # max_len = max(
    # len(tokenizer.encode(' '.join(tar), ' '.join(sent), add_special_tokens=True))
    # for tar, sent in zip(target, text)
    # )
    for doc_idx, (tar, sent) in enumerate(zip(target, text)):
        enc = tokenizer.encode_plus(
                            ' '.join(tar),                  # Target to encode
                            ' '.join(sent),                 # Sentence to encode
                            add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                            max_length = window,               # Pad & truncate all sentences
                            padding = 'max_length',
                            truncation = 'only_second', 
                            stride = stride, 
                            return_overflowing_tokens=True,
                            return_attention_mask = True,   # Construct attention masks
                       )

        # Add the encoded sentence to the list.    
        n_chunks = len(enc["input_ids"])
        input_ids.append(enc['input_ids'])
        seg_ids.append(enc['token_type_ids'])
        attention_masks.append(enc['attention_mask'])
        sent_len.extend([sum(enc['attention_mask'])])
        sample_map.extend([doc_idx] * n_chunks)
    
    return input_ids, seg_ids, attention_masks, sent_len, sample_map
    
def data_helper_bert(x_train_all,x_val_all,x_test_all,model_select):
    
    print('Loading data')
    
    x_train,y_train,x_train_target = x_train_all[0],x_train_all[1],x_train_all[2]                                                
    x_val,y_val,x_val_target = x_val_all[0],x_val_all[1],x_val_all[2]
    x_test,y_test,x_test_target = x_test_all[0],x_test_all[1],x_test_all[2]
                                                         
    print("Length of x_train: %d, the sum is: %d"%(len(x_train), sum(y_train)))
    print("Length of x_val: %d, the sum is: %d"%(len(x_val), sum(y_val)))
    print("Length of x_test: %d, the sum is: %d"%(len(x_test), sum(y_test)))
    
    # get the tokenizer
    if model_select == 'Bertweet':
        tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
    elif model_select == 'Bert':
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
        
    # tokenization
    x_train_input_ids, x_train_seg_ids, x_train_atten_masks, x_train_len = \
                    convert_data_to_ids(tokenizer, x_train_target, x_train)
    x_val_input_ids, x_val_seg_ids, x_val_atten_masks, x_val_len = \
                    convert_data_to_ids(tokenizer, x_val_target, x_val)
    x_test_input_ids, x_test_seg_ids, x_test_atten_masks, x_test_len = \
                    convert_data_to_ids(tokenizer, x_test_target, x_test)
#     print(x_test_input_ids[0])
    x_train_all = [x_train_input_ids,x_train_seg_ids,x_train_atten_masks,y_train,x_train_len]
    x_val_all = [x_val_input_ids,x_val_seg_ids,x_val_atten_masks,y_val,x_val_len]
    x_test_all = [x_test_input_ids,x_test_seg_ids,x_test_atten_masks,y_test,x_test_len]
    
    return x_train_all,x_val_all,x_test_all

In [3]:
# BERT/BERTweet

class stance_classifier(nn.Module):

    def __init__(self,num_labels,model_select):

        super(stance_classifier, self).__init__()
        
        self.dropout = nn.Dropout(0.)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
        if model_select == 'Bertweet':
            self.bert = AutoModel.from_pretrained("vinai/bertweet-base")
        elif model_select == 'Bert':
            self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.out = nn.Linear(self.bert.config.hidden_size, num_labels)
        
    def forward(self, x_input_ids, x_seg_ids, x_atten_masks, x_len):
        
        last_hidden = self.bert(input_ids=x_input_ids, \
                                attention_mask=x_atten_masks, token_type_ids=x_seg_ids, \
                               )
        
        query = last_hidden[0][:,0]
        query = self.dropout(query)
        
        linear = self.relu(self.linear(query))
        out = self.out(linear)
        
        return out

In [7]:
# Evaluation

def compute_f1(preds, y):
    
    rounded_preds = F.softmax(preds)
    _, indices = torch.max(rounded_preds, 1)
                
    correct = (indices == y).float()
    acc = correct.sum()/len(correct) # compute accuracy
    
    y_pred = np.array(indices.cpu().numpy())
    y_true = np.array(y.cpu().numpy())
    result = precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0,1])
#     print(result[2][0],result[2][1])
    f1_average = (result[2][0]+result[2][1])/2 # average F1 score of Favor and Against
        
    return acc, f1_average, result[0], result[1]

In [4]:
# Main 

def data_loader(x_all, batch_size, data_type):
    
    x_input_ids = torch.tensor(x_all[0], dtype=torch.long).cuda()
    x_seg_ids = torch.tensor(x_all[1], dtype=torch.long).cuda()
    x_atten_masks = torch.tensor(x_all[2], dtype=torch.long).cuda()
    # y = torch.tensor(x_all[3], dtype=torch.long).cuda()
    x_len = torch.tensor(x_all[3], dtype=torch.long).cuda()

    tensor_loader = TensorDataset(x_input_ids,x_seg_ids,x_atten_masks,x_len)
    if data_type == 'train':
        data_loader = DataLoader(tensor_loader, shuffle=True, batch_size=batch_size)
    else:
        data_loader = DataLoader(tensor_loader, shuffle=False, batch_size=batch_size)

    return x_input_ids, x_seg_ids, x_atten_masks, x_len, data_loader
    
def sep_test_set(input_data):
    
    # split the combined test set for Trump, Biden and Bernie
    data_list = [input_data[:777], input_data[777:1522], input_data[1522:2157]]
    
    return data_list

def run_classifier(input_word_pair,model_select,train_mode):
    
    random_seeds = [0,1,14,15,16,17,19]
    target_word_pair = input_word_pair
    
    #Creating Normalization Dictionary
    with open("../source/noslang_data.json", "r") as f:
        data1 = json.load(f)
    data2 = {}
    with open("../source/emnlp_dict.txt","r") as f:
        lines = f.readlines()
        for line in lines:
            row = line.split('\t')
            data2[row[0]] = row[1].rstrip()
    normalization_dict = {**data1,**data2}

    for target_index in range(len(target_word_pair)):
        best_result, best_val = [], []
        for seed in random_seeds:    
            print("current random seed: ", seed)

            if train_mode == "unified":
                filename1 = '../Dataset/raw_train_all.csv'
                filename2 = '../Dataset/raw_val_all.csv'
                filename3 = '../Dataset/raw_test_all.csv'
            elif train_mode == "adhoc":
                filename1 = '../Dataset/raw_train_'+target_word_pair[target_index]+'.csv'
                filename2 = '../Dataset/raw_val_'+target_word_pair[target_index]+'.csv'
                filename3 = '../Dataset/raw_test_'+target_word_pair[target_index]+'.csv'
            x_train,y_train,x_train_target = clean_all(filename1, normalization_dict)
            x_val,y_val,x_val_target = clean_all(filename2, normalization_dict)
            x_test,y_test,x_test_target = clean_all(filename3, normalization_dict)
                
            num_labels = len(set(y_train))
#             print(x_train_target[0])
            x_train_all = [x_train,y_train,x_train_target]
            x_val_all = [x_val,y_val,x_val_target]
            x_test_all = [x_test,y_test,x_test_target]
            
            # set up the random seed
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed) 

            # prepare for model
            x_train_all,x_val_all,x_test_all = data_helper_bert(x_train_all,x_val_all,x_test_all,model_select)
#             print(x_test_all[0][0])
            x_train_input_ids, x_train_seg_ids, x_train_atten_masks, y_train, x_train_len, trainloader = \
                                        data_loader(x_train_all, batch_size, 'train')
            x_val_input_ids, x_val_seg_ids, x_val_atten_masks, y_val, x_val_len, valloader = \
                                        data_loader(x_val_all, batch_size, 'val')                            
            x_test_input_ids, x_test_seg_ids, x_test_atten_masks, y_test, x_test_len, testloader = \
                                        data_loader(x_test_all, batch_size, 'test')

            model = stance_classifier(num_labels,model_select).cuda()

            for n,p in model.named_parameters():
                if "bert.embeddings" in n:
                    p.requires_grad = False
            optimizer_grouped_parameters = [
                {'params': [p for n, p in model.named_parameters() if n.startswith('bert.encoder')] , 'lr': lr},
                {'params': [p for n, p in model.named_parameters() if n.startswith('bert.pooler')] , 'lr': 1e-3},
                {'params': [p for n, p in model.named_parameters() if n.startswith('linear')], 'lr': 1e-3},
                {'params': [p for n, p in model.named_parameters() if n.startswith('out')], 'lr': 1e-3}
                ]
            
            loss_function = nn.CrossEntropyLoss(reduction='sum')
            optimizer = AdamW(optimizer_grouped_parameters)
            
            sum_loss = []
            sum_val = []
            train_f1_average = []
            val_f1_average = []
            if train_mode == "unified":
                test_f1_average = [[] for i in range(3)]
            elif train_mode == "adhoc":
                test_f1_average = [[]]

            for epoch in range(0, total_epoch):
                print('Epoch:', epoch)
                train_loss, valid_loss = [], []
                model.train()
                for input_ids,seg_ids,atten_masks,target,length in trainloader:
                    optimizer.zero_grad()
                    output1 = model(input_ids, seg_ids, atten_masks, length)
                    loss = loss_function(output1, target)
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), 1)
                    optimizer.step()
                    train_loss.append(loss.item())
                sum_loss.append(sum(train_loss)/len(x_train))  
                print(sum_loss[epoch])

                # evaluation on dev set
                model.eval()
                val_preds = []
                with torch.no_grad():
                    for input_ids,seg_ids,atten_masks,target,length in valloader: 
                        pred1 = model(input_ids, seg_ids, atten_masks, length) 
                        val_preds.append(pred1)
                pred1 = torch.cat(val_preds, 0)
                acc, f1_average, precision, recall = compute_f1(pred1,y_val)
                val_f1_average.append(f1_average)
                
                # evaluation on test set
                with torch.no_grad():
                    test_preds = []
                    for input_ids,seg_ids,atten_masks,target,length in testloader:
                        pred1 = model(input_ids, seg_ids, atten_masks, length)
                        test_preds.append(pred1)
                    pred1 = torch.cat(test_preds, 0)
                    if train_mode == "unified":
                        pred1_list = sep_test_set(pred1)
                        y_test_list = sep_test_set(y_test)
                    else:
                        pred1_list = [pred1]
                        y_test_list = [y_test]
                        
                    for ind in range(len(y_test_list)):
                        pred1 = pred1_list[ind]
                        acc, f1_average, precision, recall = compute_f1(pred1,y_test_list[ind])
                        test_f1_average[ind].append(f1_average)
            
            best_epoch = [index for index,v in enumerate(val_f1_average) if v == max(val_f1_average)][-1] 
            best_result.append([f1[best_epoch] for f1 in test_f1_average])

            print("******************************************")
            print("dev results with seed {} on all epochs".format(seed))
            print(val_f1_average)
            best_val.append(val_f1_average[best_epoch])
            print("******************************************")
            print("test results with seed {} on all epochs".format(seed))
            print(test_f1_average)
            print("******************************************")
        
        # model that performs best on the dev set is evaluated on the test set
        print("model performance on the test set: ")
        print(best_result)

In [21]:
# run classifier in unified setting

lr = 2e-5
batch_size = 32
total_epoch = 3
run_classifier(['all'],'Bert','unified')

current random seed:  0


  label = pd.DataFrame.replace(raw_label,['FAVOR','NONE','AGAINST'], [1,2,0])
  label = pd.DataFrame.replace(raw_label,['FAVOR','NONE','AGAINST'], [1,2,0])
  label = pd.DataFrame.replace(raw_label,['FAVOR','NONE','AGAINST'], [1,2,0])


Loading data
Length of x_train: 17224, the sum is: 8347
Length of x_val: 2193, the sum is: 1052
Length of x_test: 2157, the sum is: 1032




Epoch: 0


                                                             

0.5676999343511563


  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)


Epoch: 1


                                                             

0.46931661238693373


  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)


Epoch: 2


                                                             

0.4475349178812121


  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)
  rounded_preds = F.softmax(preds)


******************************************
dev results with seed 0 on all epochs
[np.float64(0.7652703609227975), np.float64(0.7801855795305196), np.float64(0.7847084369826202)]
******************************************
test results with seed 0 on all epochs
[[np.float64(0.7629840848806366), np.float64(0.7522854925263518), np.float64(0.7663891709872255)], [np.float64(0.7661167532610231), np.float64(0.7673881625319476), np.float64(0.7623042505592841)], [np.float64(0.7895407050778865), np.float64(0.8029913055564519), np.float64(0.8014324855560189)]]
******************************************
model performance on the test set: 
[[np.float64(0.7663891709872255), np.float64(0.7623042505592841), np.float64(0.8014324855560189)]]
Saved model to ../Dataset/trained_model/all_seed0_epoch2.pt


In [59]:
import torch, pandas as pd
import os
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# target = "Donald Trump"
# target = "Bernie Sanders"
target = "Joe Biden"
folder_name = "election"

df = pd.read_csv("../Dataset/election.csv") 
df = df[["Tweet", "Time"]]                      

tokenizer   = BertTokenizer.from_pretrained("bert-base-uncased",
                                            do_lower_case=True)
LABEL_MAP   = {0: "AGAINST", 1: "FAVOR"}

tweets  = df["Tweet"].astype(str).str.split().tolist()
targets = [target.split()] * len(tweets)

ids, segs, masks, slen, sample_map = convert_data_to_ids(tokenizer, targets, tweets)
print(type(ids))          # list
print(type(ids[0]))       # list or tensor?
print(len(ids[0]))        # 15301  (way too large)
print(len(ids[1]))  
test_all = [ids, segs, masks, slen]     # (N,)

# dummy label vector so the dataset shape matches what your model expects
# dummy_labels = torch.zeros(len(input_ids), dtype=torch.long)
x_test_input_ids, x_test_seg_ids, x_test_atten_masks, x_test_len, testloader = data_loader(test_all, batch_size = 64, data_type='test')
# dataset = TensorDataset(input_ids,
#                         token_type,
#                         attn_mask,
#                         dummy_labels,          # dummy labels
#                         seq_len)
# loader  = DataLoader(dataset,
#                      sampler=SequentialSampler(dataset),
#                      batch_size=64,                     # any value
#                      shuffle=False)

model = stance_classifier(num_labels=len(LABEL_MAP), model_select="Bert")
# model.load_state_dict(torch.load("../Dataset/trained_model/all_seed0_epoch2.pt", map_location="cuda", weights_only=False))
ckpt = torch.load("../Dataset/trained_model/all_seed0_epoch2.pt", map_location="cuda", weights_only=False)
model_weight = ckpt["model_state_dict"]
model.load_state_dict(model_weight)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

pred = []
with torch.no_grad():
    for b_ids, b_segs, b_masks, b_len in testloader:
        b_ids, b_segs, b_masks, b_len = \
            (t.to(device) for t in (b_ids, b_segs, b_masks, b_len))
        logits = model(b_ids, b_segs, b_masks, b_len)     # forward()
        pred.append(logits.cpu())
        # pred.extend(logits.argmax(dim=-1).cpu().tolist())
logits_all  = torch.cat(pred, dim=0)  
doc_logits  = torch.zeros(len(df), logits_all.size(1))
for lgt, doc_idx in zip(logits_all, sample_map):
    doc_logits[doc_idx] += lgt
doc_logits /= torch.bincount(torch.tensor(sample_map)).unsqueeze(1)  # mean
pred = doc_logits.argmax(dim=-1).tolist()
df["Predicted_Stance"] = [LABEL_MAP[i] for i in pred]
out_cols = ["Tweet", "Time", "Predicted_Stance"]  
os.makedirs(f"final/{folder_name}/", exist_ok=True)
df.to_csv(f"final/{folder_name}/{target}_results.csv", index=False)
df.head()


<class 'list'>
<class 'list'>
512
512


Unnamed: 0,Tweet,Time,Predicted_Stance
0,"--- \n+++ \n@@ -2,31 +2,31 @@\n {{use American...",2020-11-04 18:01:34+00:00,FAVOR
1,"--- \n+++ \n@@ -1,44 +1,44 @@\n {{Current elec...",2020-11-04 05:27:59+00:00,AGAINST
2,"--- \n+++ \n@@ -506,42 +506,42 @@\n ! style=""w...",2020-11-04 03:57:53+00:00,AGAINST
3,"--- \n+++ \n@@ -462,50 +462,55 @@\n | candida...",2020-11-04 15:55:19+00:00,AGAINST
4,"--- \n+++ \n@@ -2147,38 +2147,38 @@\n | candid...",2020-11-04 17:58:56+00:00,AGAINST
