In [83]:
import os
from typing import Tuple, List
from functools import partial

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.autograd import Variable
from transformers import AutoTokenizer, AutoConfig, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm
from pathlib import Path
import logging
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import json
import argparse
import math
import random
import sys

torch.manual_seed(0)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
sys.path.append('./code')

from model import HierachyVAE
from read_data import *
from utils import *
from parsed_args import args

In [84]:
data_path = './data/borrow/'
with open(data_path + 'labeled_data.pkl', 'rb') as f:
    labeled_data = pickle.load(f)
    # {mid: sentences, labels}
with open(data_path + 'unlabeled_data.pkl', 'rb') as f:
    unlabeled_data = pickle.load(f)
    # {mid: message}
with open(data_path + 'mid2target.pkl', 'rb') as f:
    mid2target = pickle.load(f)
    # {mid: target, team_size}

with open(data_path + 'label_mapping.pkl', 'rb') as f:
    label_mapping = pickle.load(f)

In [86]:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
use_cuda = torch.cuda.is_available()
devices = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("gpu num: ", n_gpu)


if args.warm_up == 'False':
    args.warm_up = False
else:
    args.warm_up = True
    
    
if args.hard == 'False':
    args.hard = False
else:
    args.hard = True

train_labeled_dataset, train_unlabeled_dataset, test_dataset, val_dataset, vocab, n_labels, doc_labels = read_data(
    data_path=args.data_path, n_labeled_data=args.n_labeled_data, n_unlabeled_data=args.n_unlabeled_data, max_seq_num=args.max_seq_num, max_seq_len=args.max_seq_len, embedding_size=args.embedding_size)

val_loader = Data.DataLoader(
    dataset=val_dataset, batch_size=16, shuffle=False)
test_loader = Data.DataLoader(
    dataset=test_dataset, batch_size=16, shuffle=False)


gpu num:  1
{0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 0, 6: 0, 7: 0, 8: 4, 9: 5}
unk words:  9516
vocab size:  13944
#Labeled: 900, Unlabeled 48155, Val 400, Test 400, N class 6, 2


In [88]:
val_text = []
val_labels = []
val_mids = []
for batch_idx, (x, l, y, mask1, mask2, mask3, mask4, mid, sent_len, doc_len) in enumerate(val_loader):
    val_mids += mid
    for k in mid:
        val_text.append(" ".join([str(x) for x in all_txts_obj[k][0]]))
        val_labels.append(mid2target[k])

test_text = []
test_labels = []
test_mids = []
for batch_idx, (x, l, y, mask1, mask2, mask3, mask4, mid, sent_len, doc_len) in enumerate(test_loader):
    test_mids += mid
    for k in mid:
        test_text.append(" ".join([str(x) for x in all_txts_obj[k][0]]))
        test_labels.append(mid2target[k])
test_mids = set(test_mids)
val_mids = set(val_mids)

In [89]:
test_df = pd.DataFrame({"body": test_text, "label": test_labels})
val_df = pd.DataFrame({"body": val_text, "label": val_labels})

In [92]:
train_text = []
train_labels = []
for k in mid2target:
    if k not in test_mids and k not in val_mids:
        if type(all_txts_obj[k]) is list:
            train_text.append(" ".join([str(x) for x in all_txts_obj[k][0]]))
        else:
            train_text.append(all_txts_obj[k])
        train_labels.append(mid2target[k])

In [93]:
train_df = pd.DataFrame({"body": train_text, "label": train_labels})

In [12]:
train_df.to_csv("train_split.csv")
test_df.to_csv("test_split.csv")
val_df.to_csv("val_split.csv")

In [13]:
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

bert_model_name = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
assert tokenizer.pad_token_id == 0, "Padding value used in masks is set to zero, please change it everywhere"

In [15]:
class PersuasionDataset(Dataset):
    
    def __init__(self, tokenizer: AutoTokenizer, dataframe: pd.DataFrame, lazy: bool = False):
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.lazy = lazy
        if not self.lazy:
            self.X = []
            self.Y = []
            for i, (row) in tqdm(dataframe.iterrows()):
                x, y = self.row_to_tensor(self.tokenizer, row)
                self.X.append(x)
                self.Y.append(y)
        else:
            self.df = dataframe        
    
    @staticmethod
    def row_to_tensor(tokenizer: AutoTokenizer, row: pd.Series, idx):
        tokens = tokenizer.encode(str(row["body"]), add_special_tokens=True)
        if len(tokens) > 511:
            tokens = tokens[:511] + [tokens[-1]]
        x = torch.LongTensor(tokens)
        y = torch.FloatTensor([row["label"]])
        return x, y, idx
        
    
    def __len__(self):
        if self.lazy:
            return len(self.df)
        else:
            return len(self.X)

    def __getitem__(self, index: int) -> Tuple[torch.LongTensor, torch.LongTensor]:
        if not self.lazy:
            return self.X[index], self.Y[index], index
        else:
            return self.row_to_tensor(self.tokenizer, self.df.iloc[index], index) 
            

def collate_fn(batch) \
        -> Tuple[torch.LongTensor, torch.LongTensor]:
    x, y, index = list(zip(*batch))
    x = pad_sequence(x, batch_first=True, padding_value=0)
    y = torch.stack(y)
    return x.cuda(), y.cuda(), index

train_dataset = PersuasionDataset(tokenizer, train_df, lazy=True)
dev_dataset = PersuasionDataset(tokenizer, val_df, lazy=True)
test_dataset = PersuasionDataset(tokenizer, test_df, lazy=True)
collate_fn = partial(collate_fn)
BATCH_SIZE = 8
train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(dev_dataset)
test_sampler = RandomSampler(test_dataset)

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
dev_iterator = DataLoader(dev_dataset, batch_size=BATCH_SIZE, sampler=dev_sampler, collate_fn=collate_fn)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [16]:
config = AutoConfig.from_pretrained("bert-base-cased", output_attentions=True)

In [17]:
class BertClassifier(nn.Module):
    
    def __init__(self, bert: AutoModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                
            labels=None):
        distilbert_output = self.bert(input_ids,
                               attention_mask=attention_mask,

                               head_mask=head_mask)


        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.classifier(pooled_output) # batch, 6
        pooled_output = torch.sigmoid(pooled_output)
        criterion = nn.BCELoss()
        loss = 0
        if labels is not None:
            loss = criterion(pooled_output, labels)
        return loss, pooled_output, distilbert_output[-1]

model = BertClassifier(AutoModel.from_pretrained(bert_model_name, config=config), 1).cuda()

In [18]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
EPOCH_NUM = 10
# triangular learning rate, linearly grows untill half of first epoch, then linearly decays 
warmup_steps = 10 ** 3
total_steps = len(train_iterator) * EPOCH_NUM - warmup_steps
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)


In [19]:
def train(model, iterator, optimizer, scheduler):
    model.train()
    total_loss = 0
    for x, y, idx in tqdm(iterator, position=0, leave=True):
        optimizer.zero_grad()
        mask = (x != 0).float()
        loss, outputs, attn = model(x, attention_mask=mask, labels=y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Train loss {total_loss / len(iterator)}")


In [20]:
def evaluate(model, iterator, df):
    

    model.eval()
    total = 0
    correct = 0
    perms = 0
    total_perms = 0
    total_model_outs = []
    total_model_labels = []
    with torch.no_grad():
        total_loss = 0
        for x, y, idx in tqdm(iterator, position=0, leave=True):
            mask = (x != 0).float()
            loss, outputs, attn = model(x, attention_mask=mask, labels=y)
            total_loss += loss
            
            model_outputs = (outputs.squeeze() > .5).tolist()
            correct_labels = (y.squeeze() == 1).tolist()
            
            total_model_outs = total_model_outs + model_outputs
            total_model_labels = total_model_labels + correct_labels

    
    
    print(total_model_labels[:10])
    print(total_model_outs[:10])
    
    print("roc auc: ")
    print(roc_auc_score(total_model_labels, total_model_outs))
    print("recall: ")
    print(recall_score(total_model_labels, total_model_outs, average="macro"))
    print("prec: ")
    print(precision_score(total_model_labels, total_model_outs, average="macro"))
    print("f1: ")
    print(f1_score(total_model_labels, total_model_outs, average="macro"))
    print("accuracy: ")
    print(accuracy_score(total_model_labels, total_model_outs))
    print(f"Evaluate loss {total_loss / len(iterator)}")

In [28]:
torch.save(model.state_dict(), './bert_baseline_tmp.pkl')

In [39]:
evaluate(model, test_iterator, test_df)
evaluate(model, dev_iterator, val_df)

100%|██████████| 50/50 [00:01<00:00, 33.49it/s]
 10%|█         | 5/50 [00:00<00:01, 39.34it/s]

[False, True, True, True, True, True, False, True, False, True]
[False, True, True, True, False, True, True, False, True, True]
roc auc: 
0.643358395989975
recall: 
0.6433583959899749
prec: 
0.6439210950080516
f1: 
0.64349376114082
accuracy: 
0.645
Evaluate loss 0.9059433341026306


100%|██████████| 50/50 [00:01<00:00, 32.46it/s]

[False, False, False, True, False, False, True, False, False, True]
[False, True, False, False, False, True, False, False, True, False]
roc auc: 
0.6402243589743589
recall: 
0.640224358974359
prec: 
0.6422222222222222
f1: 
0.640050090931402
accuracy: 
0.6425
Evaluate loss 0.9702701568603516





In [None]:
for i in range(1):
    print('=' * 50, f"EPOCH {i}", '=' * 50)
    train(model, train_iterator, optimizer, scheduler)
    evaluate(model, dev_iterator, val_df)    
    evaluate(model, test_iterator, test_df)

In [29]:
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
f1s = []
ps = []
rs = []
corpus = train_df["body"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([str(x) for x in corpus])

for i in range(0, 100):
    

    clf = DummyClassifier(strategy="uniform").fit(X, train_df["label"])

    results = clf.predict(vectorizer.transform(test_df["body"]))

    total_model_outs = results
    total_model_labels = test_df["label"]

    f1s.append(f1_score(total_model_labels, total_model_outs, average="macro"))
    rs.append(recall_score(total_model_labels, total_model_outs, average="macro"))
    ps.append(precision_score(total_model_labels, total_model_outs, average="macro"))

In [110]:
from sklearn.naive_bayes import MultinomialNB
corpus = train_df["body"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([str(x) for x in corpus])
nb = MultinomialNB()

nb.fit(X, train_df["label"])


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [114]:
results = nb.predict(vectorizer.transform(test_df["body"]))

total_model_outs = results
total_model_labels = test_df["label"]

print("roc auc: ")
print(roc_auc_score(total_model_labels, total_model_outs))
print("recall: ")
print(recall_score(total_model_labels, total_model_outs, average="macro"))
print("prec: ")
print(precision_score(total_model_labels, total_model_outs, average="macro"))
print("f1: ")
print(f1_score(total_model_labels, total_model_outs, average="macro"))
print("accuracy: ")
print(accuracy_score(total_model_labels, total_model_outs))

results = nb.predict(vectorizer.transform(val_df["body"]))

total_model_outs = results
total_model_labels = val_df["label"]

print("roc auc: ")
print(roc_auc_score(total_model_labels, total_model_outs))
print("recall: ")
print(recall_score(total_model_labels, total_model_outs, average="macro"))
print("prec: ")
print(precision_score(total_model_labels, total_model_outs, average="macro"))
print("f1: ")
print(f1_score(total_model_labels, total_model_outs, average="macro"))
print("accuracy: ")
print(accuracy_score(total_model_labels, total_model_outs))


roc auc: 
0.6254385964912281
recall: 
0.6254385964912281
prec: 
0.6370519455625838
f1: 
0.6212292014248997
accuracy: 
0.6325
roc auc: 
0.6119791666666667
recall: 
0.6119791666666666
prec: 
0.62168376370711
f1: 
0.6071256620125179
accuracy: 
0.6175
