### Check if GPU is available.

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch

In [3]:
from utils import *

### Load data

In [4]:
!pwd

/Users/roccc/Desktop/Projects/jpnb/[202304]谣言可解释


In [5]:
set_workspace('/Users/roccc/Desktop/Projects/rumor_detection')

workspace: /Users/roccc/Desktop/Projects/rumor_detection


In [6]:
events_nodes_text = open_json('./data/pheme-2/events_nodes_text_dict.json')
events_label = open_json('./data/pheme-2/events_label.json')

In [7]:
events_root_text = dict(map(lambda x: (x[0], x[1]['0']), events_nodes_text.items()))
events_label = dict(map(lambda item: (item[0], 0 if item[1] is None else 1), events_label.items()))

In [8]:
import pandas as pd
events_df = pd.concat([pd.Series(events_root_text), pd.Series(events_label)], axis=1)
events_df.columns = ['sentences','label']  

In [9]:
len(events_nodes_text)

6425

In [10]:
events_df.label.value_counts()

0    4023
1    2402
Name: label, dtype: int64

In [11]:
events_df.label = events_df.label.astype(int)

In [12]:
sentences = events_df.sentences
labels = events_df.label

### Tokenizing

In [13]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [14]:
import re
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [15]:
from transformers.models.auto.configuration_auto import model_type_to_module_name
def preprocessing_for_bert(data):
    """Perform required process for pretrained bert.
    @param data (np.array): array of text to be processed.
    @return input_ids (torch.Tensor): Tensor of token ids to be fed into a model.
    @return attention_masks (torch.Tensor): Tensor of indices specifying which tokens should be attended by the model_type_to_module_name 
    """
    input_ids = []
    attention_masks = []

    for sentence in data:
        # 1) Tokenize the sentence
        # 2) Add special token [CLS], [SEP]
        # 3) Truncate or Pad to max length
        # 4）Map tokens to ids
        # 5) create a attention masks
        # 6) return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sentence),
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True
        )

        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [16]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  73


In [17]:
MAX_LEN = 128

In [18]:
input_ids, attention_masks = preprocessing_for_bert(sentences)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [67]:
labels = torch.tensor(labels, dtype=torch.float32)

### Train valid split

In [20]:
def label_distribution(dataset,num_classes=2):
    classes_distribution = dict()
    for i in range(num_classes):
        ratio = round(sum(list(map(lambda x: x[2]==i, dataset))).numpy() / len(dataset), 6)
        classes_distribution.update({i:ratio})

    return classes_distribution

In [90]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

In [82]:
import random
def random_selector(dataset, num):
    length = len(dataset)
    idx = random.sample(dataset, num)
    
    return idx

In [89]:
dataset

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.])

In [62]:
torch.save(dataset, 'data/pheme_text_dataset.pt')

In [23]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

### Model

In [24]:
import torch.nn as nn
from transformers import BertModel

In [25]:
class BertEncoder(nn.Module):
    def __init__(self, freeze_bert=False):
        super().__init__()
    
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        last_hidden_state_cls = outputs[0][:, 0, :]

        return last_hidden_state_cls

In [70]:
class Classifier(nn.Module):
    def __init__(self, input_channels, hidden_channels, num_classes):
        super().__init__()
        self.input_channels = input_channels
        self.hidden_channels = hidden_channels
        self.num_classes = num_classes 

        self.lin1 = nn.Linear(self.input_channels, self.hidden_channels)
        self.lin2 = nn.Linear(self.hidden_channels, self.num_classes)
    
    def forward(self, x):
        x = self.lin1(x)
        x = x.relu()
        x = self.lin2(x)
        x = x.relu()
        return x

In [71]:
class TextClassifier(nn.Module):
    def __init__(self, input_channels=768, hidden_channels=256, num_classes=2, freeze_bert=False):
        super().__init__()
        self.encoder = BertEncoder(freeze_bert=freeze_bert)
        self.classifier = Classifier(input_channels, hidden_channels, num_classes)

    def forward(self, input_ids, attention_mask):
        x = self.encoder(input_ids, attention_mask)
        x = self.classifier(x)
        return x

#### Optimizer, lr scheduler

In [28]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [29]:
from sklearn.metrics import precision_recall_fscore_support

### Train and Test

In [30]:
import random
import time
import numpy as np
from tqdm import tqdm

In [31]:
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

In [63]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    print('Training started...')

    total_loss, batch_loss, batch_counts = 0, 0, 0
    for batch in tqdm(dataloader, desc='Iteration'):
        batch_counts += 1
        # data.to(device)
        optimizer.zero_grad()

        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        output = model(b_input_ids,b_attn_mask)

        loss = criterion(output.flatten(), b_labels)
        # batch_loss += loss.item()
        total_loss += loss.item()
        # print(f"{batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

        loss.backward()
        optimizer.step()
        # scheduler.step()
    avg_train_loss = total_loss / len(dataloader)
    print(f"Avg Train Loss: {avg_train_loss:^12.6f}")


In [74]:
def evaluate(model, dataloader,optimizer, criterion, device):
    model.eval()
    total_loss = 0
    y_true = []
    y_preds = []
    for batch in dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            output = model(b_input_ids,b_attn_mask)
        loss = criterion(output.flatten(), b_labels)
        total_loss += loss.item()

        preds = (output > 0.5).flatten()
        y_true += b_labels.tolist()
        y_preds += preds.tolist()
    print(y_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_preds, average='weighted')
    val_loss = total_loss / len(dataloader)
    val_acc = (np.array(y_preds)==np.array(y_true)).mean()

    return val_loss, val_acc, precision, recall, f1


In [75]:
def pipeline(dataset, test_proportion, num_classes=1, batch_size=32, num_epochs=4, seed=0, print_per_epoch=1, dataset_name=0):
    device = set_device()
    set_seed(seed)
    # split dataset
    # Calculate the number of samples to include in each set.
    val_size = int(test_proportion * len(dataset))
    train_size = len(dataset) - val_size

    # Divide the dataset by randomly selecting samples.
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))
    print(f'Train label distribution: {label_distribution(train_dataset,2)}')
    # dataloader
    batch_size = batch_size

    train_dataloader = DataLoader(
                train_dataset,  # The training samples.
                sampler = RandomSampler(train_dataset), # Select batches randomly
                batch_size = batch_size # Trains with this batch size.
            )

    validation_dataloader = DataLoader(
                val_dataset, # The validation samples.
                sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
                batch_size = batch_size # Evaluate with this batch size.
            )
    # model
    text_classifier = TextClassifier(num_classes=num_classes, freeze_bert=False)
    text_classifier.to(device)

    optimizer = AdamW(text_classifier.parameters(),
                      lr=5e-5,
                      eps=1e-8
                      )
    
    # text_classifier, optimizer, scheduler = initialize_model(device, num_epochs=4)
    criterion = nn.BCELoss()
    
    # train and evaluate
    train_stats = []
    for epoch in range(1, num_epochs+1):
        t0 = time.time()
        train(text_classifier, train_dataloader, optimizer,criterion, device=device)
        t1 = time.time()
        train_loss, train_acc, _, _, _ = evaluate(text_classifier, train_dataloader, optimizer,criterion, device=device)
        val_loss, val_acc, precision, recall, f1 = evaluate(text_classifier, validation_dataloader, optimizer,criterion, device=device)
        t2 = time.time()
        train_stats.append(
            {
                "epoch": epoch,
                "train_loss": train_loss,
                "train_acc": train_acc,
                "val_loss": val_loss,
                "val_acc": val_acc,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "train_time": t1 - t0,
                "eval_time": t2 - t1,
            }
        )
        if epoch % print_per_epoch == 0 or epoch == num_epochs-1:
            print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:04f}, Test Acc: {val_acc:04f},\n Precision: {precision:04f}, Recall: {recall:04f}, F1: {f1:04f}')
    save_as_json(train_stats, f'./stats/train_stats_{dataset_name}_{test_proportion}_{seed}_{num_epochs}.json')

In [76]:
pipeline(dataset, test_proportion=0.2, batch_size=32, num_epochs=4, seed=1, print_per_epoch=1,dataset_name='pheme-text')

No GPU is available, cpu will be used instead.
    3 training samples
    0 validation samples


TypeError: type numpy.ndarray doesn't define __round__ method

In [None]:
# for seed in range(5):
#     torch.cuda.empty_cache()
#     pipeline(dataset, test_proportion=0.999, batch_size=32, num_epochs=4, seed=seed+10, print_per_epoch=1, dataset_name='pheme-root-text')