# Chapter 2: Modeling (Local)

In [3]:
import torch
from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper, IterDataPipe
from transformers import AutoTokenizer
import pandas as pd

from tqdm import tqdm
from transformers import logging


import boto3, json

In [4]:
logging.set_verbosity_error()

BUCKET_NAME = 'xy-mp-pipeline'
METADATA_KEY = 'data/covid-csv-metadata.json'
bucket = boto3.resource('s3').Bucket(BUCKET_NAME)
metadata = json.loads(bucket.Object(METADATA_KEY).get()['Body'].read())
input_schema = metadata['schema']['input_features']

In [5]:

OUTPUT_PATH = 'data/covid-csv'
N_SAMPLES = metadata['dataset_size']
TRAIN_FILES = N_SAMPLES * 4 // 5 // 16 + 1
TEST_FILES = N_SAMPLES // 5 // 16 + 1
BATCH_SIZE = metadata['batch_size']
TRAIN_S3_URL = f's3://{BUCKET_NAME}/{OUTPUT_PATH}/training/'
TEST_S3_URL = f's3://{BUCKET_NAME}/{OUTPUT_PATH}/testing/'
TEST_DATASET_SIZE = metadata['test_size']
TRAIN_DATASET_SIZE = metadata['train_size']
MODEL_OUTPUT_PATH = 'assets/model'

### Create a data pipe
I'm sure most of PyTorch users are already familiar with Datasets, it is a convenient way to load data into memory. This time, we will use a different approach to load data into memory, which is called data pipe. Data pipe is a new feature introduced in PyTorch 1.8.0, it is a new way to load data into memory, and it is more flexible than Datasets. 

The main benefit for this particular project is that we can load data from a cloud bucket, one batch at a time, and we can also do some preprocessing on the fly. This is very useful when we have a large dataset and we don't want to load all the data into memory at once.

With this, we can also easily perform data parallel training, which is a very useful technique when we have a large dataset and we want to train our model faster.

**Recall the dataframe**
>
    RangeIndex: 19454 entries, 0 to 19453
    Data columns (total 22 columns):
    #   Column           Non-Null Count  Dtype 
    ---  ------           --------------  ----- 
    0   headlines        19454 non-null  object
    1   length           19454 non-null  int64 
    2   has_num          19454 non-null  bool  
    3   ner_percent      19454 non-null  int64 
    4   ner_quantity     19454 non-null  int64 
    5   ner_law          19454 non-null  int64 
    6   ner_person       19454 non-null  int64 
    7   ner_product      19454 non-null  int64 
    8   ner_gpe          19454 non-null  int64 
    9   ner_work_of_art  19454 non-null  int64 
    10  ner_date         19454 non-null  int64 
    11  ner_time         19454 non-null  int64 
    12  ner_cardinal     19454 non-null  int64 
    13  ner_org          19454 non-null  int64 
    14  ner_money        19454 non-null  int64 
    15  ner_language     19454 non-null  int64 
    16  ner_ordinal      19454 non-null  int64 
    17  ner_event        19454 non-null  int64 
    18  ner_loc          19454 non-null  int64 
    19  ner_fac          19454 non-null  int64 
    20  ner_norp         19454 non-null  int64 
    21  outcome          19454 non-null  int64 
    dtypes: bool(1), int64(20), object(1)
    memory usage: 3.1+ MB

For this model, we want to pass the text into a pre-trained BERT model, at the same time, we also want to pass the other features into a fully connected layer. To do this, we need to create a custom data pipe that gives the input to BERT, tabular features, and the label.

In [6]:
class TextDataset(IterDataPipe):
    def __init__(self, s3_data_path, tokenizer, num_files):
        super().__init__()
        self.tokenizer = tokenizer
        self.url_wrapper = IterableWrapper([s3_data_path]).list_files_by_s3().shuffle().sharding_filter()
        self.num_files = num_files

    def __iter__(self):
        for _, file in self.url_wrapper.load_files_by_s3():
            temp = pd.read_csv(file)
            label = torch.from_numpy(temp['outcome'].values)
            # For BERT model
            bert_input = []
            embedded = [self.tokenizer(t, padding='max_length', max_length=100, truncation=True, return_tensors='pt') for t in temp['headlines']]
            bert_input.append(torch.cat([e['input_ids'] for e in embedded], dim=0))
            bert_input.append(torch.cat([e['attention_mask'] for e in embedded], dim=0))

            # Tabular features
            tabular_input = [torch.from_numpy(temp[col].values).to(torch.float32).squeeze() for col in temp.columns if col not in ['outcome', 'headlines']]
            yield bert_input, tabular_input, label

    def __len__(self):
        return self.num_files



Now, we can start defining our model that extends from a pre-trained BERT model. This will give us a model that can take in text input and output a vector representation of the text. We can then combine this with a fully connected layer that takes in the tabular features and the vector representation of the text, and output a prediction.

In [7]:
from transformers import BertModel

class FakeNewsClassifier(torch.nn.Module):
    def __init__(self, pretrained_model_name):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout_1 = torch.nn.Dropout(0.25)
        self.linear = torch.nn.Linear(768, 12)
        self.dropout_2 = torch.nn.Dropout(0.25)
        self.final_linear = torch.nn.Linear(32, 1)
        self.relu = torch.nn.ReLU()
        self.normalize = torch.nn.functional.normalize
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, bert_input: dict, tabular_input: list):
        print(bert_input['input_ids'].shape, bert_input['attention_mask'].shape, tabular_input.shape)
        _, pooled_output = self.bert(**bert_input)
        dropout_1_output = self.dropout_1(pooled_output)
        linear_output = self.linear(dropout_1_output)
        relu_output = self.relu(linear_output)
        norm1 = self.normalize(relu_output, p=2, dim=1)
        norm2 = self.normalize(tabular_input, p=2, dim=1)
        combined_output = torch.cat([norm1, norm2], dim=1)
        dropout_2_output = self.dropout_2(combined_output)
        final_output = self.final_linear(dropout_2_output)
        return self.sigmoid(final_output)
    

This is what the model looks like

![alt text](images/model_architecture.png "Model Architecture")

In [8]:
def train_model(pretrained_model_name: str, train_data_url: str, test_data_url: str, train_len: int, test_len:int, train_file_len: int, test_file_len: int, epochs: int, lr: float):
    # Prepare dataloaders
    model = FakeNewsClassifier(pretrained_model_name)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

    train_df = TextDataset(train_data_url, tokenizer, train_file_len)
    test_df = TextDataset(test_data_url, tokenizer, test_file_len)

    train_loader = DataLoader(train_df, batch_size=1, shuffle=True)
    test_loader = DataLoader(test_df, batch_size=1, shuffle=True)

    # Config device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Config optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_function = torch.nn.BCELoss()

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Train
    for epoch in range(epochs):
        training_loss = 0.0
        training_acc = 0.0

        for bert_input, tabular_input, label in tqdm(train_loader):
            bert_input = {
                'input_ids': bert_input[0].squeeze().to(device),
                'attention_mask': bert_input[1].squeeze().to(device),
                'return_dict': False
            }
            tabular_input = torch.cat(tabular_input).T.to(device)
            label = label.T.to(device)

            output = model(bert_input, tabular_input)

            loss = loss_function(output, label.float())
            training_loss += loss.item()

            # get acc of signmoid output
            acc = (output[0].round() == label).sum().item()
            training_acc += acc

            model.zero_grad()
            loss.backward()
            optimizer.step()

        validation_loss = 0.0
        validation_acc = 0.0

        with torch.no_grad():
            for bert_input, tabular_input, label in test_loader:
                bert_input = {
                    'input_ids': bert_input[0].squeeze().to(device),
                    'attention_mask': bert_input[1].squeeze().to(device),
                    'return_dict': False
                }
                tabular_input = torch.cat(tabular_input).T.to(device)
                label = label.T.to(device)

                output = model(bert_input, tabular_input)

                loss = loss_function(output, label.float())
                validation_loss += loss.item()

                # get acc of signmoid output
                acc = (output[0].round() == label).sum().item()
                validation_acc += acc
        print(f'Epoch: {epoch+1}/{epochs} | Training loss: {training_loss/train_len:.3f} | Training acc: {training_acc/train_len:.3f} | Validation loss: {validation_loss/test_len:.3f} | Validation acc: {validation_acc/test_len:.3f}')

In [10]:
pretrain_name = 'bert-base-uncased'
model = FakeNewsClassifier(pretrain_name)
EPOCHS = 1
LR = 5e-6

train_model(
    pretrain_name, 
    TRAIN_S3_URL, 
    TEST_S3_URL, 
    TRAIN_DATASET_SIZE, 
    TEST_DATASET_SIZE, 
    TRAIN_FILES, 
    TEST_FILES,
    EPOCHS, 
    LR
)

import os
if not os.path.exists(MODEL_OUTPUT_PATH):
    os.makedirs(MODEL_OUTPUT_PATH)

torch.save(model.state_dict(), MODEL_OUTPUT_PATH + '/baseline.pth')
print('Model saved to:  ', MODEL_OUTPUT_PATH)

100%|██████████| 973/973 [03:28<00:00,  4.67it/s]


Epoch: 1/1 | Training loss: 0.038 | Training acc: 0.871 | Validation loss: 0.038 | Validation acc: 0.883
Model saved to:   assets/model


### Load model and use it

In [9]:
import torch 
from transformers import AutoTokenizer
pretrain_name = 'bert-base-uncased'
# load model from output path
model = FakeNewsClassifier(pretrain_name)
model.load_state_dict(torch.load(MODEL_OUTPUT_PATH + '/baseline.pth'))

<All keys matched successfully>

### Measure F1 score from test set

In [6]:


# from sklearn.metrics import f1_score
# tokenizer = AutoTokenizer.from_pretrained(pretrain_name)
# test_ds = TextDataset(TEST_S3_URL, tokenizer, TEST_FILES)
# test_loader = DataLoader(test_ds, batch_size=1, shuffle=True)

# with torch.no_grad():
#     outputs = []
#     labels = []
#     for bert_input, tabular_input, label in tqdm(test_loader):
#         bert_input = {
#             'input_ids': bert_input[0].squeeze(),
#             'attention_mask': bert_input[1].squeeze(),
#             'return_dict': False
#         }
#         tabular_input = torch.cat(tabular_input).T
#         label = label.T

#         outputs.append(model(bert_input, tabular_input))
#         labels.append(label)

# output = torch.cat(outputs, dim=0)
# label = torch.cat(labels, dim=0)
# print(f'F1 score: {f1_score(output.round().detach().numpy(), label.detach().numpy())}')

100%|██████████| 244/244 [03:42<00:00,  1.10it/s]

F1 score: 0.8316017316017316





### Make inference with model and user input

In [16]:
import spacy
from collections import Counter
import re
nlp = spacy.load('en_core_web_sm')
tokenizer = AutoTokenizer.from_pretrained(pretrain_name)

def preprocess_headline(headline: str):
    tokens = tokenizer(headline, return_tensors='pt', padding='max_length', truncation=True, max_length=64)
    tokens = {k: v.squeeze() for k, v in tokens.items()}
    input_ids = tokens['input_ids'].unsqueeze(0)
    attention_mask = tokens['attention_mask'].unsqueeze(0)
    bert_input = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'return_dict': False
    }


    headline_len = len(headline.split())
    has_stats = int(re.match(r'\d', headline) is not None)

    ner_count = [f'ner_{ent.label_}' for ent in nlp(headline).ents]
    ner_count = Counter(ner_count)
    ner_input = get_ner_input(ner_count, input_schema)
    return bert_input, torch.cat([torch.tensor([headline_len, has_stats]), ner_input], dim=0).unsqueeze(0)

    
def get_ner_input(ner_count, input_schema):
    ner_features = [s for s in input_schema if s.startswith('ner_')]
    ner_input = torch.zeros(len(ner_features))
    for i, feat in enumerate(ner_features):
        ner_input[i] = ner_count.get(feat, 0)
    return ner_input
    

def classify(headline: str):
    bert_input, tabular_input = preprocess_headline(headline)
    with torch.no_grad():
        output = model(bert_input, tabular_input)
    return output.round().item()


In [17]:
# notebook text input ui
from ipywidgets import widgets
from IPython.display import display

text = widgets.Textarea(
    value='',
    placeholder='Type something',
    description='Headline:',
    disabled=False,
    layout={'width': '69%', 'height': '69px', 'display': 'flex', 'flex_flow': 'column wrap'}
)

button = widgets.Button(description="Fact Check", layout={'display': 'flex'})
out = widgets.Label(value='Click button to fact check headline!', layout={'display': 'flex', 'flex_flow': 'column wrap', 'align_items': 'flex-end'})

def fact_check(b):
    headline = text.value
    pred = classify(headline)
    
    if pred == 1:
        
        out.value = 'This is fake news!'
    else:
        out.value = 'This is real news!'

button.on_click(fact_check)

vb = widgets.Box([button, out])
display(text)
display(vb)

Textarea(value='', description='Headline:', layout=Layout(display='flex', flex_flow='column wrap', height='69p…

Box(children=(Button(description='Fact Check', layout=Layout(display='flex'), style=ButtonStyle()), Label(valu…