# Chapter 2: Modeling (Local)

In [1]:
import torch
from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper, IterDataPipe
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm

In [2]:
BUCKET_NAME = 'xy-mp-pipeline'
OUTPUT_PATH = 'data/covid-csv'
N_SAMPLES = 19454
TRAIN_FILES = N_SAMPLES * 4 // 5 // 16 + 1
TEST_FILES = N_SAMPLES // 5 // 16 + 1
BATCH_SIZE = 16
TRAIN_S3_URL = f's3://{BUCKET_NAME}/{OUTPUT_PATH}/training/'
TEST_S3_URL = f's3://{BUCKET_NAME}/{OUTPUT_PATH}/testing/'
TEST_DATASET_SIZE = N_SAMPLES // 5
TRAIN_DATASET_SIZE = N_SAMPLES - TEST_DATASET_SIZE
MODEL_OUTPUT_PATH = 'assets/model'

### Create a data pipe
I'm sure most of PyTorch users are already familiar with Datasets, it is a convenient way to load data into memory. This time, we will use a different approach to load data into memory, which is called data pipe. Data pipe is a new feature introduced in PyTorch 1.8.0, it is a new way to load data into memory, and it is more flexible than Datasets. 

The main benefit for this particular project is that we can load data from a cloud bucket, one batch at a time, and we can also do some preprocessing on the fly. This is very useful when we have a large dataset and we don't want to load all the data into memory at once.

With this, we can also easily perform data parallel training, which is a very useful technique when we have a large dataset and we want to train our model faster.

**Recall the dataframe**
>
    RangeIndex: 19454 entries, 0 to 19453
    Data columns (total 22 columns):
    #   Column           Non-Null Count  Dtype 
    ---  ------           --------------  ----- 
    0   headlines        19454 non-null  object
    1   length           19454 non-null  int64 
    2   has_num          19454 non-null  bool  
    3   ner_percent      19454 non-null  int64 
    4   ner_quantity     19454 non-null  int64 
    5   ner_law          19454 non-null  int64 
    6   ner_person       19454 non-null  int64 
    7   ner_product      19454 non-null  int64 
    8   ner_gpe          19454 non-null  int64 
    9   ner_work_of_art  19454 non-null  int64 
    10  ner_date         19454 non-null  int64 
    11  ner_time         19454 non-null  int64 
    12  ner_cardinal     19454 non-null  int64 
    13  ner_org          19454 non-null  int64 
    14  ner_money        19454 non-null  int64 
    15  ner_language     19454 non-null  int64 
    16  ner_ordinal      19454 non-null  int64 
    17  ner_event        19454 non-null  int64 
    18  ner_loc          19454 non-null  int64 
    19  ner_fac          19454 non-null  int64 
    20  ner_norp         19454 non-null  int64 
    21  outcome          19454 non-null  int64 
    dtypes: bool(1), int64(20), object(1)
    memory usage: 3.1+ MB

For this model, we want to pass the text into a pre-trained BERT model, at the same time, we also want to pass the other features into a fully connected layer. To do this, we need to create a custom data pipe that gives the input to BERT, tabular features, and the label.

In [63]:
class TextDataset(IterDataPipe):
    def __init__(self, s3_urls, tokenizer, num_files):
        super().__init__()
        self.tokenizer = tokenizer
        self.url_wrapper = s3_urls
        self.num_files = num_files

    def __iter__(self):
        for _, file in self.url_wrapper.load_files_by_s3():
            temp = pd.read_csv(file)
            label = torch.from_numpy(temp['outcome'].values)

            headlines = temp.headlines.values

            # Tabular features
            tabular_input = [torch.from_numpy(temp[col].values).to(torch.float32).squeeze() for col in temp.columns if col not in ['outcome', 'headlines']]
            yield headlines, tabular_input, label

    def __len__(self):
        return self.num_files


            # # For BERT model
            # bert_input = []
            # embedded = [self.tokenizer(t, padding='max_length', max_length=100, truncation=True, return_tensors='pt') for t in temp['headlines']]
            # bert_input.append(torch.cat([e['input_ids'] for e in embedded], dim=0))
            # bert_input.append(torch.cat([e['attention_mask'] for e in embedded], dim=0))


In [77]:
from transformers import BertModel

class FakeNewsClassifier(torch.nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout_1 = torch.nn.Dropout(0.25)
        self.linear = torch.nn.Linear(768, 12)
        self.dropout_2 = torch.nn.Dropout(0.25)
        self.final_linear = torch.nn.Linear(32, 1)
        self.relu = torch.nn.ReLU()
        self.normalize = torch.nn.functional.normalize
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, bert_input: dict, tabular_input: list):
        _, pooled_output = self.bert(**bert_input)
        dropout_1_output = self.dropout_1(pooled_output)
        linear_output = self.linear(dropout_1_output)
        relu_output = self.relu(linear_output)
        norm1 = self.normalize(relu_output, p=2, dim=1)
        norm2 = self.normalize(tabular_input, p=2, dim=1)
        combined_output = torch.cat([norm1, norm2], dim=1)
        dropout_2_output = self.dropout_2(combined_output)
        final_output = self.final_linear(dropout_2_output)
        return self.sigmoid(final_output)
    

This is what the model looks like

![alt text](images/model_architecture.png "Model Architecture")

In [78]:
def train_model(model: torch.nn.Module, train_data_url: str, test_data_url: str, train_len: int, test_len:int, train_file_len: int, test_file_len: int, epochs: int, lr: float):
    # Prepare dataloaders
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    train_s3_url = IterableWrapper([train_data_url]).list_files_by_s3().shuffle().sharding_filter()
    test_s3_url = IterableWrapper([test_data_url]).list_files_by_s3().shuffle().sharding_filter()

    train_df = TextDataset(train_s3_url, tokenizer, train_file_len)
    test_df = TextDataset(test_s3_url, tokenizer, test_file_len)

    train_loader = DataLoader(train_df, batch_size=1, shuffle=True)
    test_loader = DataLoader(test_df, batch_size=1, shuffle=True)

    # Config device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Config optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_function = torch.nn.BCELoss()

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Train
    for epoch in range(epochs):
        training_loss = 0.0
        training_acc = 0.0

        for bert_input, tabular_input, label in tqdm(train_loader):
            bert_input = {
                'input_ids': bert_input[0].squeeze().to(device),
                'attention_mask': bert_input[1].squeeze().to(device),
                'return_dict': False
            }
            tabular_input = torch.cat(tabular_input).T.to(device)
            label = label.T.to(device)

            output = model(bert_input, tabular_input)

            loss = loss_function(output, label.float())
            training_loss += loss.item()

            # get acc of signmoid output
            acc = (output[0].round() == label).sum().item()
            training_acc += acc

            model.zero_grad()
            loss.backward()
            optimizer.step()

        validation_loss = 0.0
        validation_acc = 0.0

        with torch.no_grad():
            for bert_input, tabular_input, label in test_loader:
                bert_input = {
                    'input_ids': bert_input[0].squeeze().to(device),
                    'attention_mask': bert_input[1].squeeze().to(device),
                    'return_dict': False
                }
                tabular_input = torch.cat(tabular_input).T.to(device)
                label = label.T.to(device)

                output = model(bert_input, tabular_input)

                loss = loss_function(output, label.float())
                validation_loss += loss.item()

                # get acc of signmoid output
                acc = (output[0].round() == label).sum().item()
                validation_acc += acc
        print(f'Epoch: {epoch+1}/{epochs} | Training loss: {training_loss/train_len:.3f} | Training acc: {training_acc/train_len:.3f} | Validation loss: {validation_loss/test_len:.3f} | Validation acc: {validation_acc/test_len:.3f}')

In [79]:
model = FakeNewsClassifier()
EPOCHS = 5
LR = 5e-6


train_model(
    model, 
    TRAIN_S3_URL, 
    TEST_S3_URL, 
    TRAIN_DATASET_SIZE, 
    TEST_DATASET_SIZE, 
    TRAIN_FILES, 
    TEST_FILES,
    EPOCHS, 
    LR
)

import os
if not os.path.exists(MODEL_OUTPUT_PATH):
    os.makedirs(MODEL_OUTPUT_PATH)

torch.save(model.state_dict(), MODEL_OUTPUT_PATH + '/baseline.pth')
print('Model saved to:  ', MODEL_OUTPUT_PATH)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 973/973 [40:09<00:00,  2.48s/it]


Epoch: 1/5 | Training loss: 0.035 | Training acc: 0.849 | Validation loss: 0.036 | Validation acc: 0.825


100%|██████████| 973/973 [40:34<00:00,  2.50s/it]


Epoch: 2/5 | Training loss: 0.033 | Training acc: 0.916 | Validation loss: 0.035 | Validation acc: 0.850


100%|██████████| 973/973 [40:58<00:00,  2.53s/it]


Epoch: 3/5 | Training loss: 0.033 | Training acc: 0.936 | Validation loss: 0.035 | Validation acc: 0.883


100%|██████████| 973/973 [40:32<00:00,  2.50s/it]


Epoch: 4/5 | Training loss: 0.032 | Training acc: 0.951 | Validation loss: 0.035 | Validation acc: 0.862


100%|██████████| 973/973 [40:08<00:00,  2.48s/it]


Epoch: 5/5 | Training loss: 0.032 | Training acc: 0.954 | Validation loss: 0.033 | Validation acc: 0.941


RuntimeError: Parent directory assets/model does not exist.

In [87]:

if not os.path.exists(MODEL_OUTPUT_PATH):
    os.makedirs(MODEL_OUTPUT_PATH)
torch.save(model.state_dict(),MODEL_OUTPUT_PATH + '/baseline.pth')

In [84]:
MODEL_OUTPUT_PATH + '/baseline.pth'

'assets/model/baseline.pth'