In [48]:
import torch
from torch.utils.data import DataLoader, Dataset
from torchdata.datapipes.iter import IterableWrapper, S3FileLoader
import numpy as np
import pandas as pd
import os

In [49]:
DATA_SOURCE = 'data/covid.zip'
BUCKET_NAME = 'xy-mp-pipeline'
OUTPUT_PATH = 'data/covid-csv'
N_SAMPLES = 10201
BATCHES = 32

# Preparing and uploading data to S3 bucket

In [50]:
df = pd.read_csv(DATA_SOURCE)
df.head()

Unnamed: 0,headlines,outcome
0,A post claims compulsory vacination violates t...,0
1,A photo claims that this person is a doctor wh...,0
2,Post about a video claims that it is a protest...,0
3,All deaths by respiratory failure and pneumoni...,0
4,The dean of the College of Biologists of Euska...,0


In [51]:
def pad_df(df, batches):
    len_positive = len(df[df['outcome'] == 1])
    len_negative = len(df[df['outcome'] == 0])

    pad_positve = batches - len_positive % batches
    pad_negative = batches - len_negative % batches

    df = pd.concat([df, df[df['outcome'] == 1].sample(n=pad_positve, replace=True)], axis=0)
    df = pd.concat([df, df[df['outcome'] == 1].sample(n=pad_negative, replace=True)], axis=0)
    return df

In [52]:
def train_test_split(df, test_size=0.2):
    positive = df[df['outcome'] == 1]
    negative = df[df['outcome'] == 0]

    n_pos = len(positive)
    n_neg = len(negative)

    negative_test = negative.iloc[:int(n_neg * test_size)]
    negative_train = negative.iloc[int(n_neg * test_size):]
    positive_test = positive.iloc[:int(n_pos * test_size)]
    positive_train = positive.iloc[int(n_pos * test_size):]

    train_df = pd.concat([positive_train, negative_train])
    test_df = pd.concat([positive_test, negative_test])
    return train_df, test_df
 
def write_csvs(df, folder, output_path, batches):
    if not os.path.exists(output_path + '/' + folder):
        os.makedirs(output_path + '/' + folder)

    batch_size = len(df) // batches
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        batch.to_csv(f'{output_path}/{folder}/file{i}.csv', index=False)

def write_files_to_s3(output_path, bucket_name):
    !aws s3 rm --recursive s3://$bucket_name/$output_path
    !aws s3 cp --recursive $output_path s3://$bucket_name/$output_path

In [53]:
train_df, test_df = train_test_split(df)
train_len, test_len = len(train_df), len(test_df)
write_csvs(train_df, 'training', OUTPUT_PATH, BATCHES)
write_csvs(test_df, 'testing', OUTPUT_PATH, BATCHES)

write_files_to_s3(OUTPUT_PATH, BUCKET_NAME)

delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1008.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1323.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file0.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1134.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1197.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file126.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1449.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1071.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1512.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1386.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1260.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1575.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1638.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1764.csv
delete: s3://xy-mp-pipeline/data/covid-csv/testing/file1701.csv
delete: s3://xy-mp-pipeline/data/covid-csv/t

In [54]:
train_len, test_len

(8162, 2039)

## Create benchmarking model

In [55]:
from torchdata.datapipes.iter import IterableWrapper, IterDataPipe
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

class TextDataset(IterDataPipe):
    def __init__(self, s3_urls, tokenizer, length):
        super().__init__()
        self.tokenizer = tokenizer
        self.url_wrapper = s3_urls
        self.len = length

    def __iter__(self):
        for _, file in self.url_wrapper.load_files_by_s3():
            temp = pd.read_csv(file)
            label = torch.from_numpy(temp['outcome'].values)
            embedded = [self.tokenizer(t, padding='max_length', max_length=100, truncation=True, return_tensors='pt') for t in temp['headlines']]

            input_ids = torch.cat([e['input_ids'] for e in embedded], dim=0)
            attention_mask = torch.cat([e['attention_mask'] for e in embedded], dim=0)
            yield input_ids, attention_mask, label

    def __len__(self):
        return self.len



In [56]:
from transformers import BertModel

class FakeNewsClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = torch.nn.Dropout(0.25)
        self.linear = torch.nn.Linear(768, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return self.sigmoid(linear_output)
    

In [57]:
def train_model(model: torch.nn.Module, train_data_url: str, test_data_url: str, train_len: int, test_len:int, epochs: int, lr: float):
    # Prepare dataloaders
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    train_s3_url = IterableWrapper([train_data_url]).list_files_by_s3().shuffle().sharding_filter()
    test_s3_url = IterableWrapper([test_data_url]).list_files_by_s3().shuffle().sharding_filter()

    train_df = TextDataset(train_s3_url, tokenizer, train_len)
    test_df = TextDataset(test_s3_url, tokenizer, test_len)

    train_loader = DataLoader(train_df, batch_size=1, shuffle=True, collate_fn=lambda x: x)
    test_loader = DataLoader(test_df, batch_size=1, shuffle=True, collate_fn=lambda x: x)

    # Config device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Config optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_function = torch.nn.BCELoss()

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Train
    for epoch in range(epochs):
        training_loss = 0.0
        training_acc = 0.0

        for data in train_loader:
            input_ids, mask, label = data[0]
            label = label.unsqueeze(1).to(device)
            mask = mask.to(device)
            input_ids = input_ids.squeeze(1).to(device)

            output = model(input_ids, mask)

            loss = loss_function(output, label.float())
            training_loss += loss.item()

            # get acc of signmoid output
            acc = (output[0].round() == label).sum().item()
            training_acc += acc

            model.zero_grad()
            loss.backward()
            optimizer.step()

        validation_loss = 0.0
        validation_acc = 0.0

        with torch.no_grad():
            for data in test_loader:
                input_ids, mask, label = data[0]
                label = label.unsqueeze(1).to(device)
                mask = mask.to(device)
                input_ids = input_ids.squeeze(1).to(device)

                output = model(input_ids, mask)

                loss = loss_function(output, label.float())
                validation_loss += loss.item()

                acc = (output[0].round() == label).sum().item()
                validation_acc += acc
        print(f'Epoch: {epoch+1}/{epochs} | Training loss: {training_loss/len(train_loader):.3f} | Training acc: {training_acc/len(train_loader):.3f} | Validation loss: {validation_loss/len(test_loader):.3f} | Validation acc: {validation_acc/len(test_loader):.3f}')

In [58]:
EPOCHS = 5
model = FakeNewsClassifier()
LR = 5e-6
TRAIN_S3_URL = f's3://{BUCKET_NAME}/{OUTPUT_PATH}/training/'
TEST_S3_URL = f's3://{BUCKET_NAME}/{OUTPUT_PATH}/testing/'
MODEL_OUTPUT_PATH = 'assets/model'

train_model(model, TRAIN_S3_URL, TEST_S3_URL, train_len, test_len, EPOCHS, LR)
# same model 
torch.save(model.state_dict(), MODEL_OUTPUT_PATH + '/baseline.pth')
print('Model saved to:  ', MODEL_OUTPUT_PATH)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 1/5 | Training loss: 0.001 | Training acc: 0.922 | Validation loss: 0.004 | Validation acc: 0.954
Epoch: 2/5 | Training loss: 0.001 | Training acc: 0.953 | Validation loss: 0.003 | Validation acc: 0.954
Epoch: 3/5 | Training loss: 0.001 | Training acc: 0.953 | Validation loss: 0.004 | Validation acc: 0.954
Epoch: 4/5 | Training loss: 0.001 | Training acc: 0.953 | Validation loss: 0.004 | Validation acc: 0.954
Epoch: 5/5 | Training loss: 0.000 | Training acc: 0.953 | Validation loss: 0.005 | Validation acc: 0.830
Model saved to:   assets/model


In [61]:
tests = df[df['outcome'] == 1].headlines.values
tests = [tokenizer(t, padding='max_length', max_length=100, truncation=True, return_tensors='pt') for t in tests]
input_ids = torch.cat([e['input_ids'] for e in tests], dim=0)
attention_mask = torch.cat([e['attention_mask'] for e in tests], dim=0)

In [62]:
model(input_ids, attention_mask)

tensor([[0.0365],
        [0.0366],
        [0.0386],
        [0.0505],
        [0.0351],
        [0.0891],
        [0.0552],
        [0.0680],
        [0.0461],
        [0.0467],
        [0.6530],
        [0.0315],
        [0.1040],
        [0.0318],
        [0.0952],
        [0.0628],
        [0.0487],
        [0.0398],
        [0.0303],
        [0.0745],
        [0.0558],
        [0.0318],
        [0.0341],
        [0.0414],
        [0.0325],
        [0.0289],
        [0.0565],
        [0.0258],
        [0.0467],
        [0.0280],
        [0.0202],
        [0.1002],
        [0.0671],
        [0.0407],
        [0.0403],
        [0.0535],
        [0.4472],
        [0.0296],
        [0.0496],
        [0.0241],
        [0.0483],
        [0.0241],
        [0.0352],
        [0.0543],
        [0.0453],
        [0.0360],
        [0.0441],
        [0.0488],
        [0.0294],
        [0.0551],
        [0.0312],
        [0.0489],
        [0.1632],
        [0.0483],
        [0.2091],
        [0