In [1]:
import os
from tqdm import tqdm
from corpus import download_and_unzip, catgeories, read_text_files, corpus_root

# Make sure the corpus is available locally before it is read below; per the
# recorded output, the helper reports when the data was already downloaded
# and extracted.
download_and_unzip()

Already downloaded and extracted!


In [2]:
reviews = []
labels = []

# we can't use the previous tokenizers here, so the raw review text is kept
# label encoding follows the category position: idx 0 -> neg, 1 -> pos
for idx, cat in enumerate(catgeories):
    texts = read_text_files(os.path.join(corpus_root, cat))

    # One progress bar per category; every text receives the category index.
    for text in tqdm(texts, desc="prepare_corpus"):
        reviews.append(text)
        labels.append(idx)

# Blank line first, then both counts on their own lines (they should match).
print()
print(len(reviews), len(labels), sep="\n")

prepare_corpus: 100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<?, ?it/s]
prepare_corpus: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 998643.81it/s]


2000
2000





In [3]:
from transformers import AutoTokenizer
import torch

# Checkpoint name; it is also the default `model_name` argument of SentiBERT
# further down, so the tokenizer and the encoder are loaded from the same
# pretrained model.
model_name = "bert-base-cased"
# Load the tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Passed as `max_length` to the tokenizer call inside PolarityReviewDataset.
MAX_LEN = 512

from sklearn.model_selection import train_test_split

# Both splits use the same ratio and the same seed.
split_options = {"train_size": 0.8, "random_state": 42}

# First split: separate a held-out test set from the full corpus.
x_train, x_test, y_train, y_test = train_test_split(reviews, labels, **split_options)

# Second split: carve the validation set out of the remaining training data.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, **split_options)

In [5]:
import torch
from torch.utils.data import Dataset

# custom dataset
class PolarityReviewDataset(Dataset):
    """Map-style dataset that pairs raw review texts with their polarity labels.

    Reviews are tokenized lazily, one item at a time, when they are indexed.

    Args:
        reviews: indexable collection of review strings.
        labels: indexable collection of labels, aligned with ``reviews``.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer) that returns ``input_ids`` and ``attention_mask``.
        max_len: value passed to the tokenizer as ``max_length``. When left as
            ``None`` the module-level ``MAX_LEN`` is looked up at item access,
            which is what the class did before this parameter existed.

    Raises:
        ValueError: if ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_len=None):
        # Fail early: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader, or silently drop trailing reviews.
        if len(reviews) != len(labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )

        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (review, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and return it with its label.

        Returns:
            dict with the raw ``text``, 1-D ``input_ids`` and
            ``attention_mask`` tensors, and the ``label`` as a tensor.
        """
        review = self.reviews[idx]
        label = self.labels[idx]

        # Resolve the global lazily so datasets built without an explicit
        # max_len keep following MAX_LEN exactly as before.
        max_length = MAX_LEN if self.max_len is None else self.max_len

        # encode review text; the tokenizer is asked to truncate and pad to
        # max_length so every item can be stacked into a batch
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "text": review,
            # return_tensors="pt" adds a leading batch axis; flatten drops it.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding['attention_mask'].flatten(),
            "label": torch.tensor(label)
        }

# Wrap the training and validation splits (in that order) in the custom
# dataset; both share the same tokenizer.
training_dataset, val_dataset = (
    PolarityReviewDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((x_train, y_train), (x_val, y_val))
)

In [6]:
from torch.utils.data import DataLoader
import multiprocessing

# Keep 2 cores free so the system stays responsive, but never drop below zero:
# on machines with two or fewer cores the plain subtraction would give 0 or a
# negative count. NOTE: this value is not passed to the loaders below.
usable_cpu_cores = max(multiprocessing.cpu_count() - 2, 0)

batch_size = 16

# loaders from the custom dataset
# Training batches are reshuffled every epoch.
train_loader = DataLoader(training_dataset, shuffle=True, batch_size=batch_size)
# Validation only measures the loss, so shuffling buys nothing; Lightning also
# recommends turning it off for validation loaders.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [7]:
import torch
from torch.nn import functional as F
from torch import nn

import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule

from transformers import AutoModel


from torch.optim import Adam


class SentiBERT(LightningModule):
    """Pretrained encoder with a one-unit sigmoid head for binary sentiment.

    Args:
        model_name: checkpoint identifier handed to
            ``AutoModel.from_pretrained``; defaults to the notebook-level
            ``model_name``.
    """

    def __init__(self, model_name=model_name):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        # Take the head's input width from the encoder config rather than
        # hard-coding 768, so checkpoints with another hidden size also work.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)  # 1 for binary classification
        self.sigmoid = nn.Sigmoid()

        # BCELoss consumes probabilities, which is what forward() returns.
        self.loss_fn = nn.BCELoss()

    def forward(self, x):
        """Return the positive-class probability for each input sequence.

        Args:
            x: ``(input_ids, attention_mask)`` pair forwarded to the encoder.

        Returns:
            Sigmoid output of the linear head; its last dimension has size 1.
        """
        input_ids, attention_mask = x
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Classify from the encoder's pooled output.
        return self.sigmoid(self.linear(encoded.pooler_output))

    def configure_optimizers(self):
        """Optimize every parameter (encoder and head) with Adam at lr=2e-5."""
        return Adam(self.parameters(), lr=2e-5)

    def _shared_step(self, batch):
        """Compute the binary cross-entropy loss for one batch dict.

        Reads ``input_ids``, ``attention_mask`` and ``label`` from ``batch``.
        """
        probs = self((batch["input_ids"], batch["attention_mask"]))
        # Drop the size-1 head dimension so predictions line up with the
        # labels; dim=1 is given explicitly so a batch of one keeps its batch
        # axis.
        probs = probs.squeeze(dim=1)
        return self.loss_fn(probs, batch["label"].float())

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` under the ``"loss"`` key."""
        return {"loss": self._shared_step(batch)}

    def validation_step(self, batch, batch_idx):
        """Log the validation loss for ``batch`` as ``val_loss``."""
        loss = self._shared_step(batch)

        self.log('val_loss', loss, prog_bar=True)

    
model = SentiBERT()
# Train on one GPU when CUDA is available and fall back to CPU (gpus=0)
# otherwise, so the cell does not abort on a machine without a GPU.
trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0, max_epochs=2)
trainer.fit(model, train_loader, val_loader)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------

Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]