In [1]:
from transformers import AutoTokenizer, AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn.functional as F
from torch import nn
from torch.optim import AdamW, SGD
import torch

# Define model

In [2]:
# Name of the pretrained checkpoint; used to load both the base model and its tokenizer.
checkpoint = "distilbert-base-uncased"

In [3]:
class Lambda(nn.Module):
    """Wrap a plain callable so it can sit in a module pipeline such as ``nn.Sequential``.

    The callable is stored on the instance as ``func`` and applied to whatever
    the layer receives.
    """

    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        """Apply the wrapped callable to ``x`` and return its result."""
        result = self.func(x)
        return result

In [4]:
def print_shape(x):
    """Debugging pass-through: print ``x.shape`` and hand ``x`` back untouched."""
    shape = x.shape
    print(shape)
    return x

def flatten(x):
    """Collapse every dimension after the first into a single dimension.

    Args:
        x: Tensor whose first dimension is preserved.

    Returns:
        A tensor of shape ``(x.shape[0], -1)`` holding the same elements.
    """
    # reshape() gives the same result as view() for contiguous input, but also
    # works on non-contiguous tensors (e.g. after transpose), where view() raises.
    return x.reshape(x.shape[0], -1)

In [5]:
class RegressiveTransformer(nn.Module):
    """Pretrained transformer with a regression head that yields one value per sequence.

    The head maps each token's hidden vector to a single number, then combines
    those per-token numbers with a linear layer over the token axis. Because
    that last layer has ``num_tokens`` inputs, every batch passed to
    ``forward`` must have exactly ``num_tokens`` tokens per sequence.
    """

    # Fallback width of the base model's hidden states, used when the loaded
    # model's config does not expose ``hidden_size``.
    TRANSFORMER_HIDDEN_SIZE = 768
    # Width of the head's intermediate per-token layer.
    LINEAR_HIDDEN_SIZE = 500

    def __init__(self, num_tokens):
        """Load the base model named by the global ``checkpoint`` and build the head.

        Args:
            num_tokens: Sequence length (second dimension of ``input_ids``) that
                every batch given to ``forward`` must have.
        """
        super().__init__()
        self.base_model = AutoModel.from_pretrained(checkpoint)
        # Prefer the width reported by the loaded model so the head still fits
        # if ``checkpoint`` is changed; fall back to the class constant.
        hidden_size = getattr(
            self.base_model.config, "hidden_size", self.TRANSFORMER_HIDDEN_SIZE
        )
        self.head = nn.Sequential(
            # Applied to the last dimension, i.e. independently per token.
            nn.Linear(hidden_size, self.LINEAR_HIDDEN_SIZE),
            nn.ReLU(),
            nn.Linear(self.LINEAR_HIDDEN_SIZE, 1),
            # (batch, num_tokens, 1) -> (batch, num_tokens)
            Lambda(flatten),
            # Combine the per-token values into one value per sequence.
            nn.Linear(num_tokens, 1),
            # No-op on a (batch, 1) input; kept so layer positions are unchanged.
            Lambda(flatten),
            # Drop only the trailing size-1 dimension. A bare squeeze() would
            # also drop the batch dimension when the batch holds one sequence,
            # producing a 0-d tensor that no longer lines up with the labels.
            Lambda(lambda x: x.squeeze(-1)),
        )
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the base model and the head, optionally computing the MSE loss.

        Args:
            input_ids: Token ids; the head requires the sequence dimension to
                equal the ``num_tokens`` given at construction.
            attention_mask: Mask forwarded to the base model.
            labels: Optional regression targets, one per sequence. When given,
                the mean squared error against the predictions is returned.

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` holding one predicted
            value per sequence and ``loss`` set to the MSE, or ``None`` when no
            labels were supplied.
        """
        # NOTE: the attention mask is only used by the base model; the head
        # still sees the hidden states at padded positions.
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        # The first element of the base model's output is fed to the head; the
        # head's layers require it to be (batch, num_tokens, hidden).
        outputs = self.head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss(outputs, labels)

        return SequenceClassifierOutput(loss=loss, logits=outputs)

    def freeze_base(self, freeze=True):
        """Enable or disable gradient tracking for the base model's parameters.

        Args:
            freeze: ``True`` (default) sets ``requires_grad`` to ``False`` on
                every base-model parameter; ``False`` turns it back on. The
                head's parameters are not touched.
        """
        for param in self.base_model.parameters():
            param.requires_grad = not freeze

# Test Model

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Two toy sentences, just enough to push a small batch through the model.
raw_inputs = ["Left left left", "Right right right"]

# Tokenize both sentences as one padded/truncated batch of PyTorch tensors.
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Inspect the tokenizer output for the toy batch.
inputs

In [None]:
# Sequence length of the toy batch; the model's head is sized to this value.
num_tokens = inputs['input_ids'].size(1)

In [None]:
# Show the shape of the token id tensor; its second entry is the num_tokens used above.
inputs['input_ids'].shape

In [None]:
# Smoke test: build the model for the toy batch's sequence length and run one forward pass.
model = RegressiveTransformer(num_tokens)

outputs = model(**inputs)
print(outputs)
# `outputs` is a SequenceClassifierOutput, which has no `.size()` method; the
# tensor holding the per-sentence predictions is `outputs.logits`.
print(outputs.logits.size())

Alright, we started with a batch of our sentence inputs, and we now have a continuous value for each.

Next, we need some data to fine-tune on.

# Data

- Alright, we have a csv with all this data
- We also need to bring in the media bias ratings

- Then we want a way to read from the CSV and generate as much training data as we want
- Let's look at some other datasets to gauge the rough size you'd use to fine-tune BERT
  - Looks like 10-100k rows

- I could preprocess this into a new CSV file with all the info condensed
- Validation set should include articles from publications not in the train set
- Evaluate accuracy based on whether the predicted result is within some delta of the labelled value
  - Maybe delta starts pretty large, say 0.5 or 0.25

What data set do we need to establish the pipeline?

CSV of article => rating

# Fine tune

Next steps:

- [ ] Prepare real data CSV
- [ ] Set up DataLoaders

In [None]:
import pandas as pd

In [None]:
# Load the trial data; later cells read its `title`, `article` and `rating` columns.
articles = pd.read_csv('data/trial.csv')

In [None]:
# One text per article: title and body joined by a single space.
sequences = list(articles.title + ' ' + articles.article)
# Convert through NumPy so the result does not depend on how torch indexes a
# pandas Series (which is label-based), and request float32 directly, the
# same dtype `.float()` produced.
ratings = torch.tensor(articles.rating.to_numpy(dtype="float32"))

In [None]:
# Tokenize every article in one call, giving a single padded/truncated batch of PyTorch tensors.
batch = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Store the targets under `labels` so `model(**batch)` passes them to forward() and a loss is computed.
batch['labels'] = ratings

In [None]:
# Inspect the tokenized batch, now including the labels.
batch

In [None]:
# Sequence length of the article batch; the model's head must be built for this value.
num_tokens = batch['input_ids'].size(1)

In [None]:
# Fresh model whose head is sized for the article batch's sequence length.
model = RegressiveTransformer(num_tokens)

In [None]:
# Turn off requires_grad on the pretrained base so only the head's parameters receive gradients.
model.freeze_base()

In [None]:
# Plain SGD over the model's parameters; the AdamW line is an alternative left disabled.
#optimizer = AdamW(model.parameters(), lr=0.0001)
optimizer = SGD(model.parameters(), lr=0.0001)

In [None]:
# Predictions recorded at each training step, collected for plotting afterwards.
all_results = []

In [None]:
# Single forward pass over the whole batch; labels are included, so the loss is computed too.
results = model(**batch)

In [None]:
# Display the loss from the forward pass above.
results.loss

In [None]:
# Full-batch gradient descent for 100 steps, recording the predictions at every step.
for i in range(100):
    # Clear gradients left over from the previous step: backward() adds into
    # .grad, so without this each update would use the sum of all earlier gradients.
    optimizer.zero_grad()
    results = model(**batch)
    all_results.append(results.logits.tolist())
    results.loss.backward()
    optimizer.step()

In [None]:
import matplotlib.pyplot as plt
# Each row of all_results holds the predictions recorded at one training step,
# so this draws how the predictions move as training proceeds.
plt.plot(all_results)
plt.xlabel("Training step")
plt.ylabel("Predicted rating")
plt.title("Predictions during fine-tuning")
plt.show()

In [None]:
# Forward pass after the training loop, to inspect the resulting loss and predictions.
model(**batch)