In [2]:
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from huggingface_hub import login
import numpy as np
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
API_TOKEN = "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"
login(token=API_TOKEN)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=True
)

dataset = load_dataset("liar")

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\William\.cache\huggingface\token
Login successful


In [None]:
# Hyperparameters
batch_size = 32

In [3]:
class CustomDataset(Dataset):
    """
    CustomDataset is a class for creating a dataset in PyTorch, inheriting from the PyTorch Dataset class.
    This dataset is designed to handle tabular data provided as pandas DataFrames.

    Attributes:
        features (pd.DataFrame): A DataFrame containing the features of the dataset.
        labels (pd.Series or pd.DataFrame): A Series or DataFrame containing the labels of the dataset.
    Methods:
        __getitem__(self, index): Returns the features and label for a given index.
        __len__(self): Returns the total number of samples in the dataset.
    """
    def __init__(self, features, labels):
        """
        Parameters:
            features (pd.DataFrame): The features of the dataset.
            labels (pd.Series or pd.DataFrame): The labels of the dataset.
        """
        self.features = pd.DataFrame(features)
        self.labels = pd.DataFrame(labels)

    def __getitem__(self, index):
        """
        Parameters:
            index (int): The index of the item to retrieve.
        Returns:
            tuple: A tuple containing the features as a numpy array and the label.
        """
        features = self.features.iloc[index].to_numpy()
        label = [self.labels.iloc[index]]
        return features, label

    def __len__(self):
        """
        Returns:
            int: The total number of samples.
        """
        return len(self.features)

def tokenize(data):
    return tokenizer(data["statement"], truncation=True, max_length=512, padding=True)

In [5]:
train = dataset["validation"] # for now, cuz it's smaller

sentiments_list = []

for statement in tqdm(train["statement"]):
    scores = distilled_student_sentiment_classifier(statement)[0]
    sentiments = [sentiment["score"] for sentiment in scores]
    sentiments_list.append(sentiments)

train = train.add_column("sentiment", sentiments_list)

print(train)

tokenized_dataset = train.map(tokenize, batch_size=batch_size, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'sentiment'])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

print(next(iter(train_dataloader)).keys())

  0%|          | 0/1284 [00:00<?, ?it/s]

100%|██████████| 1284/1284 [01:42<00:00, 12.50it/s]


1284 1284
Dataset({
    features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'sentiment'],
    num_rows: 1284
})


Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [7]:
# custom NN model with BERT embeddings

class BERTClassifier(torch.nn.Module):
    def __init__(self, num_classes):
        super(BERTClassifier, self).__init__()
        self.requires_grad_(False)
        self.bert = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
        self.proj_size = 20
        self.hidden_size = 100
        self.lstm = torch.nn.LSTM(input_size=768, hidden_size=self.hidden_size, num_layers=2, batch_first=True, bidirectional=False, proj_size=self.proj_size)
        self.classifier = torch.nn.Linear(self.proj_size+3, num_classes)

    def forward(self, input_ids, attention_mask, sentiment):
        # dummy forward pass, not real architecture
        outputs = self.bert(input_ids, attention_mask).last_hidden_state
        outputs = self.lstm(outputs)[0][:,-1]
        # insert classification layers here
        # surprisal, sentiment, etc.
        outputs = self.classifier(torch.cat((outputs, sentiment), dim=1))
        return outputs

In [9]:
# simple training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loss_fn = nn.CrossEntropyLoss()
model = BERTClassifier(6).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

for i in range(100):
    model.train()
    losses = []
    predictions = []
    targets = []
    total = 0
    correct = 0
    for i, batch in tqdm(enumerate(train_dataloader)):
        batch.to(device)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        sentiment = batch["sentiment"]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, sentiment)
        loss = loss_fn(outputs, labels)
        loss.backward() # this is not working
        optimizer.step()
        losses.append(loss.item())

        predictions.extend(outputs.detach().argmax(dim=1))
        targets.extend(labels)
        # for sample in zip(batch["labels"], outputs.detach().argmax(dim=1)):
        #     total += 1
        #     if sample[0] == sample[1]:
        #         correct += 1
        batch.to('cpu')
    total = len(targets)
    correct = np.sum(np.array(predictions) == np.array(targets))
    print(correct/total*100, np.mean(losses))
    print(predictions)
model.to('cpu')

11it [01:43,  9.38s/it]


KeyboardInterrupt: 

In [None]:


# input = "this is a sample input"


# # send input to tensor
# tokenized_input = tokenizer(input, return_tensors='pt').to(device)
# print("tokenize input")
# print(tokenized_input)
# embeddings = BERT(**tokenized_input)[0]
# print("get bert embeddings")
# print("\t", embeddings.shape)
# suprisal_values = torch.Tensor(np.random.uniform(0, 1, (1, embeddings.shape[1]))).to(device)
# print("get suprisal values")
# print("\t", suprisal_values.shape)
# input_features = torch.cat((embeddings, suprisal_values.unsqueeze(2)), dim=2)
# print("add suprisial values to embeddings")
# print("\t", input_features.shape)
# input_size = input_features.shape[2]

# hidden_size = 100
# dropout = 0
# classes = 2
# num_layers = 1

# lstm_layer = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, bidirectional=False,
#                                   num_layers=num_layers, batch_first=True, dropout=dropout, proj_size=1).to(device)
# lstm_output = lstm_layer(input_features)[0].squeeze(2)
# print("run input through lstm")
# print("\t", lstm_output.shape)
# sentiment_score = torch.Tensor(np.random.uniform(0, 1, (1, 3))).to(device)
# print("run input through sentiment classifier")
# print("\t", sentiment_score.shape)

# # add sentiment score to lstm output
# combined_output = torch.cat((lstm_output, sentiment_score), dim=1)
# print("add sentiment score to lstm output")
# print("\t", combined_output.shape)

# linear_layer = torch.nn.Linear(combined_output.shape[1], classes).to(device)
# linear_output = linear_layer(combined_output)
# print("run combined output through linear layer")
# print("\t", linear_output.shape)
# softmax = torch.nn.Softmax(dim=1)
# probabilities = softmax(linear_output)
# print("get probabilities")
# print("\t", probabilities)
# prediction = torch.argmax(probabilities, dim=1)
# print("get prediction")
# print(f"label:", prediction.item())
