In [20]:
from transformers import AutoModel, DataCollatorWithPadding, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch
from huggingface_hub import login
import numpy as np
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
API_TOKEN = "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"
login(token=API_TOKEN)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=True
)



Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\William\.cache\huggingface\token
Login successful




In [21]:
# Hyperparameters
batch_size = 32

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loss_model = AutoModelForCausalLM.from_pretrained("bert-base-uncased").to(device)
loss_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def add_loss_and_pickle(split):

    
    dataframe = pd.read_pickle(f"./pickle_files/{split}.pkl")
    data = Dataset.from_pandas(dataframe)
    perplexity_list = []

    for sample in tqdm(data):
        lm_out = loss_model(input_ids, attention_mask, output_hidden_states=True)
        outputs = lm_out.hidden_states[-1]
        #print(outputs)
        # print("lm output", outputs.shape, outputs.dtype)
        # print("outputs", outputs)
        #outputs = self.lstm(outputs)[0][:,-1]
        logits = torch.nn.functional.softmax(lm_out.logits, dim=-1)
        #print("logits", logits.shape, logits.dtype)
        probs = torch.gather(logits, dim=2, index=input_ids.unsqueeze(dim=2)).squeeze(-1)
        #print("probs", probs.shape, probs.dtype)
        subword_surp = -1 * torch.log2(probs) * attention_mask
        #print("subword_surp", subword_surp.shape, subword_surp.dtype)
        #print("subword_surp", subword_surp.shape, subword_surp.dtype)
        mean_surprisal = subword_surp.sum(dim=1) / attention_mask.sum(dim=1)
    data = data.add_column("perplexity", perplexity_list)
    dataframe = pd.DataFrame(data)
    print(dataframe.head(10))
    dataframe.to_pickle(f"./data/{split}.pkl")
    return

#add_loss_and_pickle("train")
#add_loss_and_pickle("validation")
#add_loss_and_pickle("test")



If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 10269/10269 [25:33<00:00,  6.70it/s] 


           id  label                                          statement  \
0   2635.json      0  Says the Annies List political group supports ...   
1  10540.json      1  When did the decline of coal start? It started...   
2    324.json      2  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json      0  Health care reform legislation is likely to ma...   
4   9028.json      1  The economic turnaround started at the end of ...   
5  12465.json      3  The Chicago Bears have had more starting quart...   
6   2342.json      4  Jim Dunnam has not lived in the district he re...   
7    153.json      1  "I'm the only person on this stage who has wor...   
8   5602.json      1  However, it took $19.5 million in Oregon Lotte...   
9   9741.json      2  Says GOP primary opponents Glenn Grothman and ...   

                                     subject                 speaker  \
0                                   abortion            dwayne-bohac   
1         energy,history,job-a

In [None]:
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")
print(model.config.hidden_size)
embedding_matrix = model.embeddings.word_embeddings.weight
embedding_matrix = embedding_matrix.detach().numpy()
print(embedding_matrix.shape)   

768
(30522, 768)


In [None]:
def tokenize(data):
    return tokenizer(data["statement"], truncation=True, max_length=512, padding=True)

def dataloader_from_pickle(split):
    dataframe = pd.read_pickle(f"./pickle_files/{split}.pkl")
    dataset = Dataset.from_pandas(dataframe)
    tokenized_dataset = dataset.map(tokenize, batch_size=batch_size, batched=True)
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'sentiment', 'perplexity'])
    return DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

train_dataloader = dataloader_from_pickle("train")
val_dataloader = dataloader_from_pickle("validation")
test_dataloader = dataloader_from_pickle("test")

print(next(iter(train_dataloader)).keys())

Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

dict_keys(['sentiment', 'input_ids', 'attention_mask', 'labels'])


In [None]:
# custom NN model with BERT embeddings

class BERTClassifier(torch.nn.Module):
    def __init__(self, num_classes):
        super(BERTClassifier, self).__init__()
        self.requires_grad_(False)
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.proj_size = 20
        self.hidden_size = 100
        self.lstm = torch.nn.LSTM(input_size=768, hidden_size=self.hidden_size, num_layers=2, batch_first=True, bidirectional=False, proj_size=self.proj_size)
        #self.classifier = torch.nn.Linear(self.proj_size+3, num_classes)
        self.classifier = torch.nn.Linear(768+3, num_classes)
        self.condenser = torch.nn.Linear(768, self.proj_size)

    def forward(self, input_ids, attention_mask, sentiment):
        # dummy forward pass, not real architecture
        outputs = self.bert(input_ids, attention_mask).last_hidden_state
        outputs = torch.mean(outputs, dim=1)
        #outputs = self.condenser(outputs)
        #outputs = self.lstm(outputs)[0][:,-1]
        # insert classification layers here
        # surprisal, sentiment, etc.
        outputs = self.classifier(torch.cat((outputs, sentiment), dim=1))
        return outputs

In [None]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

print(bnb_config.bnb_4bit_compute_dtype)

torch.bfloat16


In [None]:
# simple training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loss_fn = nn.CrossEntropyLoss()
model = BERTClassifier(6).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)

for i in range(10):
    model.train()
    losses = []
    predictions = []
    targets = []
    for i, batch in enumerate(val_dataloader):
        
        batch.to(device)
        
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        sentiment = batch["sentiment"]
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, sentiment)
        loss = loss_fn(outputs, labels)
        loss.backward() # this is not working
        optimizer.step()
        losses.append(loss.item())
        predictions.extend(outputs.detach().argmax(dim=1).to('cpu').tolist())
        targets.extend(labels.to('cpu').tolist())
    print(torch.cuda.max_memory_allocated())
    print(torch.cuda.memory_allocated())
    print(type(predictions), type(targets))
    total = len(targets)
    correct = np.sum(np.array(predictions) == np.array(targets))
    print("acc:", correct/total*100, "loss:", np.mean(losses))


3168100864
1792331776
<class 'list'> <class 'list'>
acc: 18.69158878504673 loss: 2.4384174288772957


KeyboardInterrupt: 

In [None]:


# input = "this is a sample input"


# # send input to tensor
# tokenized_input = tokenizer(input, return_tensors='pt').to(device)
# print("tokenize input")
# print(tokenized_input)
# embeddings = BERT(**tokenized_input)[0]
# print("get bert embeddings")
# print("\t", embeddings.shape)
# suprisal_values = torch.Tensor(np.random.uniform(0, 1, (1, embeddings.shape[1]))).to(device)
# print("get suprisal values")
# print("\t", suprisal_values.shape)
# input_features = torch.cat((embeddings, suprisal_values.unsqueeze(2)), dim=2)
# print("add suprisial values to embeddings")
# print("\t", input_features.shape)
# input_size = input_features.shape[2]

# hidden_size = 100
# dropout = 0
# classes = 2
# num_layers = 1

# lstm_layer = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, bidirectional=False,
#                                   num_layers=num_layers, batch_first=True, dropout=dropout, proj_size=1).to(device)
# lstm_output = lstm_layer(input_features)[0].squeeze(2)
# print("run input through lstm")
# print("\t", lstm_output.shape)
# sentiment_score = torch.Tensor(np.random.uniform(0, 1, (1, 3))).to(device)
# print("run input through sentiment classifier")
# print("\t", sentiment_score.shape)

# # add sentiment score to lstm output
# combined_output = torch.cat((lstm_output, sentiment_score), dim=1)
# print("add sentiment score to lstm output")
# print("\t", combined_output.shape)

# linear_layer = torch.nn.Linear(combined_output.shape[1], classes).to(device)
# linear_output = linear_layer(combined_output)
# print("run combined output through linear layer")
# print("\t", linear_output.shape)
# softmax = torch.nn.Softmax(dim=1)
# probabilities = softmax(linear_output)
# print("get probabilities")
# print("\t", probabilities)
# prediction = torch.argmax(probabilities, dim=1)
# print("get prediction")
# print(f"label:", prediction.item())
