In [1]:
import os

import numpy as np
import torch
from sentence_transformers.models import Pooling
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pooling head that collapses per-token embeddings into one sentence vector.
# 768 is passed as the word-embedding dimension (presumably the hidden size of
# the encoder loaded below -- TODO confirm against the checkpoint config).
pooling_layer = Pooling(768)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# output_hidden_states=True makes the forward pass return the per-layer hidden
# states that encode_text() reads; num_labels=6 sizes the classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "../../data/models/roberta-finetuned-liarpantsfire/checkpoint-10750/",
    output_hidden_states=True, num_labels=6)

# Use the GPU when one is present, otherwise stay on the CPU so that loading
# does not fail on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")
# The model is only used for inference here; make eval mode explicit.
model.eval()
print("Loading done")

Loading done


In [2]:
def encode_text(text):
    """Encode `text` into a pooled sentence embedding.

    The text is tokenized with the module-level `tokenizer` (truncation
    enabled), capped at the first 510 token positions, passed through the
    module-level `model`, and the last hidden-state layer is pooled with the
    module-level `pooling_layer`.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        numpy.ndarray holding the pooling layer's 'sentence_embedding'
        (presumably shape (1, 768) for a single string -- TODO confirm).
    """
    # Follow the model's device instead of hard-coding "cuda".
    device = model.device
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

    # Keep at most the first 510 positions; slicing is a no-op on shorter inputs.
    # NOTE(review): for longer inputs this drops the trailing tokens, presumably
    # including the closing special token -- confirm this is intended.
    input_ids = inputs['input_ids'][:, :510]
    attention_mask = inputs['attention_mask'][:, :510]

    # Inference only: skip autograd bookkeeping so no graph is kept per text.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

        features = {
            'attention_mask': attention_mask,
            'token_embeddings': outputs.hidden_states[-1],
            'cls_token_embeddings': None
        }
        pooled = pooling_layer(features)

    return pooled['sentence_embedding'].detach().cpu().numpy()

def encode_and_save(texts, savefile = None):
    """Encode every text and write the vectors to a comma-separated text file.

    The output goes to ../../data/processed/vectors/<savefile>.txt, one row per
    text with 768 values per row. The directory is created up front if it is
    missing, so a bad path fails before the encoding work is done rather than
    after it.

    Args:
        texts: Iterable of inputs, each passed to encode_text().
        savefile: Output file name without extension. If empty or None, a
            message is printed and nothing is encoded or written.

    Returns:
        None.
    """
    if not savefile:
        print("Please define savefile")
        return

    # Materialize once so generators work and the row count is known.
    texts = list(texts)

    out_dir = "../../data/processed/vectors"
    os.makedirs(out_dir, exist_ok=True)

    vectors = [encode_text(text) for text in texts]
    # Stack the per-text results into a single (n_texts, 768) matrix.
    vectors = np.array(vectors).reshape((len(texts), 768))

    with open(f"{out_dir}/{savefile}.txt", "wb") as f:
        np.savetxt(f, vectors, delimiter=',', fmt='%s')

In [3]:
import pandas as pd
import numpy as np

# Load the processed dataset. The line terminator is pinned to "\n"
# (presumably because some statements contain other line-break characters
# -- TODO confirm).
data = pd.read_csv("../../data/processed/liarpantsfire_dataset.csv", lineterminator="\n")
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,id,statement,label,tvt2,tvt2_1,tvt2_2,tvt2_3
0,2635.json,Says the Annies List political group supports ...,false,validation,validation,training,training
1,10540.json,When did the decline of coal start? It started...,half-true,training,training,validation,training
2,324.json,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true,training,training,validation,validation
3,1123.json,Health care reform legislation is likely to ma...,false,training,validation,testting,training
4,9028.json,The economic turnaround started at the end of ...,half-true,training,training,testting,training


In [4]:
# Sanity check: (rows, columns) of the loaded dataset.
data.shape

(12791, 7)

In [5]:
# Output file name without extension; encode_and_save appends ".txt" and
# writes under ../../data/processed/vectors/.
savefile = "LiarPantsFire_RoBERTa_base_finetuned_vectors"
# One input text per dataset row, taken from the 'statement' column.
texts = data['statement'].tolist()

# Encode every statement and write the vectors to disk (returns nothing).
encode_and_save(texts, savefile)

In [6]:
# Read the saved vectors back to check that the file loads; the shape shown
# below is expected to be (number of statements, 768).
vectors = np.loadtxt(f"../../data/processed/vectors/{savefile}.txt", delimiter=",")
vectors.shape

(12791, 768)