In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers.models import Pooling

# Tokenizer is taken from the stock "roberta-base" checkpoint, while the
# classifier weights come from a local fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Pooling module constructed with an embedding dimension of 768.
pooling_layer = Pooling(768)

# output_hidden_states=True makes the forward pass return per-layer hidden
# states, which encode_text reads to build sentence embeddings.
model = AutoModelForSequenceClassification.from_pretrained(
    "../../data/models/roberta-finetuned-phemernr1-rnr/checkpoint-4810/",
    num_labels=2,
    output_hidden_states=True,
)

# Move the model to the GPU; encode_text sends its inputs to "cuda" as well.
model.to("cuda")
print("Loading done")

Loading done


In [2]:
def encode_text(text):
    """Encode a single text into a pooled sentence embedding.

    The text is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model``, and the last entry of the returned
    ``hidden_states`` is pooled with the module-level ``pooling_layer``.

    Args:
        text: The text to encode; it is passed straight to the tokenizer.

    Returns:
        The ``'sentence_embedding'`` produced by ``pooling_layer``, moved to
        the CPU and converted to a NumPy array (presumably of shape
        ``(1, hidden_size)`` for a single text; confirm against the pooling
        layer in use).
    """
    # Local import: the notebook's import cells do not bring torch into scope.
    import torch

    # truncation=True keeps over-length inputs within the tokenizer's
    # maximum length instead of letting the model fail on them.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    # Follow whatever device the model lives on rather than assuming "cuda".
    encoded = encoded.to(model.device)

    # Inference only: skip building the autograd graph for every text.
    with torch.no_grad():
        outputs = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )

    # Last entry of hidden_states is used as the token embeddings.
    features = {
        'attention_mask': encoded['attention_mask'],
        'token_embeddings': outputs.hidden_states[-1],
        'cls_token_embeddings': None
    }
    pooled = pooling_layer.forward(features)

    return pooled['sentence_embedding'].cpu().detach().numpy()

def encode_and_save(texts, savefile = None):
    """Encode every text and write the embeddings to a comma-separated file.

    Each text is encoded with ``encode_text``; the results are stacked into
    an array reshaped to ``(len(texts), 768)`` and written to
    ``../../data/vectors/<savefile>.txt``.

    Args:
        texts: Sized iterable of texts to encode.
        savefile: Base name (without extension) of the output file. When it
            is missing or empty, a message is printed and nothing is encoded
            or written.

    Returns:
        None.
    """
    if not savefile:
        print("Please define savefile")
        return

    encoded_rows = [encode_text(text) for text in texts]

    # One row per input text, 768 columns per row.
    matrix = np.array(encoded_rows).reshape((len(texts), 768))

    target_path = f"../../data/vectors/{savefile}.txt"
    with open(target_path, "wb") as handle:
        np.savetxt(handle, matrix, delimiter=',', fmt='%s')

In [3]:
import pandas as pd
import numpy as np

# Load the dataset CSV; rows are split on "\n" only (explicit lineterminator).
data = pd.read_csv("../../data/phemernr1_dataset_with_tvt.csv", lineterminator="\n")
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,tweet_text,topic,label,tvt,cv_fold,tvt2
0,BOMBSHELL: #Ferguson chief says the police off...,ferguson,rumours,test,3,validation
1,It appears that #Ferguson PD are trying to ass...,ferguson,rumours,training,1,training
2,"All weekend ppl will be talking about the ""rob...",ferguson,rumours,test,3,training
3,Why would the officer tell #MikeBrown to get o...,ferguson,rumours,test,3,validation
4,Michael Brown is the 17 yr old boy who was sho...,ferguson,rumours,training,0,training


In [4]:
# Display the (rows, columns) shape of the loaded dataset.
data.shape

(5802, 6)

In [5]:
# Base name of the output file; encode_and_save writes it under
# ../../data/vectors/ with a .txt extension.
savefile = "Phemernr1-RNR_RoBERTa_base_finetuned_vectors"
# Every value of the tweet_text column, as a plain Python list.
texts = data['tweet_text'].tolist()

# Encodes the texts one at a time and writes all embeddings to disk.
encode_and_save(texts, savefile)

In [6]:
# Read the embeddings written by encode_and_save back from the
# comma-delimited text file, then display their shape.
vectors = np.loadtxt(f"../../data/vectors/{savefile}.txt", delimiter=",")
vectors.shape

(5802, 768)

In [7]:
# Load the comma-delimited numeric array stored in
# phemernr1_dataset_features_vectors.csv and display its shape.
features = np.loadtxt(f"../../data/phemernr1_dataset_features_vectors.csv", delimiter=",")
features.shape

(5802, 39)

In [8]:
# Append a single all-zero column to the right of the feature matrix.
# NOTE: `features` is reassigned from itself, so re-running this cell adds
# one more zero column each time.
zeros = np.zeros((len(features), 1))
features = np.concatenate((features, zeros), axis=1)
features.shape

(5802, 40)

In [9]:
# Repeat the feature columns 4 times side by side (reps=(1, 4) leaves the
# row count unchanged).
# NOTE: `features` is reassigned from itself, so re-running this cell
# multiplies the width by 4 again.
features = np.tile(features, (1, 4))
features.shape

(5802, 160)

In [10]:
# Join the text embeddings and the tiled features column-wise (axis=1), so
# each row holds the embedding followed by the features of the same sample.
fvectors = np.concatenate((vectors, features), axis=1)
fvectors.shape

(5802, 928)

In [11]:
# Write the combined matrix as comma-separated text; fmt='%s' formats each
# value through Python's %s string formatting.
with open(f"../../data/vectors/Phemernr1_with_features-RNR_RoBERTa_base_finetuned_vectors.txt", "wb") as f:
    np.savetxt(f, fvectors, delimiter=',', fmt='%s')

In [12]:
# Read the combined matrix back from disk and display its shape as a check
# on the file that was just written.
fvectors = np.loadtxt(f"../../data/vectors/Phemernr1_with_features-RNR_RoBERTa_base_finetuned_vectors.txt", delimiter=",")
fvectors.shape

(5802, 928)