In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Name of the pretrained checkpoint. The same name is passed to both
# from_pretrained calls, so the tokenizer and the encoder come from one source.
model_name = "monologg/distilkobert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Training split. Only its 'Text' column is read later in this cell.
# NOTE(review): the path is relative to the kernel's working directory.
df = pd.read_csv('train_listener_text_data.csv') 

def encode_sentences(sentences, tokenizer, model, batch_size=32):
    """Return per-token embeddings for ``sentences`` as a single NumPy array.

    The sentences are tokenized together in one call, so every row is padded
    to the same sequence length. The forward pass is then run over slices of
    at most ``batch_size`` rows instead of the whole data set at once, which
    bounds peak memory during inference while keeping the output shape the
    same as a single full-batch pass.

    Args:
        sentences: Text input accepted by ``tokenizer`` (the notebook passes a
            list of strings).
        tokenizer: Callable invoked as
            ``tokenizer(sentences, padding=True, truncation=True,
            return_tensors='pt')`` that returns a mapping of batch-first
            tensors.
        model: Callable that accepts those tensors as keyword arguments and
            whose output, indexed with ``[0]``, is the token-level tensor of
            shape (rows, sequence_length, hidden_size).
        batch_size: Maximum number of rows per forward pass. ``None`` or a
            non-positive value runs everything in one pass.

    Returns:
        numpy.ndarray of shape (rows, sequence_length, hidden_size).
    """
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Count rows from the tensors rather than len(sentences): the tokenizer
    # decides how many rows an input produces (e.g. a single string is one row).
    total_rows = next(iter(encoded_input.values())).shape[0]
    if batch_size is None or batch_size <= 0:
        batch_size = max(total_rows, 1)

    # Models that expose ``.device`` get their inputs moved there; any other
    # callable receives the tensors unchanged.
    device = getattr(model, 'device', None)

    chunks = []
    with torch.no_grad():
        for start in range(0, total_rows, batch_size):
            stop = start + batch_size
            batch = {key: tensor[start:stop] for key, tensor in encoded_input.items()}
            if device is not None:
                batch = {key: tensor.to(device) for key, tensor in batch.items()}
            model_output = model(**batch)
            # Index 0 holds the embeddings for each token:
            # (batch_rows, sequence_length, hidden_size).
            # .cpu() is required before .numpy() when the model is not on CPU.
            chunks.append(model_output[0].cpu().numpy())

    # Every chunk shares sequence_length and hidden_size, so stacking along
    # the row axis rebuilds the full (rows, sequence_length, hidden_size) array.
    return np.concatenate(chunks, axis=0)

# Embed every row of the 'Text' column of `df`, which at this point in the
# cell is the training CSV loaded above.
token_embeddings = encode_sentences(df['Text'].tolist(), tokenizer, model)
# Shape is (batch_size, sequence_length, hidden_size); sequence_length comes
# from the tokenizer's padding within this split, so other splits may differ.
print(token_embeddings.shape)  # The result is in the form of (batch_size, sequence_length, hidden_size)

In [None]:
# Validation split.
# NOTE(review): this rebinds `df`, so the previously loaded frame is no longer
# reachable under that name; running cells out of order will encode whichever
# split `df` currently points to.
df = pd.read_csv('val_listener_text_data.csv')

# NOTE(review): despite the `sentence_` prefix, encode_sentences returns
# per-token embeddings (batch_size, sequence_length, hidden_size), not one
# vector per sentence.
sentence_embeddings_val = encode_sentences(df['Text'].tolist(), tokenizer, model)

In [None]:
# Test split.
# NOTE(review): this rebinds `df` again; after this cell `df` refers to the
# test frame only.
df = pd.read_csv('test_listener_text_data.csv')

# NOTE(review): despite the `sentence_` prefix, encode_sentences returns
# per-token embeddings (batch_size, sequence_length, hidden_size), not one
# vector per sentence.
sentence_embeddings_test = encode_sentences(df['Text'].tolist(), tokenizer, model)

In [None]:
# Fixed length of the sequence axis; passed as max_seq_len to the
# pad_or_truncate calls at the end of this cell for all three splits.
# NOTE(review): the rationale for 210 is not visible in this notebook —
# confirm it matches what the downstream consumer of these features expects.
MAX_SEQ_LEN = 210

def pad_or_truncate(features, max_seq_len):
    """Force every sample in ``features`` to exactly ``max_seq_len`` steps.

    Samples longer than ``max_seq_len`` keep their first ``max_seq_len``
    steps; shorter samples are zero-padded at the end. The result is always
    float32, whichever of the two cases applied to a given sample.

    The work is done with a single preallocated NumPy array instead of one
    Keras ``pad_sequences`` call per sample followed by ``np.array`` on a
    list, so no TensorFlow call is needed and samples are copied only once.

    Args:
        features: Array or iterable of samples. Each sample is indexed as
            (sequence_length, ...) and all samples must share the same
            trailing dimensions; sequence lengths may differ between samples.
        max_seq_len: Target length of the sequence axis.

    Returns:
        numpy.ndarray of dtype float32 with shape
        (num_samples, max_seq_len, ...).

    Raises:
        ValueError: If a sample's trailing dimensions differ from those of
            the first sample (raised by NumPy on assignment).
    """
    # Materialize generic iterables so they can be measured and indexed;
    # ndarrays are used directly to avoid an extra copy.
    samples = features if isinstance(features, np.ndarray) else list(features)

    if len(samples) == 0:
        # No sample to infer trailing dimensions from; use the array's own
        # shape when it has one.
        trailing = tuple(features.shape[2:]) if isinstance(features, np.ndarray) else ()
        return np.zeros((0, max_seq_len) + trailing, dtype='float32')

    # Everything after the sequence axis (e.g. hidden_size) is taken from the
    # first sample.
    trailing = np.asarray(samples[0]).shape[1:]

    # Zero-initialized output: positions not overwritten below are the padding.
    unified = np.zeros((len(samples), max_seq_len) + trailing, dtype='float32')
    for row, sample in enumerate(samples):
        sample = np.asarray(sample)
        # Truncate long samples to the leading steps; short samples fill only
        # the leading part of their row.
        keep = min(len(sample), max_seq_len)
        unified[row, :keep] = sample[:keep]
    return unified

# Bring the training, validation, and test embeddings to MAX_SEQ_LEN on the
# sequence axis. Each split was tokenized separately, so their incoming
# sequence lengths may differ; after this step they all use MAX_SEQ_LEN.
train_text_features = pad_or_truncate(token_embeddings, MAX_SEQ_LEN)
val_text_features = pad_or_truncate(sentence_embeddings_val, MAX_SEQ_LEN)
test_text_features = pad_or_truncate(sentence_embeddings_test, MAX_SEQ_LEN)