# Convert Dataset from text to embeddings

In [1]:
# Module imports
import os

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import pandas as pd

In [2]:
# Setting up environment variables
# Python does not expand shell variables inside string literals, so the home
# directory is resolved explicitly here instead of storing a literal '$HOME'.
# NOTE(review): the Hugging Face libraries appear to read HF_HOME when they are
# imported, and the imports run in the cell above — confirm whether this
# assignment takes effect, and consider moving it before those imports.
os.environ['HF_HOME'] = os.path.expanduser('~/tesis/00_src/run/models')

In [3]:
# Global variables

# Dataset identifier handed to load_dataset
DATASET_NAME = 'Rivert97/ug-normativity'

# Active configuration: directory that receives the CSV files and the name of
# the embeddings model handed to SentenceTransformer
OUT_DIR = '/home/rgarcia/tesis/00_src/normativity-rag/outs/dataset/multi_qa_distilbert'
MODEL_NAME = 'multi-qa-distilbert-cos-v1'

# Alternative configuration (disabled)
#MODEL_NAME = 'multi-qa-mpnet-base-dot-v1'
#OUT_DIR = '/home/rgarcia/tesis/00_src/normativity-rag/outs/dataset/multi_qa_mpnet'

In [4]:
# Loading embeddings model
# No explicit device is passed, so SentenceTransformer selects one itself
# (a GPU when one is usable, otherwise the CPU). A hard-coded 'cuda' makes this
# cell fail on machines without a CUDA device.
model = SentenceTransformer(MODEL_NAME)

In [5]:
# Loading the questions
# Only the 'train' split of the loaded dataset is kept; the bare expression on
# the last line displays it as the cell output.
dataset = load_dataset(DATASET_NAME)['train']
dataset

Dataset({
    features: ['id', 'title', 'context', 'additional_context', 'question', 'answers'],
    num_rows: 1836
})

In [6]:
# Converting questions to embeddings
# One CSV file is written per distinct title. Each file holds the encoded
# questions of that title, indexed by the corresponding 'id' values.

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before the encoding work starts.
os.makedirs(OUT_DIR, exist_ok=True)

for title in sorted(set(dataset['title'])):
    # Rows of the dataset whose title matches the current one
    filtered = dataset.filter(lambda row: row['title'] == title)

    # Presumably one embedding vector per question (questions x dimensions) — TODO confirm
    embeddings = model.encode(filtered['question'])

    df = pd.DataFrame(embeddings)
    df.index = filtered['id']

    # NOTE(review): the title is used verbatim as the file name; confirm that
    # titles never contain path separators or other characters invalid in file names.
    df.to_csv(os.path.join(OUT_DIR, f"{title}.csv"), sep=',')