# Generate Embeddings

In [None]:
import pandas as pd
import numpy as np
import faiss
import openai
from openai import OpenAI
import re 
import pickle
import transformers
from transformers import BertTokenizer, BertModel , AutoTokenizer, ErnieModel, AutoModel
import torch
from transformers import RobertaTokenizer, RobertaModel



In [None]:
# Notebook display settings: show every row, allow wide output, and never truncate cell text.
pd.options.display.max_rows = None
pd.options.display.width = 1000
pd.options.display.max_colwidth = None

In [None]:
# Create the OpenAI client without embedding a secret in the notebook: when no
# api_key argument is passed, the client reads the OPENAI_API_KEY environment
# variable. An explicit empty string would be used as-is instead of falling
# back to the environment, so export OPENAI_API_KEY before running this cell.
client = OpenAI()

## Clean text

In [None]:
# Load the knowledge-base table from the working directory and preview the two
# text columns that are cleaned and embedded below.
df_kb = pd.read_csv(r'df_kb_generation.csv')
df_kb[['new_claim','new_expl']].head(10)

In [None]:
# Clean text
# Remove, on each line that ends with ":\r\n", everything up to and including
# that ":\r\n" (presumably a generated preamble before the actual text -- confirm
# against the data).
# The vectorised .str.replace leaves missing values (NaN) untouched, whereas
# applying re.sub to every cell raises TypeError on them.
df_kb['new_claim'] = df_kb['new_claim'].str.replace(r'.*:\r\n', '', regex=True)
df_kb['new_expl'] = df_kb['new_expl'].str.replace(r'.*:\r\n', '', regex=True)

In [None]:
# Clean text
# Remove every occurrence of a triple double-quote followed by CRLF, and then
# every occurrence of CRLF followed by a triple double-quote, in both text columns.
df_kb['new_claim'] = (
    df_kb['new_claim']
    .replace('"""\r\n', '', regex=True)
    .replace('\r\n"""', '', regex=True)
)
df_kb['new_expl'] = (
    df_kb['new_expl']
    .replace('"""\r\n', '', regex=True)
    .replace('\r\n"""', '', regex=True)
)

In [None]:
# Preview the first 10 rows again to inspect the result of the cleaning steps.
df_kb[['new_claim','new_expl']].head(10)

## ADA (OpenAI `text-embedding-3-small`)

In [None]:
# Keep only the claim text and its label, then preview the first 5 rows.
df_claim = df_kb[["new_claim","label"]]
df_claim.head(5)

In [None]:
def get_embedding(input, model="text-embedding-3-small", encoding_format = "float"):
    """Request an embedding from the OpenAI embeddings endpoint.

    Uses the module-level ``client``.

    Args:
        input: Text to embed, forwarded unchanged to the API.
        model: Name of the embedding model to use.
        encoding_format: Encoding requested for the returned vector.

    Returns:
        The ``embedding`` of the first item in the response. Only ``data[0]``
        is returned, so pass one text per call.
    """
    response = client.embeddings.create(
        input=input,
        model=model,
        encoding_format=encoding_format,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Embed every claim (one API request per claim) and collect the results into a
# float32 matrix with one row per claim.
claims = df_claim["new_claim"].tolist()
vectors_list = list(map(get_embedding, claims))
vectors = np.asarray(vectors_list, dtype=np.float32)

In [None]:
# Persist the claim embedding matrix currently held in `vectors`.
with open("embeddings_claims_ADA.pkl", "wb") as f:
    pickle.dump(vectors, f)

In [None]:
# Keep only the explanation text and its label, then preview the first 5 rows.
# NOTE: the "explaination" spelling is kept because the cells below use this name.
df_explaination = df_kb[["new_expl","label"]]
df_explaination.head(5)

In [None]:
# Embed every explanation (one API request per text) and collect the results
# into a float32 matrix with one row per explanation.
explainations = df_explaination["new_expl"].tolist()
vectors_list_expl = list(map(get_embedding, explainations))
vectors_expl = np.asarray(vectors_list_expl, dtype=np.float32)

In [None]:
# Persist the explanation embedding matrix held in `vectors_expl`.
# NOTE(review): this file name is spelled "explaination", unlike the
# "explanation" files written in the later sections -- keep loaders in sync.
with open("embeddings_explaination_ADA.pkl", "wb") as f:
    pickle.dump(vectors_expl,f)

In [None]:
# Build an exact L2 index over the rows of `vectors` (the claim embeddings
# computed above, assuming the cells are run top to bottom).
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
# normalize_L2 rescales each row to unit length IN PLACE, so `vectors` itself is
# modified from here on. On unit-length vectors, L2 distance ranks neighbours
# in the same order as cosine similarity.
faiss.normalize_L2(vectors)
index.add(vectors)

## RoBERTa

In [None]:
# Initialize the tokenizer and the model.
# These bind the module-level names `tokenizer` and `model` that
# get_bert_embedding reads when it is called.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')


In [None]:
def get_bert_embedding(sentence):
    """Return the first-token embedding of ``sentence``.

    Uses the module-level ``tokenizer`` and ``model`` (loaded from
    'roberta-base' in this section). Input is truncated to 128 tokens.

    Returns:
        NumPy array with, for each input sequence, the last-hidden-state
        vector at token position 0.
    """
    # Tokenize into PyTorch tensors; the tokenizer adds the special tokens.
    tokens = tokenizer(
        sentence,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: run the forward pass without tracking gradients.
    with torch.no_grad():
        last_hidden = model(**tokens).last_hidden_state
    # Keep position 0 of every sequence (averaging the token embeddings would
    # be an alternative pooling strategy).
    return last_hidden[:, 0, :].numpy()

In [None]:
# Cleaned claim texts as a plain Python list.
claims = df_kb["new_claim"].tolist()

In [None]:
# Compute the embedding of every claim (one forward pass per text) and stack
# the per-claim arrays row-wise into a single matrix.
vectors_list = list(map(get_bert_embedding, claims))
vectors = np.vstack(vectors_list)

In [None]:
# Persist the matrix currently held in `vectors` (the claim embeddings when the
# cells are run in order).
with open("embeddings_claims_RoBERT.pkl", "wb") as f:
    pickle.dump(vectors, f)

In [None]:
# Cleaned explanation texts as a plain Python list.
explanations = df_kb["new_expl"].tolist()

In [None]:
# Compute the embedding of every explanation (one forward pass per text) and
# stack the per-text arrays row-wise. This rebinds `vectors`, replacing the
# claim matrix.
vectors_list = list(map(get_bert_embedding, explanations))
vectors = np.vstack(vectors_list)

In [None]:
# Persist the matrix currently held in `vectors` (the explanation embeddings
# when the cells are run in order).
with open("embeddings_explanation_RoBERT.pkl", "wb") as f:
    pickle.dump(vectors, f)

## ERNIE

In [None]:
# Rebind the module-level `tokenizer` and `model` to the ERNIE 2.0 (English,
# base) checkpoint; get_bert_embedding reads these names when it is called.
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")
model = ErnieModel.from_pretrained("nghuyong/ernie-2.0-base-en")

In [None]:
def get_bert_embedding(sentence):
    """Return the first-token embedding of ``sentence``.

    Uses the module-level ``tokenizer`` and ``model`` (loaded from
    'nghuyong/ernie-2.0-base-en' in this section). Input is truncated to
    128 tokens.

    Returns:
        NumPy array with, for each input sequence, the last-hidden-state
        vector at token position 0.
    """
    # Tokenize into PyTorch tensors; the tokenizer adds the special tokens.
    tokens = tokenizer(
        sentence,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: run the forward pass without tracking gradients.
    with torch.no_grad():
        last_hidden = model(**tokens).last_hidden_state
    # Keep position 0 of every sequence (averaging the token embeddings would
    # be an alternative pooling strategy).
    return last_hidden[:, 0, :].numpy()

In [None]:
# Cleaned claim texts as a plain Python list.
claims = df_kb["new_claim"].tolist()

In [None]:
# Compute the embedding of every claim (one forward pass per text) and stack
# the per-claim arrays row-wise into a single matrix.
vectors_list = list(map(get_bert_embedding, claims))
vectors = np.vstack(vectors_list)

In [None]:
# Persist the matrix currently held in `vectors` (the claim embeddings when the
# cells are run in order).
with open("embeddings_claims_ERNIE.pkl", "wb") as f:
    pickle.dump(vectors, f)

In [None]:
# Cleaned explanation texts as a plain Python list.
explanations = df_kb["new_expl"].tolist()

In [None]:
# Compute the embedding of every explanation (one forward pass per text) and
# stack the per-text arrays row-wise. This rebinds `vectors`, replacing the
# claim matrix.
vectors_list = list(map(get_bert_embedding, explanations))
vectors = np.vstack(vectors_list)

In [None]:
# Persist the matrix currently held in `vectors` (the explanation embeddings
# when the cells are run in order).
with open("embeddings_explanation_ERNIE.pkl", "wb") as f:
    pickle.dump(vectors, f)

## DistilBERT

In [None]:
# Rebind the module-level `tokenizer` and `model` to the uncased DistilBERT base
# checkpoint; get_bert_embedding reads these names when it is called.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained("distilbert-base-uncased")

In [None]:
def get_bert_embedding(sentence):
    """Return the first-token embedding of ``sentence``.

    Uses the module-level ``tokenizer`` and ``model`` (loaded from
    'distilbert-base-uncased' in this section). Input is truncated to
    128 tokens.

    Returns:
        NumPy array with, for each input sequence, the last-hidden-state
        vector at token position 0.
    """
    # Tokenize into PyTorch tensors.
    tokens = tokenizer(
        sentence,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: run the forward pass without tracking gradients.
    with torch.no_grad():
        last_hidden = model(**tokens).last_hidden_state
    # Keep position 0 of every sequence (averaging the token embeddings would
    # be an alternative pooling strategy).
    return last_hidden[:, 0, :].numpy()

In [None]:
# Cleaned claim texts as a plain Python list.
claims = df_kb["new_claim"].tolist()

In [None]:
# Compute the embedding of every claim (one forward pass per text) and stack
# the per-claim arrays row-wise into a single matrix.
vectors_list = list(map(get_bert_embedding, claims))
vectors = np.vstack(vectors_list)

In [None]:
# Persist the matrix currently held in `vectors` (the claim embeddings when the
# cells are run in order).
with open("embeddings_claims_distilBERT.pkl", "wb") as f:
    pickle.dump(vectors, f)

In [None]:
# Cleaned explanation texts as a plain Python list.
explanations = df_kb["new_expl"].tolist()

In [None]:
# Compute the embedding of every explanation (one forward pass per text) and
# stack the per-text arrays row-wise. This rebinds `vectors`, replacing the
# claim matrix.
vectors_list = list(map(get_bert_embedding, explanations))
vectors = np.vstack(vectors_list)

In [None]:
# Persist the matrix currently held in `vectors` (the explanation embeddings
# when the cells are run in order).
with open("embeddings_explanation_distilBERT.pkl", "wb") as f:
    pickle.dump(vectors, f)