# Semantic Matching  

## Preliminary Operations

In [2]:
import pandas as pd
import numpy as np
import faiss
import openai
from openai import OpenAI
from sklearn.metrics import f1_score, accuracy_score, precision_score, confusion_matrix

In [9]:
# Configure pandas display: a wide console, untruncated column contents and
# no cap on the number of rows shown.
display_settings = {
    'display.width': 1000,
    'display.max_colwidth': None,
    'display.max_rows': None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [3]:
# Read the API key from the environment instead of embedding it in the
# notebook: a key stored in source is exposed to everyone who can read the
# file. The key that used to be hard-coded here must be revoked.
# Raises KeyError if OPENAI_API_KEY is not set.
import os

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [4]:
# Load the knowledge-base frame and the test frame from CSV files located
# in the current working directory.
df_kb, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('df_kb_generation.csv', 'df_test_generation.csv')
)

## Matching New Claim

In [None]:
# Display the new_claim and new_expl columns of the knowledge-base frame.
df_kb[['new_claim','new_expl']]

In [5]:
# Materialise the knowledge-base claim texts as a plain Python list and
# show how many there are.
claims = df_kb.loc[:, "new_claim"].to_list()
len(claims)

5078

In [6]:
def get_embedding(input, model="text-embedding-3-small", encoding_format="float"):
    """Request an embedding for ``input`` through the module-level ``client``.

    Args:
        input: Value forwarded unchanged as the ``input`` argument of
            ``client.embeddings.create``.
        model: Name of the embedding model to use.
        encoding_format: Encoding format requested from the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data``; any further items in the response are ignored.
    """
    response = client.embeddings.create(
        input=input,
        model=model,
        encoding_format=encoding_format,
    )
    first_item = response.data[0]
    return first_item.embedding

In [7]:
import pickle

# Load the stored claim vectors from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source.
with open("embeddings_claims.pkl", "rb") as embeddings_file:
    vectors_claim = pickle.load(embeddings_file)

In [8]:
# Size of the second axis of the loaded vectors; the same value is used as
# the dimension of the FAISS index built below.
vectors_claim.shape[1]

1536

In [9]:
# Preview the sequential integers 0 .. len(claims) - 1, one per claim.
np.arange(len(claims))

array([   0,    1,    2, ..., 5075, 5076, 5077])

In [10]:
# Build an exact inner-product index over the claim vectors. The vectors are
# L2-normalised first, so the inner product of two of them is their cosine
# similarity. Each vector's id is its position in `claims`.
if len(vectors_claim) != len(claims):
    raise ValueError(
        f"Expected one embedding per claim, got {len(vectors_claim)} "
        f"embeddings for {len(claims)} claims"
    )

index = faiss.IndexIDMap(faiss.IndexFlatIP(vectors_claim.shape[1]))
faiss.normalize_L2(vectors_claim)  # in place: vectors_claim itself is rescaled
# FAISS ids are 64-bit integers; np.arange's default integer width depends on
# the platform, so request int64 explicitly.
index.add_with_ids(vectors_claim, np.arange(len(claims), dtype=np.int64))

In [11]:
def index_search(query, k=1):
    """Return the knowledge-base claims most similar to ``query``.

    The query is embedded with ``get_embedding``, L2-normalised and searched
    against the module-level FAISS ``index``.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours to retrieve.

    Returns:
        A tuple ``(scores, matched_claims)`` where ``scores`` is a float32
        array of shape ``(1, n)`` and ``matched_claims`` is the list of the
        ``n`` corresponding entries of ``claims``. ``n`` equals ``k`` unless
        the index holds fewer than ``k`` vectors.
    """
    query_vector = np.array(get_embedding(query), dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_vector)
    scores, ids = index.search(query_vector, k)

    # FAISS pads the result with id -1 when fewer than k neighbours exist.
    # Indexing claims[-1] would silently return the last claim as a match,
    # so drop the padded slots from both the scores and the claims.
    found = ids[0] != -1
    return scores[:, found], [claims[_id] for _id in ids[0][found]]

In [12]:
# Example query: retrieve the three closest knowledge-base claims.
index_search('Cigarette cause cancer', k=3)

(array([[0.59109044, 0.5069936 , 0.47763616]], dtype=float32),
 ['Tobacco has been reported to cause 2.4 million cases of cancer in the United States.',
  'Newly generated assertion:\r\n"The presence of nicotine in cigarette butts poses a serious health risk to babies and small children."',
  'Quit smoking is strongly recommended by experts and tobacco companies to reduce the risk of COVID-19.'])

In [13]:
def verify_claim(claim, k=3, thresholds=(0.5, 0.7, 0.9)):
    """Judge ``claim`` by its similarity to the closest knowledge-base claims.

    Args:
        claim: Text of the claim to verify.
        k: Number of neighbours to retrieve through ``index_search``.
        thresholds: Iterable of similarity cut-offs to evaluate. The default
            is a tuple so that no mutable object is shared between calls.

    Returns:
        A tuple ``(verdicts, max_similarity, matched_claims)``:
        ``verdicts`` maps each threshold to the string ``"True"`` when the
        best similarity reaches it and ``"False"`` otherwise;
        ``max_similarity`` is the highest score returned by the search (0
        when the search returned no scores); ``matched_claims`` is the list
        of retrieved claims.
    """
    # FAISS search results for the new claim
    similarity_scores, matched_claims = index_search(claim, k=k)

    # Highest similarity between the new claim and its matches
    max_similarity = np.max(similarity_scores) if similarity_scores.size > 0 else 0

    # One verdict per similarity threshold
    verdicts = {
        threshold: "True" if max_similarity >= threshold else "False"
        for threshold in thresholds
    }

    return verdicts, max_similarity, matched_claims

In [14]:
# Example usage of verify_claim
claim = 'Cigarette cause cancer'
verdicts, max_similarity, matched_claims = verify_claim(claim, k=3)

print(f"Verdicts: {verdicts}")
print(f"Max Similarity: {max_similarity}")
print("Matched Claims:")
# A distinct loop variable keeps the query stored in `claim` from being
# overwritten by the last matched claim.
for matched_claim in matched_claims:
    print(f"- {matched_claim}")

Verdicts: {0.5: 'True', 0.7: 'False', 0.9: 'False'}
Max Similarity: 0.5910904407501221
Matched Claims:
- Tobacco has been reported to cause 2.4 million cases of cancer in the United States.
- Newly generated assertion:
"The presence of nicotine in cigarette butts poses a serious health risk to babies and small children."
- Quit smoking is strongly recommended by experts and tobacco companies to reduce the risk of COVID-19.


In [24]:
def print_verification_results(claim, threshold=0.5):
    """Print the verification report for ``claim`` and return its verdict.

    Args:
        claim: Text of the claim to verify.
        threshold: Similarity cut-off used for the verdict.

    Returns:
        ``True`` when the best similarity reaches ``threshold``, otherwise
        ``False``. Returning the verdict lets the function be used with
        ``Series.apply`` to build a column of predictions.
    """
    # Pass the requested threshold through explicitly so that the lookup
    # below works for any value, not only for verify_claim's defaults.
    verdicts, max_similarity, matched_claims = verify_claim(
        claim, k=3, thresholds=[threshold]
    )
    is_true = verdicts[threshold] == "True"
    print(f"Claim: {claim}")
    print(f"Verdict at threshold {threshold}: {'True' if is_true else 'False'}")
    print("Matched Claims from Knowledge Base:")
    for matched_claim in matched_claims:
        print(f"- {matched_claim}")
    print(f"Max Similarity: {max_similarity}\n")
    return is_true

In [26]:
# Run print_verification_results on every new_claim of the test frame; the
# value it returns for each row is stored as that row's predicted label.
df_test['predicted_label'] = df_test['new_claim'].apply(print_verification_results, threshold=0.5)

# Binary ground truth: 1 for rows labelled exactly 'true', 0 for anything else.
true_labels = (df_test['label'] == 'true').astype(int)

# NOTE(review): missing predictions (None/NaN) are replaced by 'false' and
# therefore counted as negatives. If print_verification_results returns
# nothing, every prediction ends up 0 - confirm it returns a boolean verdict.
predicted_labels = df_test['predicted_label'].fillna('false')
predicted_labels = predicted_labels.map({True: 1, False: 0, 'false': 0})

# Metrics
f1 = f1_score(true_labels, predicted_labels, zero_division=0)
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, zero_division=0)

print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")

# labels=[0, 1] forces a 2x2 matrix even when one class never occurs, so the
# four-way unpacking below cannot fail.
cm_a = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

tn_a, fp_a, fn_a, tp_a = cm_a.ravel()

print("\nMetodo 'a' - Matrice di Confusione:")
print(f"TN: {tn_a}, FP: {fp_a}, FN: {fn_a}, TP: {tp_a}")


Claim: "The mother, in a letter to her child after her death, disclosed that she only had one eye because she had given her other eye to him."
Verdict at threshold 0.5: False
Matched Claims from Knowledge Base:
- The daughter expressed her true feelings towards her late mother through a harsh and critical obituary.
- Newly generated assertion:
"The experience of miscarriages deeply influences the artwork of a mom-to-be, reflecting her emotional journey in a powerful way."
- Newly generated assertion:
"The mother's call for changes at Salem Hospital follows the tragic death of her baby."
Max Similarity: 0.48532429337501526

Claim: "Research indicates that a significant number of Americans continue to consume alcohol excessively."
Verdict at threshold 0.5: False
Matched Claims from Knowledge Base:
- Europeans, as reported by the WHO, have been identified as the heaviest drinkers globally.
- Based on the data, it is indicated that a significant percentage of American adults have engaged i