In [None]:
# Mount Google Drive at /content/gdrive (requires the Colab runtime, which provides google.colab).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Work from the project folder on the mounted drive; later cells read files such as
# 'syntheic-dataset.csv' through relative paths.
%cd /content/gdrive/My Drive/cross-encode/

/content/gdrive/My Drive/cross-encode


In [None]:
pip install faiss-gpu sentence_transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import time
import numpy as np
import pandas as pd
import torch
import faiss

from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator, CECorrelationEvaluator
from sentence_transformers.readers import InputExample

import datasets

In [None]:
def get_embeddings_from_contexts(model, contexts):
    """
    Encode a collection of contexts into embeddings.

    :param model: object exposing an ``encode`` method (the notebook passes the
        model returned by ``load_semantic_search_model``)
    :param contexts: the context strings to embed
    :return: whatever ``model.encode`` returns for ``contexts``
    """
    embeddings = model.encode(contexts)
    return embeddings

def load_semantic_search_model(model_name):
    """
    Instantiate the sentence-embedding model used for semantic search.

    :param model_name: name (or path) handed to ``SentenceTransformer``
    :return: the constructed ``SentenceTransformer`` instance
    """
    # Local import keeps this dependency scoped to the helper.
    import sentence_transformers

    return sentence_transformers.SentenceTransformer(model_name)

In [None]:
def convert_embeddings_to_faiss_index(embeddings, context_ids):
    """
    Build an ID-mapped inner-product FAISS index from embeddings and move it to GPU 0.

    :param embeddings: 2-D array-like of shape (n_contexts, dim); converted to float32
    :param context_ids: one integer ID per embedding row (list, pandas index values
        or numpy array); converted to a contiguous int64 array
    :return: the index returned by ``faiss.index_cpu_to_gpu``
    :raises ValueError: if ``embeddings`` is not 2-D or the number of IDs does not
        match the number of embedding rows
    """
    # Step 1: FAISS works on contiguous float32 vectors.
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_contexts, dim), got shape {vectors.shape}"
        )

    # add_with_ids needs a numpy int64 array; a plain Python list or an int32
    # array would otherwise fail inside FAISS with an opaque error.
    ids = np.ascontiguousarray(context_ids, dtype="int64")
    if ids.shape != (vectors.shape[0],):
        raise ValueError(
            f"expected {vectors.shape[0]} context ids, got array of shape {ids.shape}"
        )

    index = faiss.IndexFlatIP(vectors.shape[1])  # Step 2: flat inner-product index
    index = faiss.IndexIDMap(index)  # Step 3: wrap it so custom IDs can be stored

    index.add_with_ids(vectors, ids)  # Step 4: add vectors together with their IDs

    res = faiss.StandardGpuResources()  # Step 5: GPU resources handle
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Step 6: move to GPU device 0
    return gpu_index

In [None]:
def vector_search(query, model, index, num_results=10):
    """Transform the query text(s) into vectors with a sentence-level model and
    look up the most similar stored vectors in a FAISS index.

    :param query: a single query string or an iterable of query strings
    :param model: object exposing an ``encode`` method
    :param index: object exposing ``search(vectors, k=...)``
    :param num_results: number of neighbours requested per query
    :return: the ``(D, I)`` pair returned by ``index.search``
    """
    # list("text") would split a lone string into single characters and search
    # for each character separately, so wrap a single string instead.
    queries = [query] if isinstance(query, str) else list(query)
    vectors = np.asarray(model.encode(queries), dtype="float32")
    D, I = index.search(vectors, k=num_results)
    return D, I


def id2details(df, I, column):
    """Return, for every id in the first row of ``I``, the list of ``column``
    values of the rows in ``df`` whose index equals that id.

    Only ``I[0]`` (the first query's results) is used; an id that matches no
    row yields an empty list.
    """
    index_values = df.index.values
    details = []
    for idx in I[0]:
        matching_rows = df[index_values == idx]
        details.append(list(matching_rows[column]))
    return details


def combine(user_query, model, index, df, column, num_results=10):
    """
    Run a vector search for one query and map the returned ids back to
    values of ``column`` in ``df``.

    :param user_query: the query to search for
    :param model: the embedding model passed on to ``vector_search``
    :param index: the index passed on to ``vector_search``
    :param df: the dataframe the ids refer to
    :param column: the column of ``df`` whose values are returned
    :param num_results: the number of results to request, defaults to 10 (optional)
    :return: the output of ``id2details`` for the ids found by the search
    """
    # Distances are not needed here; only the ids are mapped back to rows.
    _, neighbour_ids = vector_search(
        [user_query], model, index, num_results=num_results
    )
    return id2details(df, neighbour_ids, column)

In [None]:
def get_context(model, query, contexts, contexts_emb, top_k=5):
    """
    Rank contexts by cosine similarity to the query and return the best ones.

    :param model: object whose ``encode`` returns a numpy vector for the query
    :param query: the query string
    :param contexts: the contexts, in the same order as ``contexts_emb``
    :param contexts_emb: numpy array holding the embeddings of the contexts
    :param top_k: the number of contexts to return, defaults to 5 (optional)
    :return: list of the ``top_k`` contexts with the highest similarity, best first
    """
    # Embed the query as a single-row tensor and wrap the context embeddings.
    query_tensor = torch.from_numpy(model.encode(query).reshape(1, -1))
    context_tensor = torch.from_numpy(contexts_emb)

    # One similarity value per context (row 0 belongs to the single query).
    similarities = util.cos_sim(query_tensor, context_tensor)[0].cpu().tolist()

    # Pair every context with its score and order the pairs best-first.
    ranked = sorted(
        zip(contexts, similarities), key=lambda pair: pair[1], reverse=True
    )
    return [context for context, _ in ranked[:top_k]]

In [None]:
def get_answer(model, query, context):
    """
    Ask ``model`` the query together with its context and return the generated text.

    :param model: callable that takes the prompt string and returns a sequence
        whose first item has a "generated_text" entry
    :param query: the question to ask
    :param context: the context for the question
    :return: the "generated_text" value of the first result
    """
    # The prompt is the query and the context separated by a newline.
    outputs = model(f"{query}\n{context}")
    first_output = outputs[0]
    return first_output["generated_text"]



def evaluate_semantic_model(model, questions, contexts, contexts_emb, index=None):
    """
    For each question, retrieve the most similar contexts.

    When ``index`` is given the lookup goes through ``combine`` (FAISS), which
    reads the "premise" column of ``contexts``. Otherwise it falls back to
    ``get_context`` (cosine similarity against ``contexts_emb``).

    :param model: the embedding model used to encode the questions
    :param questions: a list of questions
    :param contexts: the contexts; a DataFrame with a "premise" column for the
        FAISS path, a DataFrame or a plain sequence of strings for the cosine path
    :param contexts_emb: the embeddings of the contexts (cosine path only)
    :param index: the index of the context embeddings, or None to use cosine similarity
    :return: one entry per question: the output of ``combine`` (FAISS path) or
        of ``get_context`` (cosine path)
    """
    # Compare against None explicitly: an index object that happens to evaluate
    # falsy must not silently switch to the cosine path.
    if index is not None:
        return [
            combine(question, model, index, contexts, "premise")
            for question in questions
        ]

    # Iterating a DataFrame yields its column labels, not its rows, so hand
    # get_context the actual premise strings.
    if isinstance(contexts, pd.DataFrame):
        context_texts = contexts["premise"].tolist()
    else:
        context_texts = contexts

    return [
        get_context(model, question, context_texts, contexts_emb)
        for question in questions
    ]


In [None]:
# Load the sentence-embedding model used to embed contexts and queries for retrieval.
semantic_search_model = load_semantic_search_model("distiluse-base-multilingual-cased-v1") # or all-mpnet-base-v2

In [None]:
# Load the dataset and keep each distinct premise once, as a one-column frame.
df = pd.read_csv('syntheic-dataset.csv')
unique_premises = df['premise'].unique()
contexts = pd.DataFrame(unique_premises, columns=['premise'])

# Encode the raw contexts into embedding vectors.
context_emb = get_embeddings_from_contexts(
    semantic_search_model,
    contexts['premise'].values,
)

# Only needed for the FAISS path: index the embeddings under the frame's row labels.
index = convert_embeddings_to_faiss_index(
    context_emb,
    contexts.index.values,
)


In [None]:
# Load the cross-encoder stored under 'output-nli' (presumably the output directory
# of an earlier training run — TODO confirm), with max_length=512.
model_saved = CrossEncoder('output-nli', max_length=512)

In [None]:
query = ['Quelles protections la Loi sur la protection du consommateur accorde-t-elle aux individus?']

# Retrieve candidate premises for the query through the FAISS index.
retrieved = evaluate_semantic_model(
    semantic_search_model,
    query,
    contexts,
    context_emb,
    index,
)

# retrieved[0] holds, for the single query, one list of matching premises per hit.
# A hit list is empty when its id matches no row (FAISS pads missing results
# with -1 when fewer vectors than requested are available), so skip empty hits
# instead of failing with IndexError.
pred = [hit[0] for hit in retrieved[0] if hit]

In [None]:
# Pair the query with every retrieved passage for the cross-encoder.
sentence_combinations = [[query[0], passage] for passage in pred]

# Score each (query, passage) pair.
similarity_scores = model_saved.predict(sentence_combinations)

# Indices of the pairs ordered from the highest to the lowest score.
sim_scores_argsort = np.argsort(similarity_scores)[::-1]

# Show the passages best-first together with their scores.
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], pred[idx]))

Query: ['Quelles protections la Loi sur la protection du consommateur accorde-t-elle aux individus?']
0.74	l'action preventive en suppression de clauses abusives ouverte aux associations agreees de defense des consommateurs a vocation a s'appliquer aux modeles types de contrats destines aux consommateurs et rediges par des professionnels en vue d'une utilisation generalisee l'action preventive en suppression des clauses illicites ouverte aux associations agreees de defense des consommateurs a vocation a s'appliquer aux modeles types de contrats destines aux consommateurs et rediges par des professionnels en vue d'une utilisation generalisee les dispositions imperatives de l'article 0 - 1 de la loi n 70 - 598 de la loi du 9 juillet 1970 s'appliquent , par la generalite de leurs termes , aux locations saisonnieres qui portent sur des locaux d'habitation c'est a juste titre qu'une cour d'appel ordonne , a la demande d'une association agreee de defense des consommateurs , la suppression de

### Second test: re-ranking with a model that returns logits

In [None]:
# Load the pretrained multilingual passage re-ranking cross-encoder, with max_length=512.
model_CE = CrossEncoder('amberoad/bert-multilingual-passage-reranking-msmarco', max_length=512)

Downloading:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
"""
This example scores the query against each retrieved passage with a
Cross-Encoder and then prints the passages whose highest output is at
index 0, ordered by that index-0 score from highest to lowest.
"""

# Pair the query with every retrieved passage for the cross-encoder.
sentence_combinations = [[query[0], passage] for passage in pred]

# One row of class scores per (query, passage) pair.
similarity_scores = model_CE.predict(sentence_combinations)

# Keep only the pairs whose highest score is at index 0, remembering the passage position.
# NOTE(review): confirm against the model card which output index means "relevant";
# this filter assumes it is index 0.
scores = []
for idx, class_scores in enumerate(similarity_scores):
    if class_scores.argmax(axis=0) == 0:
        scores.append((class_scores[0], idx))

# Order the kept pairs from the highest to the lowest index-0 score.
sim_scores_argsort = sorted(scores, key=lambda pair: pair[0], reverse=True)

# Show the kept passages best-first together with their scores.
print("Query:", query)
for score, idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(score, pred[idx]))

Query: ['Quelles protections la Loi sur la protection du consommateur accorde-t-elle aux individus?']
4.83	si les associations agreees de consommateurs peuvent intervenir a l'instance introduite sur la demande initiale en reparation du prejudice subi par un ou plusieurs consommateurs , en raison de faits non constitutifs d'une infraction penale , a l'effet notamment d'obtenir reparation du prejudice cause a l'interet collectif des consommateurs , en revanche elles ne peuvent , a cette fin , introduire l'instance  est des lors irrecevable la demande en reparation du prejudice cause a l'interet collectif des consommateurs , formee par une association agreee de consommateurs dans l'acte par lequel celle-ci et un consommateur , agissant en reparation de son propre prejudice , ont conjointement introduit l'instance
4.83	selon l'article l. 33 - 2 du code de la consommation , les clauses des contrats proposes par les professionnels aux consommateurs ou aux non-professionnels s'interpretent , 

In [None]:
# Inspect the (index-0 score, passage position) pairs kept by the filter in the previous cell.
scores

[(0.110301666, 1)]