In [None]:
!pip install sentence_transformers
!pip install faiss-cpu -f https://github.com/nvidia/faiss/releases/tag/v1.7.1

Looking in links: https://github.com/nvidia/faiss/releases/tag/v1.7.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


In [None]:
from sentence_transformers import SentenceTransformer

def embed_text_huggingface(model, text):
    """
    Encode text into an embedding using the supplied model.

    Args:
        model: Object exposing an ``encode(text)`` method; elsewhere in this
            notebook a ``SentenceTransformer`` instance is passed in.
        text (str): The text to be embedded.

    Returns:
        The value produced by ``model.encode(text)`` (presumably a
        numpy.ndarray for SentenceTransformer models -- confirm against the
        library documentation).
    """
    # Pure delegation: no pre- or post-processing is applied to the text.
    return model.encode(text)

In [None]:
import faiss

def write_embeddings_to_vector_store(embeddings, index_name):
    """
    Build a flat L2 FAISS index from the embeddings and save it to disk.

    Args:
        embeddings (numpy.ndarray): 2-D array of embeddings; its second
            dimension is used as the dimensionality of the index.
        index_name (str): Path of the file the index is written to.

    Returns:
        The ``faiss.IndexFlatL2`` instance that was filled and written.
    """
    dimension = embeddings.shape[1]

    # Flat index that compares vectors by L2 distance.
    vector_index = faiss.IndexFlatL2(dimension)
    vector_index.add(embeddings)

    # Persist the populated index so it can be reloaded later.
    faiss.write_index(vector_index, index_name)
    return vector_index

In [None]:
def vector_search(embeder, sentence, index, k):
    """
    Embed a sentence and retrieve its k nearest neighbours from a FAISS index.

    Args:
        embeder (callable): Maps a sentence to its embedding vector.
        sentence (str): The query text to embed and search for.
        index: Index object exposing ``search(queries, k)`` (e.g. a FAISS index).
        k (int): Number of neighbours to return.

    Returns:
        tuple: ``(D, I)`` exactly as returned by ``index.search``; for a FAISS
        index these are the distances and the ids of the k nearest vectors.
    """
    # FAISS works on float32 matrices of shape (n_queries, dim). Cast
    # explicitly (as semantic_search_faiss does) so embedders that return
    # float64 or plain lists are handled the same way.
    xq = np.array(embeder(sentence), dtype=np.float32).reshape(1, -1)
    D, I = index.search(xq, k)

    return D, I

In [None]:
def semantic_search_faiss(embeder, query_embedding, conversations, index, k, threshold=0.9):
    """
    Retrieve the conversations most similar to a query using a FAISS index.

    The index is asked for the k nearest neighbours of the query; the matching
    rows are then re-scored with cosine similarity and only rows scoring
    strictly above ``threshold`` are returned.

    Args:
        embeder (callable): Maps a question string to its embedding vector;
            used to re-embed each candidate question for re-scoring.
        query_embedding (numpy.ndarray): 1-D embedding of the user's query.
        conversations (pandas.DataFrame): Table with a 'Questions' column.
            Ids returned by the index are used as row *positions* (``.iloc``),
            so row order must match the order vectors were added to the index.
        index: Index object exposing ``search(queries, k)`` (e.g. a FAISS index).
        k (int): The number of most similar documents to retrieve.
        threshold (float): Minimum cosine similarity (exclusive) a candidate
            must reach to be kept. Defaults to 0.9.

    Returns:
        pandas.DataFrame: The rows of ``conversations`` that were retrieved
        and passed the threshold, in retrieval order.
    """
    # FAISS expects a float32 matrix of shape (n_queries, dim).
    query_np = np.array([query_embedding]).astype(np.float32)

    # Only the neighbour ids are needed; the distances are not used.
    _, indices = index.search(query_np, k)
    retrieved_indices = indices[0].tolist()

    # Debug aid kept from the original: shows the raw ids returned by the index.
    print(retrieved_indices)

    # FAISS pads the result with -1 when fewer than k neighbours exist. A
    # negative id would make .iloc silently select rows counted from the end,
    # so both bounds have to be checked.
    n_rows = len(conversations)
    retrieved_indices = [idx for idx in retrieved_indices if 0 <= idx < n_rows]

    relevant_conversations = conversations.iloc[retrieved_indices]
    if relevant_conversations.empty:
        # Nothing to re-score; return the empty frame with its columns intact.
        return relevant_conversations

    # Re-score each candidate question against the query with cosine similarity.
    scores = relevant_conversations['Questions'].apply(
        lambda question: _get_relevance_score(embeder(question), query_embedding)
    )
    return relevant_conversations[scores > threshold]


def _get_relevance_score(query_embedding, conversation_embedding):
    """
    Calculate the relevance score between the query embedding and a conversation embedding.
    Args:
        query_embedding (numpy.ndarray): The embedding of the query.
        conversation_embedding (numpy.ndarray): The embedding of the conversation.

    Returns:
        float: The relevance score between the embeddings.
    """

    # Calculate the cosine similarity between the embeddings
    similarity = np.dot(query_embedding, conversation_embedding) / (
        np.linalg.norm(query_embedding) * np.linalg.norm(conversation_embedding)
    )
    return similarity

## Implementation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the question/answer table and discard every row that has a missing value.
dataset = pd.read_csv("/content/Brix ai.csv").dropna()

# Preview the first rows of the cleaned table.
dataset.head()

Unnamed: 0,S/N,Questions,Answers
0,1.0,Where is Bells University of Technology?,Bells University of Technology is located at K...
1,2.0,"What programs and majors are offered, and are ...",Bells University of Technology offers a wide r...
3,4.0,What are the university's admission requiremen...,To apply for admission at Bells University of ...
4,5.0,What kind of campus resources are available?,"At Bells University of Technology, we offer va..."
5,6.0,What are the university's strengths and specia...,Bells University of Technology prides itself o...


In [None]:

# Load the embedding model in this cell: it is needed right below, and was
# otherwise only created in a later cell, which breaks Restart & Run All.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# One embedding per question, kept as a Series aligned with the dataset rows.
embeddings = dataset['Questions'].apply(lambda x: embed_text_huggingface(model, x))

# Stack the per-row vectors into a single (n_questions, dim) matrix for FAISS.
original_embeddings = np.vstack(embeddings.values)

original_embeddings.shape

(7, 384)

In [None]:
# Build the FAISS index from the question embeddings and save it as "David.index"
# (relative to the working directory); the returned index is shown as cell output.
write_embeddings_to_vector_store(original_embeddings, "David.index")

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7aefbde96a90> >

In [None]:
# Load the index from the same relative path it was written to ("David.index")
# rather than a hard-coded absolute location that only exists on Colab.
index_file = "David.index"
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
index = faiss.read_index(index_file)

# Step 6: Semantic Search using FAISS
query_text = "What are the addmission requirement for the university and deadline"
query_embedding = embed_text_huggingface(model, query_text)
k = 3  # Retrieve top-k most similar documents
retrieved = semantic_search_faiss(lambda x: embed_text_huggingface(model, x), query_embedding, dataset, index, k, 0.7)

# Show the best-ranked answer; None when no candidate passed the threshold
# (indexing an empty result would raise IndexError).
retrieved['Answers'].values[0] if not retrieved.empty else None

[2, 1, 3]


"To apply for admission at Bells University of Technology, you need to have taken the Unified Tertiary Matriculation Examination (UTME) for the current year or obtained the UTME Direct Entry Form if applicable. For undergraduate programs, you should have at least five O’ Level Credit passes in subjects relevant to your chosen course, including English Language and Mathematics. If you're awaiting results, you can still apply, but make sure the results will be available by September/October of the current year. Note that some programs in the Colleges of Engineering and Environmental Sciences are not listed in the UTME Admission Brochure. If you're interested in these programs, please contact us to obtain the Bells University of Technology Post-UTME application form. For more information, you can call us at (+234) 703-318-0831."

In [None]:
# Display every row returned by the semantic search above.
retrieved

Unnamed: 0,S/N,Questions,Answers
0,1.0,Where is Bells University of Technology?,Bells University of Technology is located at K...
1,2.0,"What programs and majors are offered, and are ...",Bells University of Technology offers a wide r...
3,4.0,What are the university's admission requiremen...,To apply for admission at Bells University of ...
