# Installing Required Packages

In [1]:
r'''requirements
torch
pymongo
faiss-gpu
sentence-transformers
'''
!pip install sentence-transformers torch



In [2]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


# Testing the sentence transformer model

In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Load the model
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Check if GPU is available and move the model to GPU
if torch.cuda.is_available():
    device = 'cuda'
    model = model.to(device)
    print("Model moved to GPU")
else:
    device = 'cpu'
    print("Running on CPU")

# Example input
sentences = ["This is a test sentence.", "How can I use GPU with Sentence Transformers?"]

# Move inputs to GPU if applicable
embeddings = model.encode(sentences, device=device)

print("Embeddings Shape:", embeddings.shape)

Model moved to GPU
Embeddings Shape: (2, 384)


# Accessing the Databse

In [20]:
import numpy as np

In [9]:
from pymongo import MongoClient
from google.colab import userdata

# Connect to DB
client = MongoClient(userdata.get('MongoURI'))
# Selecting my database
db = client.Cluster0
collection = db["academics"]

In [28]:
ids = []
vectors = []
for document in collection.find():
  vector = np.array(document['vector'])
  ids.append(document['_id'])
  vectors.append(vector)

In [40]:
vectors = np.array(vectors, dtype='float32')
vectors.shape

(108, 384)

In [64]:
query = "Tell me about credits and courses for Electrical Engineering"
query_vector = model.encode(query)

# Implementing Faiss search

In [37]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [38]:
r'''I have vectors ready along with ids, Now I need to similarity search
and then get back the ObjecIds of the documents'''
import faiss

In [58]:
# Normalize vectors (important for cosine similarity)
faiss.normalize_L2(vectors)

# 3. Build a FAISS index
dimension = vectors.shape[1]  # Assuming vectors are of uniform length
index = faiss.IndexFlatIP(dimension)  # Cosine similarity
index.add(vectors)

# Normalize the query vector
faiss.normalize_L2(query_vector.reshape(1, -1))

# Conduct similarity search
k = 5  # Number of nearest neighbors to retrieve
distances, indices = index.search(query_vector.reshape(1, -1), k)

# Map indices back to IDs
result_ids = [ids[i] for i in indices[0]]


In [59]:
result_ids

[ObjectId('66fda043565b107d6c473f3a'),
 ObjectId('66fda043565b107d6c473f3c'),
 ObjectId('66fda042565b107d6c473f39'),
 ObjectId('66fda042565b107d6c473f38'),
 ObjectId('66fda048565b107d6c473f48')]

In [60]:
Context = ""
pages = []
cursor = collection.find({"_id": {"$in": result_ids}})
# Process the cursor as before (iterate or convert to list)
for document in cursor:
    Context += document['text']
    pages.append(document['page_number'])

pages = set(pages)

In [61]:
pages

{30, 31, 32, 33, 40}

In [62]:
Context

'Semester 7 Semester 4\nS.No Course Code Course Description L-T-P-S-C\nS.No. Course Code Course Description L-T-P-S-C\n. 1 CS202 Programming Paradigms and Pragmatics 3 -1-2-6-4\n1 CP302 Capstone Project I 0-0-6-3-3\n2 CS204 Computer Architecture 3-1-2-6-4\nELECTIVE COURSES\n3 MA202 Probability and Statistics 3-1-0-5-3\n2 An English Language/Literature elective course in\n4 Human Geography and Societal Needs/ [1-1/3-4-11/3-3]/\nHSXXX either 7th or 8th sem for students who had 3 Credits HS202 / BM101\nBiology for Engineers [3-1-0-5-3]\n“English Language Skills” in 1st Semester\n5 NCIV/NOIV/NSIV NCC/NSO/NSS 0 -0-2-1-1\n3 BMXXX/MAXXX\n6 3-1-0-5-3/\n/CYXXX/PHXX Science Maths Elective I 3 Credits HS201/ GE108 Economics/Basic Electronics\n(2-2/3-2-13/3-3)\nX\n7 Tinkering Lab/ Introduction to [0 -0-3-3/2-1.5]/\n4 CEXXX Program Elective I 3 Credits GE107/ GE109\nEngineering Products [0 -0-2-1-1]\n5 Anyextra credits taken under HS\nXXXXX 3 Credits TOTAL CREDITS 19 or 19.5\nElective/Program Elect

In [50]:
len(Context)

6142

In [65]:
r'''Putting them all together in a function'''
def FaissSearch(query_vector, vectors, k=10):
  faiss.normalize_L2(vectors)
  # 3. Build a FAISS index
  dimension = vectors.shape[1]  # Assuming vectors are of uniform length
  index = faiss.IndexFlatIP(dimension)  # Cosine similarity
  index.add(vectors)

  # Normalize the query vector
  faiss.normalize_L2(query_vector.reshape(1, -1))

  # Conduct similarity search
  # k Number of nearest neighbors to retrieve
  distances, indices = index.search(query_vector.reshape(1, -1), k)

  # Map indices back to IDs
  result_ids = [ids[i] for i in indices[0]]

  Context = ""
  pages = []
  cursor = collection.find({"_id": {"$in": result_ids}})
  # Process the cursor as before (iterate or convert to list)
  for document in cursor:
    Context += document['text']
    pages.append(document['page_number'])
  pages = set(pages)

  return Context, pages

In [66]:
Context, pages = FaissSearch(query_vector, vectors)

In [67]:
Context

'Semester 7\nSem ester 4\nS.No. Course Code Course Description L-T-P-S-C\nS.No. Course Code Course Description L-T-P-S-C\n1 CP302 Capstone Project I 0-0-6-3-3\n1 CE301 Structural Analysis 2-2/3-0-3-2\nELECTIVE COURSES\n2 CE302 Pipe and Open Channel Hydraulics 2-2/3-2-4-3\n2 An English Language/Literature elective course in\n3 CE303 Soil Mechanics 2-2/3-2-4-3\nHSXXX either 7th or 8th sem for students who had 3 Credits\n4 MA202 Probability and Statistics 3-1-0-5-3\n“English Language Skills” in 1st Semester\n5 Human Geography and Societal Needs / [1-1/3-4-11/3-3] /\n3 BMXXX/MAXXX HS202 / BM101\nBiology for Engineers [3-1-0-5- 3]\n/CYXXX/PHXX Science Maths Elective I 3 Credits\n6 NCIV/NOIV/NSIV NCC/NSO/NSS 0 -0-2-1-1\nX\n7 3-1-0-5-3 /\n4 CHXXX Program Elective I 3 Credits HS201 / GE108 Economics/Basic Electronics\n(2-2/3-2-13/3-3)\n5 Any extra credits taken under HS\nXXXXX 3 Credits 8 Tinkering Lab / Introduction to [0 -0-3-3/2-1.5] /\nElective/Program Elective/Science Maths Elective GE107