In [None]:
# Mount Google Drive into the Colab runtime; the resume PDFs read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
# force_remount=True requests a fresh mount even if Drive was already mounted in this session.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install langchain-community pypdf sentence_transformers faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.9 (from langchain-community)
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain-community)
  Downloading langchain_core-0.2.24-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-community)
  Downloading langsmith-0.1.93-py3-none-any.whl.metadata (13 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (f

In [None]:
# importing necessary libraries

from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


# Collects every text chunk from every resume PDF. The position of a chunk in this
# list is also its row id in the FAISS index built from it further down.
main_docs = []


files_path = "/content/drive/MyDrive/LLM_Assignment/files/data/data"

if not os.path.isdir(files_path):
  # Fail loudly here: silently continuing would leave main_docs empty and surface
  # later as an unrelated-looking error while building the index.
  raise FileNotFoundError(f"Resume directory not found: {files_path}")

# The splitter holds only configuration, so build it once instead of once per file.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# sorted() makes the traversal order (and therefore chunk positions in main_docs)
# deterministic across runs; os.listdir() returns entries in arbitrary order.
for folder in sorted(os.listdir(files_path)):
  folder_path = os.path.join(files_path, folder)
  if not os.path.isdir(folder_path):
    continue  # stray files next to the category folders
  print("in folder:", folder)
  for file in sorted(os.listdir(folder_path)):
    if not file.lower().endswith(".pdf"):
      continue  # PyPDFLoader can only parse PDFs
    loader = PyPDFLoader(os.path.join(folder_path, file))
    documents = loader.load()
    if not documents:
      continue  # PDF yielded no pages; nothing to index

    # Merge all pages into the first page's Document so one resume is split as a
    # single text (its metadata, including 'source', is kept). Pages are joined
    # with a newline so the last word of one page is not fused with the first
    # word of the next.
    merged = documents[0]
    merged.page_content = "\n".join(page.page_content for page in documents)

    docs = text_splitter.split_documents([merged])
    main_docs.extend(docs)


# Load the sentence-embedding model used for both the resume chunks and the query.
# NOTE: despite its name, `embeddings` is the SentenceTransformer model object
# (it is later called via `.encode(...)`), not an array of vectors.
embeddings = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")


def create_faiss_index(docs, embeddings, nlist=10):
  """Creates a FAISS IVF-Flat (L2) index from documents and an embedding model.

  Args:
    docs: An iterable of documents; each must expose a `page_content` string.
    embeddings: A SentenceTransformer model (any object whose `encode(list_of_str)`
      returns a 2-D array of shape (len(texts), dim) works).
    nlist: Requested number of IVF cells. It is clamped to the number of
      documents, because the index cannot be trained with fewer vectors than
      cells. Defaults to 10, the value previously hard-coded.

  Returns:
    A trained FAISS index holding one vector per document, added in the same
    order as `docs` (so row i corresponds to the i-th document).

  Raises:
    ValueError: If `docs` contains no documents.
  """

  texts = [doc.page_content for doc in docs]
  if not texts:
    raise ValueError("Cannot build a FAISS index: no documents were provided.")

  # generating embeddings for the documents; FAISS expects a C-contiguous
  # float32 matrix, so normalise whatever the encoder hands back
  doc_embeddings = np.ascontiguousarray(embeddings.encode(texts), dtype=np.float32)

  # creating a FAISS index
  n_vectors, d = doc_embeddings.shape

  # number of cells: never more than the number of vectors available for training
  n_cells = max(1, min(nlist, n_vectors))
  quantizer = faiss.IndexFlatL2(d)
  index = faiss.IndexIVFFlat(quantizer, d, n_cells)
  index.train(doc_embeddings)
  index.add(doc_embeddings)

  return index

# Build the FAISS index over every chunk collected in main_docs.
index = create_faiss_index(main_docs, embeddings)

# Save the FAISS index to "faiss_index.index" (relative to the current working directory).
# Only the index is written here; the chunk texts stay in main_docs, and search results
# are mapped back to chunks by position, so main_docs must be available in the same order.
faiss.write_index(index, "faiss_index.index")



  from tqdm.autonotebook import tqdm, trange


in folder: ENGINEERING
in folder: SALES
in folder: FITNESS
in folder: INFORMATION-TECHNOLOGY
in folder: HEALTHCARE
in folder: PUBLIC-RELATIONS
in folder: DIGITAL-MEDIA
in folder: HR
in folder: TEACHER
in folder: FINANCE
in folder: ARTS
in folder: AUTOMOBILE
in folder: AVIATION
in folder: CONSULTANT
in folder: CONSTRUCTION
in folder: BUSINESS-DEVELOPMENT
in folder: CHEF
in folder: DESIGNER
in folder: BPO
in folder: BANKING
in folder: ADVOCATE
in folder: ACCOUNTANT
in folder: AGRICULTURE
in folder: APPAREL


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Preview only the first few chunks: displaying the whole list writes every resume
# chunk into the notebook output, which bloats the file and exposes resume text.
main_docs[:3]

[Document(metadata={'source': '/content/drive/MyDrive/LLM_Assignment/files/data/data/ENGINEERING/14049846.pdf', 'page': 0}, page_content='ENGINEERING INTERN\nSummary\nlooking for an opportunity as an engineer or related position which offers key participation, team-oriented tasks, immediate challenges and career\nopportunities in a reputed organization which will help me deliver my best and upgrade my skills in engineering and meet the demands of the\norganization. To utilize my technical skills for achieving the target and developing the best performance in the organization. I would like to\nimplement my innovative ideas, skills and creativity for accomplishing projects.\nSkills\nKnow-your-consumer (KYC)\nStatistical process control\nCost reduction and avoidance\nSystems Engineering\nSystems Engineering management\nSoftware systems engineering\nQuality control\nFailure analysis\nTools of operations research\nMS Office\nAuto CAD\nExperience\n06/2014\n \nto \n06/2014\nEngineering Intern

In [None]:
# loading the index and retrieving the top 10 distinct profiles

index = faiss.read_index("faiss_index.index")

# Search results are mapped back to chunks by position, so the index on disk and
# the in-memory chunk list must describe the same corpus.
assert index.ntotal == len(main_docs), (
    "faiss_index.index does not match main_docs; rebuild the index from the current documents"
)


job_description = """
We are looking for a skilled UI Developer to join our dynamic team.
The ideal candidate will have a strong background in front-end development,
with proficiency in HTML, CSS, JavaScript, and modern frameworks like React or Angular.
Your primary responsibility will be to create visually appealing and user-friendly web interfaces that enhance user experience and align with our brand guidelines
"""

TOP_K_PROFILES = 10
# Every resume contributes several chunks, so the nearest chunks often belong to
# the same resume. Over-fetch chunk hits (5x is a heuristic) and de-duplicate by
# source file to end up with distinct profiles.
CHUNKS_TO_FETCH = max(1, min(TOP_K_PROFILES * 5, index.ntotal))

# input query vector embedding
xq = embeddings.encode([job_description])
D, I = index.search(xq, k=CHUNKS_TO_FETCH)  # search


# Walk the hits from nearest to farthest, keeping the first chunk of each resume
result_docs = []
seen_sources = set()
for i in I[0]:
  if i < 0:
    # FAISS fills missing neighbours with -1; indexing main_docs with it would
    # silently return the last chunk instead of a real match.
    continue
  doc = main_docs[i]
  source = doc.metadata['source']
  if source in seen_sources:
    continue
  seen_sources.add(source)
  result_docs.append(doc)
  if len(result_docs) == TOP_K_PROFILES:
    break

for doc in result_docs:
  print(doc.metadata['source'])

/content/drive/MyDrive/LLM_Assignment/files/data/data/INFORMATION-TECHNOLOGY/83816738.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/HR/18297650.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/ENGINEERING/28630325.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/DESIGNER/37664296.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/HR/18297650.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/INFORMATION-TECHNOLOGY/46260230.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/BANKING/71422121.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/HR/18297650.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/INFORMATION-TECHNOLOGY/13405733.pdf
/content/drive/MyDrive/LLM_Assignment/files/data/data/INFORMATION-TECHNOLOGY/16899268.pdf


In [None]:
# Distances returned by the index.search call above, one row per query vector;
# the entries are in ascending order, matching the order of the ids in I.
D

array([[0.7366694 , 0.81531   , 0.83211493, 0.8358056 , 0.8713792 ,
        0.90797246, 0.9251096 , 0.9397651 , 0.94202435, 0.9471934 ]],
      dtype=float32)