# LangChain Version of GPT Resume Search Tool

## Installs

In [None]:
# Install/upgrade the packages this notebook relies on: LangChain, the FAISS
# and Chroma vector-store backends, the OpenAI client, and tiktoken.
# NOTE(review): versions are unpinned while the cells below use legacy entry
# points (e.g. `langchain.document_loaders`, `openai.Model.list()`) — confirm
# they still exist in whatever `--upgrade` resolves to, or pin known-good versions.
!python3 -m pip install --upgrade langchain faiss-cpu chromadb openai tiktoken

In [None]:
# pypdf — presumably the PDF parsing backend required by PyPDFLoader, which is
# used below to read the resume books (TODO confirm).
!pip install pypdf

## Imports and Google Drive Mount

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# API-key file and the resume PDFs stored under "MyDrive/Colab Notebooks" can
# be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import getpass
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

In [None]:
import argparse, json, time, datetime, openai
from pathlib import Path

In [None]:
def set_open_ai_key(env_path=None):
  """Load the OpenAI API key from a JSON file and check it with one API call.

  The file at ``env_path`` must hold a JSON object with an
  ``"OPENAI_API_KEY"`` entry. The key is written to
  ``os.environ["OPENAI_API_KEY"]`` and to ``openai.api_key`` before the
  check request is issued.

  Args:
    env_path: Location of the JSON key file (``str`` or ``pathlib.Path``).
      ``None`` (the default) is reported as a missing path.

  Returns:
    ``True`` if the file was read, the key was set and the check request
    completed without raising; ``False`` otherwise. Failures are printed,
    never raised, so a notebook cell calling this does not abort.
  """
  if env_path is None:
    # Give an actionable message instead of the TypeError that open(None) raises.
    print('No key file given: pass env_path pointing to a JSON file '
          'with an "OPENAI_API_KEY" entry.')
    return False
  try:
    with open(env_path, "r", encoding="utf-8") as f:
      env_vars = json.load(f)
    os.environ["OPENAI_API_KEY"] = env_vars["OPENAI_API_KEY"]
    openai.api_key = os.environ["OPENAI_API_KEY"]
    # Issue one API request so a rejected key surfaces now; any exception it
    # raises is reported by the handler below.
    openai.Model.list()
    return True
  except Exception as e:  # notebook boundary: report every failure, do not raise
    print(e)
  return False

## Setup

In [None]:
# setup API key
# Reset the module-level key first; set_open_ai_key() assigns the real one.
openai.api_key = None
cwd = Path.cwd()
# resume_path = cwd / "Resumes"
# resume_path.mkdir(exist_ok=True)

# Key file location, resolved relative to the current working directory.
openai_env_path = cwd / "drive/MyDrive/Colab Notebooks/openai.env"
set_open_ai_key(openai_env_path)

## Load and Parse Resume Books

In [None]:
#cwd = Path.cwd()
#output_path = cwd / "drive/MyDrive/Colab Notebooks/Output"

In [None]:
# Create the OpenAI embeddings client (passed to the Chroma index further down)
# and record where the two PDF resume books live on the mounted Google Drive.
embeddings = OpenAIEmbeddings()
resume_path1 = "/content/drive/MyDrive/Colab Notebooks/resume_books/resume_book_2022.pdf"
resume_path2 = "/content/drive/MyDrive/Colab Notebooks/resume_books/GDI2022ResumeBook.pdf"

In [None]:
def load_resumes(path, skip_pages):
  """Load a PDF resume book and drop its leading non-resume documents.

  `path` is the PDF location handed to PyPDFLoader; `skip_pages` is how many
  documents to discard from the front of what `load_and_split()` returns.
  NOTE(review): the skip is applied to the split output — confirm that one
  returned document corresponds to one PDF page for these books.
  """
  documents = PyPDFLoader(path).load_and_split()
  return documents[skip_pages:]

In [None]:
# Load the first resume book, discarding its first 2 documents: per the
# author's note, the actual resumes start on page 2 of this PDF compilation.
r1 = load_resumes(resume_path1, 2)

In [None]:
# Load the second resume book, discarding its first document: per the
# author's note, the actual resumes start on page 1 of this PDF compilation.
r2 = load_resumes(resume_path2, 1)

In [None]:
# Display the first document kept from the first resume book.
r1[0]

In [None]:
# Experiment: attach a hand-entered candidate name to the first document's
# metadata. NOTE(review): hard-coded for this one document only; no other
# document receives a "name" entry.
r1[0].metadata["name"] = "YIN FU"

In [None]:
# Display the second document from the first resume book for comparison.
r1[1]

In [None]:
# Display the first document again, e.g. to check the "name" metadata entry
# assigned to r1[0].
r1[0]

In [None]:
# Combine both resume books into one sequence: documents of r1 first, then r2.
resumes = r1+r2

In [None]:
# Number of documents across both resume books.
len(resumes)

In [None]:
# start with one resume book
#loader = PyPDFLoader(resume_path1)
#pages = loader.load_and_split()

In [None]:
# actual resumes start on page 2 of this pdf compilation
#resumes = pages[2:]

## Chunk Resumes

In [None]:
# split the documents into chunks
#text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
#texts = text_splitter.split_documents(resumes)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter configuration: target chunk size of 500 (as measured by `len`)
# with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # deliberately small, just to test
    chunk_overlap=0,
    length_function=len,
)

In [None]:
# Split every resume document into chunks with `text_splitter`, then print the
# first chunk as a quick sanity check.
texts = text_splitter.split_documents(resumes)
print(texts[0])

## Use Vector Stores to create embeddings and perform similarity search

TODO: figure out which vector store is best to use — FAISS vs. Chroma.

In [None]:
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma

### Using FAISS

In [None]:
# Embed every chunk and build a FAISS index over the resulting vectors.
# Uses the shared `embeddings` client (the same object the Chroma index is
# built with) instead of constructing a second OpenAIEmbeddings instance.
faiss_index = FAISS.from_documents(texts, embeddings)

In [None]:
# Retrieve the 8 chunks the FAISS index ranks as most similar to the phrase.
docs = faiss_index.similarity_search("knows statistics", k=8)

In [None]:
# One summary line per hit: page number, first 100 characters of the chunk,
# and the source file, each separated by ": ".
for doc in docs:
    #print(str(doc.metadata["page"]) + ":", doc.page_content)
    meta = doc.metadata
    print(f'{meta["page"]}: {doc.page_content[:100]}: {meta["source"]}')

In [None]:
# Display the sixth hit (index 5) in full.
docs[5]

## Call OpenAI to use GPT to answer questions based on Resumes

In [None]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [None]:
# Natural-language question to ask about the resumes; the commented-out line
# is an earlier phrasing kept for reference.
#question = "Who knows statistics?"
question = "which resumes have statistics skills?"

In [None]:
# test different prompts
# `context` is an output-format instruction appended to the question; the
# commented-out lines are alternative wordings that were tried.
# NOTE: a later cell reassigns `context`, so this value only takes effect if
# that cell is not run.
context = " Make a table of results from the documents given with columns indicating page and source."
#context = " Make a list of the page numbers from the documents given, using the 'page' metadata, remove all duplicates from the list"
#context = " make a table using the document metadata table with columns: page, source"
#context = " Make a list from the documents given."
#context = " return the document metadata"

In [None]:
# make output in json with the following source, page, knows statistics
# Overrides the table-style `context` with a JSON-format instruction; the
# commented-out lines are earlier wordings of the same request.
# NOTE(review): the prompt asks for keys page_id / source_file, while the
# document metadata read elsewhere in this notebook uses "page" / "source" —
# confirm the model maps these as intended.
#context = " Generate a list of resulting resumes with page numbers and source file information, provide them in JSON format with the following keys: page_id, source_file"
#context = " Generate a list of resulting resumes with their metadata fields, provide them in JSON format with the following keys: page_id, source_file"
context = " Provide resulting resumes in JSON format with the following keys from the document metadata fields: page_id, source_file"

In [None]:
# Final prompt: the question followed by the formatting instruction (each
# `context` string starts with a space, so the two parts join cleanly).
query = question + context
print(query)

In [None]:
# Build a QA-with-sources chain of type "stuff" around an OpenAI LLM created
# with temperature=0.
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")

In [None]:
# Run the chain on the FAISS hits in `docs` with the combined prompt.
# return_only_outputs=True presumably omits the inputs from the returned dict
# (TODO confirm against the LangChain Chain docs).
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

In [None]:
#best one?
# Same call as the previous cell, kept as a separate cell so its output can be
# compared after changing the prompt cells.
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

### Using Chroma

In [None]:
# chain = load_qa_with_sources_chain(llm=OpenAI(), chain_type="stuff")
# chain({"input_documents": docs, "question": query}, return_source_documents=True)

In [None]:
# db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)

# retriever = db.as_retriever()
# retriever.search_kwargs['distance_metric'] = 'cos'
# retriever.search_kwargs['k'] = 4

# qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever, return_source_documents=False)

# # What was the restaurant the group was talking about called?
# query = input("Enter query:")

# # The Hungry Lobster
# ans = qa({"query": query})

# print(ans)

In [None]:
# def qa(file, query, chain_type, k):
#     # load document
#     loader = PyPDFLoader(file)
#     documents = loader.load()
#     # split the documents into chunks
#     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
#     texts = text_splitter.split_documents(documents)
#     # select which embeddings we want to use
#     embeddings = OpenAIEmbeddings()
#     # create the vectorstore to use as the index
#     db = Chroma.from_documents(texts, embeddings)
#     # expose this index in a retriever interface
#     retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
#     # create a chain to answer questions
#     qa = RetrievalQA.from_chain_type(
#         llm=OpenAI(), chain_type=chain_type, retriever=retriever, return_source_documents=True)
#     result = qa({"query": query})
#     print(result['result'])
#     return result

In [None]:
# create the vectorstore to use as the index
# (built from the same chunks in `texts`, using the shared `embeddings` client)
db = Chroma.from_documents(texts, embeddings)
# expose this index in a retriever interface
# (similarity search, returning k=8 results per query)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 8})

In [None]:
# create a chain to answer questions
# RetrievalQA fetches documents through `retriever` and passes them to an
# OpenAI LLM (temperature=0) via a "stuff" chain; source documents are not
# requested, so only the answer under result['result'] is printed.
qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), chain_type="stuff", retriever=retriever, return_source_documents=False)
result = qa({"query": query})
print(result['result'])

In [None]:
# Display the prompt string currently stored in `query`.
query

In [None]:
# create a chain to answer questions
# Same construction and call as the previous RetrievalQA cell; presumably kept
# as a second cell to re-run after editing the prompt (TODO confirm, or remove
# the duplicate).
qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), chain_type="stuff", retriever=retriever, return_source_documents=False)
result = qa({"query": query})
print(result['result'])

In [None]:
# try prompt templates?