In [1]:
%pip install langchain --quiet
%pip install numpy --quiet
%pip install boto3 --quiet
%pip install faiss-gpu --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import boto3
from langchain.embeddings import BedrockEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.llms.bedrock import Bedrock
from langchain.embeddings import BedrockEmbeddings
from numpy import dot
from numpy.linalg import norm
from langchain.document_loaders import PyPDFLoader

In [3]:
## customize and create LLM

def get_llm():
    """Build the Bedrock LLM client used to answer questions.

    The AWS profile, region and endpoint URL are read from the
    BWB_PROFILE_NAME, BWB_REGION_NAME and BWB_ENDPOINT_URL environment
    variables; a variable that is not set is passed through as None.

    Returns:
        A Bedrock LLM wrapper configured for the "ai21.j2-ultra-v1" model.
    """
    env = os.environ.get

    # Inference parameters for the AI21 model selected below (not Claude).
    ai21_params = {
        "maxTokens": 2000,
        "temperature": 0,
        "topP": 0.5,
        "stopSequences": [],
        "countPenalty": {"scale": 0},
        "presencePenalty": {"scale": 0},
        "frequencyPenalty": {"scale": 0},
    }

    return Bedrock(
        credentials_profile_name=env("BWB_PROFILE_NAME"),  # AWS credentials profile
        region_name=env("BWB_REGION_NAME"),  # AWS region
        endpoint_url=env("BWB_ENDPOINT_URL"),  # service endpoint
        model_id="ai21.j2-ultra-v1",  # foundation model
        model_kwargs=ai21_params,
    )

In [4]:
## create the helper that returns an in-memory vector index to be used in application

def get_index(file):
    """Create an in-memory FAISS vector index from one PDF file.

    Args:
        file: Path of the PDF; it is handed to PyPDFLoader unchanged.

    Returns:
        The index object produced by VectorstoreIndexCreator.from_loaders,
        intended to be cached by the calling application.
    """
    env = os.environ.get

    # Embeddings client; no model_id is given, so the library default is used.
    bedrock_embeddings = BedrockEmbeddings(
        credentials_profile_name=env("BWB_PROFILE_NAME"),
        region_name=env("BWB_REGION_NAME"),
        endpoint_url=env("BWB_ENDPOINT_URL"),
    )

    pdf_loader = PyPDFLoader(file)

    # Separators are listed in priority order: blank line, bullet character,
    # period, then single space. Chunks target 1000 characters and may share
    # up to 100 characters with the previous chunk.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "•", ".", " "],
        chunk_size=1000,
        chunk_overlap=100,
    )

    # Factory that wires the splitter and embeddings into a FAISS store.
    creator = VectorstoreIndexCreator(
        vectorstore_cls=FAISS,
        embedding=bedrock_embeddings,
        text_splitter=splitter,
    )

    return creator.from_loaders([pdf_loader])

In [5]:
## create the helper that gets the RAG response from the LLM

def get_rag_response(index, question):
    """Answer a question against a vector index using the Bedrock LLM.

    Args:
        index: Object exposing ``query(question=..., llm=...)``, such as the
            value returned by get_index.
        question: The question text sent to the index.

    Returns:
        Whatever ``index.query`` returns for the question.
    """
    answering_llm = get_llm()  # a new LLM client is built on every call
    return index.query(question=question, llm=answering_llm)

In [6]:
## Helpful classes

# Shared embeddings client used by EmbedItem. It reads the same BWB_*
# environment variables as get_index and get_llm so that every Bedrock call in
# this notebook targets the same profile, region and endpoint.
# A variable that is not set yields None; NOTE(review): None is presumably the
# library default for these fields, which would keep the previous behaviour
# when nothing is configured - confirm against the BedrockEmbeddings reference.
belc = BedrockEmbeddings(
    credentials_profile_name=os.environ.get("BWB_PROFILE_NAME"),
    region_name=os.environ.get("BWB_REGION_NAME"),
    endpoint_url=os.environ.get("BWB_ENDPOINT_URL"),
)

class EmbedItem:
    """A text together with its embedding vector and a position label.

    Attributes are set in ``__init__``:
        text: The text that was embedded.
        embedding: Result of ``belc.embed_query(text)``.
        position: Caller-supplied label (run() passes the resume number,
            compare() passes 0 for the job description).
    """

    def __init__(self, text, position):
        self.text = text
        # The embedding is requested eagerly, so constructing an item performs
        # one call on the shared module-level client.
        self.embedding, self.position = belc.embed_query(text), position

class ComparisonResult:
    """Plain record holding one similarity comparison.

    Attributes are set in ``__init__``:
        text: Text of the compared item.
        similarity: Similarity score for that item.
        position: Position label of the compared item.
    """

    def __init__(self, text, similarity, position):
        self.text, self.similarity, self.position = text, similarity, position
        
def calculate_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (any sequence accepted by numpy ``dot``/``norm``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. If either vector has zero norm
        (including empty vectors) the similarity is undefined, and 0.0 is
        returned instead of NaN so that results stay sortable.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # Dividing would give NaN plus a RuntimeWarning; NaN compares false
        # with everything and would make a descending sort unreliable.
        return 0.0
    return dot(a, b) / denominator

In [7]:
## Main function that extracts key information from uploaded resume

def run(n, resume_dir="~/home/jovyan/GenerativeAIBot"):
    """Summarise resumes 1..n with the LLM and embed each summary.

    For every i from 1 to n (inclusive) the file ``<resume_dir>/Resume<i>.pdf``
    is indexed with get_index, queried with a fixed extraction prompt through
    get_rag_response, and the answer is wrapped in an EmbedItem whose position
    is i.

    Args:
        n: Number of resumes to process. Values below 1 process nothing.
        resume_dir: Directory holding the ``Resume<i>.pdf`` files. Defaults to
            the location that was previously hard-coded.
            NOTE(review): the default begins with "~"; whether that is
            expanded depends on the PDF loader - confirm.

    Returns:
        A list of EmbedItem objects in resume order (empty when n < 1).
    """
    input_text = "Extract the technical skills which is only supported by the experiences listed in the resume, and the language, candidate name, sex, country and education of the resume. And filter out the skills without being supported by experiences."

    candidates = []
    for i in range(1, n + 1):
        index = get_index(f"{resume_dir}/Resume{i}.pdf")
        response_content = get_rag_response(index=index, question=input_text)
        candidates.append(EmbedItem(response_content, i))

    return candidates

In [8]:
## returns the similarity ranking between each resume and the input job description

def compare(job_description, items):
    """Rank embedded items by cosine similarity to a job description.

    Args:
        job_description: Text to compare against; it is embedded via
            EmbedItem with position 0.
        items: Iterable of objects exposing ``text``, ``embedding`` and
            ``position`` (for example the list returned by run()).

    Returns:
        A list of strings of the form ``"<similarity> - resume<position>"``,
        ordered from highest to lowest similarity.
    """
    query_item = EmbedItem(job_description, 0)

    def _score(candidate):
        # Pair a candidate with its similarity to the job description.
        similarity = calculate_similarity(query_item.embedding, candidate.embedding)
        return ComparisonResult(candidate.text, similarity, candidate.position)

    # sorted() is stable, so ties keep their input order just like list.sort().
    ranked = sorted(map(_score, items), key=lambda r: r.similarity, reverse=True)

    return [str(r.similarity) + " - resume" + str(r.position) for r in ranked]

In [9]:
# Manual testing of backend library    
    
# Job description to match against the resumes.
# NOTE(review): "analyte" looks like a typo for "analyst"; it is left as-is
# because this exact string is what gets embedded.
input_text = "knows java and have experience as a quality analyte"
# Summarise and embed Resume1.pdf through Resume3.pdf.
items = run(3)
# Rank the resumes by cosine similarity to the job description, best first.
response_content = compare(job_description=input_text, items=items)
# Each printed line has the form "<similarity> - resume<position>".
for line in response_content:
    print(line)

0.4980325748478376 - resume3
0.43871089659190554 - resume1
0.27346247389725076 - resume2
