In [43]:
# Install the third-party packages imported by the cells below (langchain, numpy, boto3, PyPDF2).
%pip install langchain --quiet
%pip install numpy --quiet
%pip install boto3 --quiet
%pip install PyPDF2 --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [44]:
import os
import json
import boto3
from langchain.embeddings import BedrockEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.llms.bedrock import Bedrock
from langchain.embeddings import BedrockEmbeddings
from numpy import dot
from numpy.linalg import norm
from PyPDF2 import PdfReader

In [45]:
## Smoke test: send one prompt straight to Bedrock through boto3 and print the completion

bedrock_model_id = "ai21.j2-ultra-v1"  # foundation model to invoke
prompt = "What is the largest city in New Hampshire?"  # text sent to the model

# AWS profile, region and endpoint come from the BWB_* environment variables
session = boto3.Session(profile_name=os.environ.get("BWB_PROFILE_NAME"))
bedrock = session.client(
    service_name="bedrock-runtime",
    region_name=os.environ.get("BWB_REGION_NAME"),
    endpoint_url=os.environ.get("BWB_ENDPOINT_URL"),
)

# Request payload in the AI21 format
request_payload = {
    "prompt": prompt,
    "maxTokens": 1024,
    "temperature": 0,
    "topP": 0.5,
    "stopSequences": [],
    "countPenalty": {"scale": 0},
    "presencePenalty": {"scale": 0},
    "frequencyPenalty": {"scale": 0},
}
body = json.dumps(request_payload)

# Send the payload to Bedrock and decode the JSON body of the reply
response = bedrock.invoke_model(
    body=body,
    modelId=bedrock_model_id,
    accept="application/json",
    contentType="application/json",
)
response_body = json.loads(response.get("body").read())

# The generated text sits at completions[0].data.text in the response JSON
first_completion = response_body.get("completions")[0]
response_text = first_completion.get("data").get("text")

print(response_text)


Manchester is the largest and most populous city in New Hampshire.


In [46]:
## create the LLM
def get_llm():
    """Build the LangChain Bedrock LLM client used for answering questions.

    The AWS profile, region and endpoint are read from the BWB_PROFILE_NAME,
    BWB_REGION_NAME and BWB_ENDPOINT_URL environment variables; a variable
    that is not set is passed through as None.

    Returns:
        A Bedrock LLM object configured for the "ai21.j2-ultra-v1" model.
    """
    return Bedrock(
        model_id="ai21.j2-ultra-v1",  # foundation model
        model_kwargs={  # inference parameters in the AI21 format
            "maxTokens": 2000,
            "temperature": 0,
            "topP": 0.5,
            "stopSequences": [],
            "countPenalty": {"scale": 0},
            "presencePenalty": {"scale": 0},
            "frequencyPenalty": {"scale": 0},
        },
        credentials_profile_name=os.environ.get("BWB_PROFILE_NAME"),  # AWS credentials profile
        region_name=os.environ.get("BWB_REGION_NAME"),  # AWS region
        endpoint_url=os.environ.get("BWB_ENDPOINT_URL"),  # custom endpoint, if any
    )

In [47]:
## called in a loop, once per resume; the resume paths differ only by their ending

def get_index(file): #creates and returns an in-memory vector store to be used in the application
    """Build an in-memory FAISS vector index from the text of one PDF.

    Args:
        file: The PDF to index. It is handed unchanged to PyPDF2's PdfReader.

    Returns:
        The index object produced by VectorstoreIndexCreator.from_documents.

    Raises:
        ValueError: If no text could be extracted from any page of the PDF.
    """
    from langchain.docstore.document import Document #local import: only needed here to wrap the extracted page text

    embeddings = BedrockEmbeddings(
        credentials_profile_name=os.environ.get("BWB_PROFILE_NAME"), #sets the profile name to use for AWS credentials (if not the default)
        region_name=os.environ.get("BWB_REGION_NAME"), #sets the region name (if not the default)
        endpoint_url=os.environ.get("BWB_ENDPOINT_URL"), #sets the endpoint URL (if necessary)
    ) #create the Bedrock embeddings client

    # PdfReader is a PyPDF2 parser, not a LangChain document loader: it has no
    # load() method, so it cannot be given to from_loaders(). Extract the text
    # page by page and wrap each page in a LangChain Document instead.
    reader = PdfReader(file)
    documents = []
    for page_number, page in enumerate(reader.pages): #page numbers are 0-based
        page_text = page.extract_text() or ""
        if page_text.strip(): #skip pages with no extractable text
            documents.append(
                Document(
                    page_content=page_text,
                    metadata={"source": str(file), "page": page_number},
                )
            )

    if not documents:
        # Fail with a clear message instead of an obscure error from the vector store
        raise ValueError("No extractable text found in PDF: %s" % (file,))

    text_splitter = RecursiveCharacterTextSplitter( #create a text splitter
        separators=["\n\n", "•", ".", " "], #split chunks at (1) paragraph, (2) bullet, (3) sentence, or (4) word, in that order
        chunk_size=1000, #divide into 1000-character chunks using the separators above
        chunk_overlap=100 #number of characters that can overlap with previous chunk
    )

    index_creator = VectorstoreIndexCreator( #create a vector store factory
        vectorstore_cls=FAISS, #use an in-memory vector store for demo purposes
        embedding=embeddings, #use the Bedrock embeddings client created above
        text_splitter=text_splitter, #use the recursive text splitter
    )

    return index_creator.from_documents(documents) #split, embed and index the extracted pages

In [48]:
def get_rag_response(index, question): #rag client function
    """Answer a question against a vector index using the Bedrock LLM.

    Args:
        index: Index object exposing ``query(question=..., llm=...)``, such as
            the value returned by get_index.
        question: The question text to ask.

    Returns:
        Whatever ``index.query`` returns for the question.
    """
    language_model = get_llm()

    # Delegate retrieval and answering to the index's query method
    return index.query(question=question, llm=language_model)

In [49]:
## main function

# Shared embeddings client used by EmbedItem. It is configured from the same
# BWB_* environment variables as the other Bedrock clients in this notebook so
# that all of them use the same profile, region and endpoint; a variable that
# is not set is passed as None.
belc = BedrockEmbeddings(
    credentials_profile_name=os.environ.get("BWB_PROFILE_NAME"), #sets the profile name to use for AWS credentials (if not the default)
    region_name=os.environ.get("BWB_REGION_NAME"), #sets the region name (if not the default)
    endpoint_url=os.environ.get("BWB_ENDPOINT_URL"), #sets the endpoint URL (if necessary)
)

class EmbedItem:
    """A piece of text together with its embedding vector and a position label.

    The embedding is computed in the constructor by calling
    ``belc.embed_query`` on the text.
    """

    def __init__(self, text, position):
        vector = belc.embed_query(text)  # embed once, at construction time
        self.text, self.embedding, self.position = text, vector, position

class ComparisonResult:
    """Plain record holding a text, its similarity score and its position label."""

    def __init__(self, text, similarity, position):
        self.text, self.similarity, self.position = text, similarity, position
        
def calculate_similarity(a, b): #See Cosine Similarity: https://en.wikipedia.org/wiki/Cosine_similarity
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product of
    their norms.
    """
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product

def run(files):
    """Query each resume with a fixed extraction prompt and embed the answers.

    For every file, an index is built with get_index, the extraction prompt is
    answered with get_rag_response, and the answer is wrapped in an EmbedItem.

    Args:
        files: Iterable of resume files; each one is passed to get_index.

    Returns:
        A list with one EmbedItem per file, in input order. Positions are
        1-based: the first file gets position 1.
    """
    # The prompt is the same for every resume, so it is built once outside the loop.
    input_text = "Extract the technical skills which is only supported by the experiences listed in the resume, and the language, candidate name, sex, country and education of the resume. And filter out the skills without being supported by experiences."

    candidates = []
    # enumerate supplies the 1-based position, so no manual counter is needed
    for position, file in enumerate(files, start=1):
        index = get_index(file)
        response_content = get_rag_response(index=index, question=input_text)
        candidates.append(EmbedItem(response_content, position))
    return candidates

def compare(job_description, items):
    """Rank items by cosine similarity to a job description.

    Args:
        job_description: Text of the job description; it is embedded by
            wrapping it in an EmbedItem.
        items: Iterable of objects exposing ``text``, ``embedding`` and
            ``position`` attributes (for example the list returned by run).

    Returns:
        A list of strings, most similar first. Each string is the similarity
        formatted to six decimals, a tab character, and "resume" followed by
        the item's position.
    """
    e1 = EmbedItem(job_description, 0) #the job description is given position 0

    cosine_comparisons = [
        ComparisonResult(e2.text, calculate_similarity(e1.embedding, e2.embedding), e2.position)
        for e2 in items
    ]
    cosine_comparisons.sort(key=lambda x: x.similarity, reverse=True) # list the closest matches first

    # Each row is one string: list.append accepts a single argument, and %s
    # formats the position whether it is an int or a str.
    return ["%.6f\tresume%s" % (c.similarity, c.position) for c in cosine_comparisons]