In [1]:
from openai import OpenAI
from config import OPENAI_API_KEY, AZURE_OPENAI_API_KEY,AZURE_API_ENDPOINT,API_VERSION,AZURE_MODEL_NAME, MODEL_NAME
from openai import AzureOpenAI
import pandas as pd
from transformers import GPT2TokenizerFast

# Azure OpenAI Client
azure_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY, 
    api_version=API_VERSION,
    azure_endpoint = AZURE_API_ENDPOINT
  )  

# Helper Functions
def get_embedding(client,text):
    response = client.embeddings.create(
        input = text,  
        model="text-embedding-3-large", # replace with small model
    )
    return response.data[0].embedding
  
def cosine_similarity(embedding1, embedding2):
    dot_product = sum(embedding1[i] * embedding2[i] for i in range(len(embedding1)))
    magnitude1 = sum(x**2 for x in embedding1)**0.5
    magnitude2 = sum(x**2 for x in embedding2)**0.5
    return dot_product / (magnitude1 * magnitude2)
  
  
def answer_question(client,persona, question, example=""):
  completion = client.chat.completions.create(
    model=AZURE_MODEL_NAME,
    messages=[
      {"role": "system", "content": f"{persona}"},
      {"role": "user", "content": f"{question}"},
      {"role": "assistant", "content": f" You can use the following information as an example: {example}"},
    ],
  )
  return completion.choices[0].message.content


def add_embedding_to_db(collection_of_resumes_db,embedding,text,id):
    collection_of_resumes_db.add(
        embeddings = [embedding],
        documents = [text],
        ids = [id]
    )

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Calculate the number of tokens
resume_raw_data_df = pd.read_csv('Resume.csv')
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
resume_raw_data_df['number_tokens'] = resume_raw_data_df.Resume_str.apply(lambda x: len(tokenizer.encode(x)))
resume_raw_data_df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (1314 > 1024). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

In [3]:
# Chunking data into roughly around 300 tokens

# Using characters for chunking than tokens, as you can do list extraction based on character and not tokens
resume_latest_data_df = pd.DataFrame(columns=["ID", "Resume_str", "number_tokens"])

# resume_raw_data_df['ID']
max_tokens = 1200 # characters :  equivalent of 300 tokens 
counter = 0
new_index=0

for index, row in resume_raw_data_df.iterrows():
    counter = 0
    number_of_split = (row['number_tokens'])*4 // max_tokens # multiplying by 4 as each token is apporx 4 characters    
    for i in range(number_of_split+2):
        row_value = row['Resume_str'][counter: max_tokens+counter]
        row_token =len(tokenizer.encode(row_value))
        resume_latest_data_df.loc[new_index] = [resume_raw_data_df['ID'][index],row_value,row_token]
        counter=max_tokens+counter
        new_index+=1

resume_latest_data_df["index"] =[i for i in range(1, resume_latest_data_df.shape[0]+1)]
resume_latest_data_df.head()

Unnamed: 0,ID,Resume_str,number_tokens,index
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,318,1
1,16852973,"City , State Helps to develop policies...",217,2
2,16852973,. Advanced Medical Claims Analyst ...,275,3
3,16852973,"ng and Advertising, working on public relation...",366,4
4,16852973,"ainte Genevieve Senior High － City , St...",140,5


In [None]:
# EXTRA
# trial_Df = resume_latest_data_df.loc[resume_latest_data_df['ID'] == 18297650]
# trial_Df
# for index, row in trial_Df.iterrows():
#     print(row['Resume_str'])

In [4]:
# Generate Embeddings
resume_latest_data_df['embeddings']=resume_latest_data_df.Resume_str.apply(lambda x: get_embedding(azure_client,x))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
resume_latest_data_df.to_pickle('resume_data_pickle.pkl')    #to save the dataframe, df to 123.pkl


In [3]:
# TRY STORING PICKLE AND THEN DATABASE
df1 = pd.read_pickle('resume_data_pickle.pkl')
df1.head()

Unnamed: 0.1,Unnamed: 0,ID,Resume_str,number_tokens,index,embeddings
0,0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,318,1,"[-0.02123790979385376, -0.016999313607811928, ..."
1,1,16852973,"City , State Helps to develop policies...",217,2,"[-0.006911890115588903, -0.021714137867093086,..."
2,2,16852973,. Advanced Medical Claims Analyst ...,275,3,"[-0.028307681903243065, 0.041269928216934204, ..."
3,3,16852973,"ng and Advertising, working on public relation...",366,4,"[-0.020977875217795372, 0.0025092707946896553,..."
4,4,16852973,"ainte Genevieve Senior High － City , St...",140,5,"[-0.009055311791598797, 0.016410766169428825, ..."


In [4]:
resume_latest_data_df.to_csv("resume_latest_data_df.csv")

NameError: name 'resume_latest_data_df' is not defined

In [5]:
resume_latest_data_df = pd.read_csv('resume_data_embeddings.csv') #convert string stored embeddings back to list

In [4]:
from ast import literal_eval
resume_latest_data_df = pd.read_csv('resume_data_embeddings.csv', converters={'embeddings': literal_eval}) #convert string stored embeddings back to list

In [5]:
# EXTRA
# len(resume_latest_data_df)
resume_latest_data_df.head()



Unnamed: 0.1,Unnamed: 0,ID,Resume_str,number_tokens,index,embeddings
0,0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,318,1,"[-0.02123790979385376, -0.016999313607811928, ..."
1,1,16852973,"City , State Helps to develop policies...",217,2,"[-0.006911890115588903, -0.021714137867093086,..."
2,2,16852973,. Advanced Medical Claims Analyst ...,275,3,"[-0.028307681903243065, 0.041269928216934204, ..."
3,3,16852973,"ng and Advertising, working on public relation...",366,4,"[-0.020977875217795372, 0.0025092707946896553,..."
4,4,16852973,"ainte Genevieve Senior High － City , St...",140,5,"[-0.009055311791598797, 0.016410766169428825, ..."


In [9]:
# Below 3 lines added for older python version
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

chroma_client = chromadb.PersistentClient(
    path="vector_database",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

collection_of_resumes_db = chroma_client.get_or_create_collection(name="resume_vector")

# student_info = """
# Alex, a 19-year-old computer science sophomore with a 3.7 GPA
# """
# colour_info = """
# Alex's favourite color is Red
# """
# embedding_1 = get_embedding(azure_client,student_info)

# embedding_2 = get_embedding(azure_client,colour_info)

# collection_of_resumes_db.add(
#     embeddings = [embedding_1],
#     documents = [student_info],
#     ids = ["id1"]
# )


# collection = chroma_client.get_or_create_collection(name="resume_vector")
# col_exists = True
# try:
#     coll =  .get_collection(name="resume_vector")
# except:
#     col_exists = False


print(collection_of_resumes_db.count())
for index, row in df1.iterrows():
    collection_of_resumes_db.add(
        embeddings = [row["embeddings"]],
        documents = [row["Resume_str"]],
        ids = [str(row["index"])]
    )
print(collection_of_resumes_db.count())

# resume_latest_data_df.apply(lambda x: add_embedding_to_db(collection_of_resumes_db, x.embeddings, x.Resume_str, str(x.index)), axis=1)

0
15841


In [None]:
# EXTRA
# for index, row in resume_latest_data_df.iterrows():
#     if row["index"]==10 or row["index"]==11:
#         print(row['Resume_str'])

In [14]:
# EXTRA
# collection_of_resumes_db.peek()

# all_documents = collection_of_resumes_db.get()['documents']
# len(all_documents)

15844

In [None]:
question = """Does any candidate already has Python coding experience? 
Please summarise their experience and provide their resume id. 
Also provide ids of other matching resumes"""

questions_embedding = get_embedding(azure_client, question)

results = collection_of_resumes_db.query(
    query_embeddings=[questions_embedding],
    n_results=5
)

best_result_document = results.get('documents')[0][0]

best_result_index = int(results.get('ids')[0][0])
best_result_id = resume_latest_data_df.loc[(resume_latest_data_df.index==best_result_index)]['ID']
best_result_id = int(best_result_id.iloc[0])

list_of_resume_ids = []
for i in range(1,5):
    other_result_index = int(results.get('ids')[0][i])
    other_result_id = resume_latest_data_df.loc[(resume_latest_data_df.index==other_result_index)]['ID']
    list_of_resume_ids.append((other_result_id.iloc[0]))

string_of_resume_ids = ','.join(map(str, list_of_resume_ids))
best_result = best_result_document + f". The id of this specific resume is {best_result_id}. Also, other matching resumes are {string_of_resume_ids}"
print(best_result)


In [6]:
persona = """
You are expert in reviewing resumes and identifying the best candidates for a role. You are an advisor to the HR team, especially in answering any queries
"""
persona=persona+best_result
response = answer_question(client=azure_client, question=question, persona=persona)
print(response)

Yes, the candidate with the resume ID 50328713 has Python coding experience. Here's a summary of their experience:

- The candidate has listed Python as one of their skills, indicating they are conversant with the language.
- They have experience with machine learning tools and libraries such as Scikit-learn, Pandas, Seaborn, matplotlib, and basic working knowledge of TensorFlow.
- They built a machine learning model using the XGBoost algorithm that achieved a 77.5% accuracy rate in the Kaggle Titanic challenge. This demonstrates practical application of their Python skills in a project setting.

The resume ID for this candidate is 50328713.

The other matching resumes you asked for are:

- 32985311
- 12011623
- 14871762
- 11813872

Please note these ids are given without context or content; whether they have Python experience is not determinable from the information provided here. If their resumes also list Python as a skill or describe Python-based projects, those candidates would ha

# TODO
- Re-arrange the code and the function order
- Update the name of the columns in the dataframe
- Show both approaches ie with and without vector DB (without DB shows more mathematics)
- Decide what to do with csvs