In [2]:
from openai import OpenAI
from config import OPENAI_API_KEY, AZURE_OPENAI_API_KEY,AZURE_API_ENDPOINT,API_VERSION,AZURE_MODEL_NAME, MODEL_NAME
from openai import AzureOpenAI
import pandas as pd
from transformers import GPT2TokenizerFast

# Azure OpenAI Client
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY, 
    api_version=API_VERSION,
    azure_endpoint = AZURE_API_ENDPOINT
  )  

# Helper Functions
def get_embedding(text):
    response = client.embeddings.create(
        input = text,  
        model="text-embedding-3-large", # replace with small model
    )
    return response.data[0].embedding
  
def cosine_similarity(embedding1, embedding2):
    dot_product = sum(embedding1[i] * embedding2[i] for i in range(len(embedding1)))
    magnitude1 = sum(x**2 for x in embedding1)**0.5
    magnitude2 = sum(x**2 for x in embedding2)**0.5
    return dot_product / (magnitude1 * magnitude2)
  
  
def answer_question(persona, question, example=""):
  completion = client.chat.completions.create(
    model=AZURE_MODEL_NAME,
    messages=[
      {"role": "system", "content": f"{persona}"},
      {"role": "user", "content": f"{question}"},
      {"role": "assistant", "content": f" You can use the following information as an example: {example}"},
    ],
  )
  return completion.choices[0].message.content

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
# Calculate the number of tokens
resume_raw_data_df = pd.read_csv('Resume.csv')
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
resume_raw_data_df['number_tokens'] = resume_raw_data_df.Resume_str.apply(lambda x: len(tokenizer.encode(x)))
resume_raw_data_df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (1314 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,ID,Resume_str,Resume_html,Category,number_tokens
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,1314
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,1314
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,1759
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,676
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,1892


In [12]:
# Chunking data into roughly around 300 tokens

resume_latest_data_df = pd.DataFrame(columns=["ID", "Resume_str", "number_tokens"])

# resume_raw_data_df['ID']
max_tokens = 1200 # equivalent of 300 tokens
counter = 0
new_index=0

for index, row in resume_raw_data_df.iterrows():
    number_of_split = row['number_tokens'] // max_tokens
    for i in range(number_of_split):
        row_value = row['Resume_str'][counter: max_tokens+counter]
        row_token =len(tokenizer.encode(row_value))
        resume_latest_data_df.loc[new_index] = [resume_raw_data_df['ID'][index],row_value,row_token]
        counter=max_tokens+counter
        new_index+=1
    

resume_latest_data_df["index"] =[i for i in range(1, resume_latest_data_df.shape[0]+1)]
resume_latest_data_df.head()

Unnamed: 0,ID,Resume_str,number_tokens,index
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,318,1
1,22323967,"Company Name － City , State Post...",253,2
2,33176873,"or in the healthcare field, I managed the fron...",266,3
3,17812897,"r Peak volume, which consisted of over 1800 to...",203,4
4,11592605,"erformance Evaluation Notification, NJIT - Ove...",140,5


In [13]:
# Generate Embeddings
resume_latest_data_df['embeddings']=resume_latest_data_df.Resume_str.apply(lambda x: get_embedding(x))
resume_latest_data_df.head()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,ID,Resume_str,number_tokens,index,embeddings
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,318,1,"[-0.02123790979385376, -0.016999313607811928, ..."
1,22323967,"Company Name － City , State Post...",253,2,"[-0.01917235367000103, -0.013079637661576271, ..."
2,33176873,"or in the healthcare field, I managed the fron...",266,3,"[-0.0013173794141039252, 0.010919589549303055,..."
3,17812897,"r Peak volume, which consisted of over 1800 to...",203,4,"[-0.03311477601528168, -0.005292537156492472, ..."
4,11592605,"erformance Evaluation Notification, NJIT - Ove...",140,5,"[-0.019211573526263237, 0.001668125856667757, ..."


# TODO
- Update the name of the columns in the dataframe