In [3]:
# Libraries to use
import os
import pandas as pd
#import time
#import json
#import joblib
import pdfplumber
from dotenv import load_dotenv
from openai import OpenAI
from datetime import datetime
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")

# Load environment variables from .env file
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [29]:
# This is the System Prompt used as a prerequisit for generating a response.
System_Prompt_summary = '''
Task: Analyse Job Description and Employee resume and give 5 points summary as why this candidate is suitable for the given job role.

Objective: As a seasoned resume analysis expert, your responsibility is to examine resumes and examine Job Description and give 5 points summary as why this candidate is suitable for the given job role.

Output Format: Provide the results in JSON format using the structure below: Do not give any other information in the output.
  {
    "Name": "Candidate's Full Name",
    "Summary":"5 Points Summary"
  }
 
Example Output:
  {
    "Name": "Shriraj Pathak",
    "Summary":"5 Points Summary"
  }
'''

def get_text(pdf_path: str)-> str:
    '''
    Function to Extract Text and Tables from the Resume PDF's
    '''
    if not pdf_path.endswith(".pdf"):
        raise ValueError("Invalid file format: Please provide a valid .pdf file.")

    content = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                extracted_text = page.extract_text()
                content += extracted_text

                # Extracting table
                try:
                    tables = page.extract_tables()
                    for idx, table in enumerate(tables):
                        content += f"\nTable #{idx + 1}:\n"
                        for row in table:
                            content += ', '.join([str(_) for _ in row]) + '\n'
                except Exception as e:
                    print(f"Failed to extract table from page {page}. Error: {e}")
    except Exception as e:
        print(f"Error opening or reading PDF file at path '{pdf_path}'")
        raise e

    return content


def get_response(System_Prompt: str, final_resume_text: str, selected_model="gpt-4"):
    """
    Function used for generating response form OpenAI model
    Here we are Passing the System Prompt and Extracted text from resume.
    """

    client = OpenAI(api_key=OPENAI_API_KEY)
    time.sleep(1)

    if selected_model in ['gpt-4-turbo-preview', 
                          'gpt-3.5-turbo', 
                          'gpt-4-0125-preview', 
                          'gpt-4-1106-preview', 
                          'gpt-3.5-turbo-0125', 
                          'gpt-3.5-turbo-1106']:
        response_format = {"type": "json_object"}
    else:
        response_format = None

    try:
        response = client.chat.completions.create(
            model=selected_model,
            messages=[
                {"role": "system", "content": System_Prompt},
                {"role": "user", "content": final_resume_text}],
            response_format=response_format,
            temperature=0
            )
    except Exception as e:
        print(f"Error creating completion request for model '{selected_model}'")
        raise e

    return response.choices[0].message.content

In [30]:
pdf_path = '/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/Data_Resumes_PDF/Manoj Kumar Ch (1).pdf'
jd_path = '/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/pdf/SRE Cloud Native.pdf'

top_cand_text = get_text(pdf_path)
jd_text = get_text(jd_path)
response = get_response(System_Prompt_summary,f'This is the Job Description Text:\n\n{jd_text}\nThis is the text of resume:{top_cand_text}')
sum = json.loads(response)
cand_summary = sum['Summary'][0]

In [44]:
print(cand_summary)

Manoj Kumar Ch has over 7 years of experience in IT and middleware operations, with a focus on cloud native technologies, making him suitable for the SRE Cloud Native role.


In [45]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

#Load pre-trained embedding model (BERT-based)
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def read_files(path):
    candidates_df = pd.read_excel("./static/jd_skills/employee_skill_matrix.xlsx")
    jd_df = pd.read_excel(path)

    return candidates_df, jd_df

def find_similar_skill_2(current_skill,emp_skills_list,faiss_index,emp_skill_matrix):
        print('skill not in emp sk list ->',current_skill)
        new_address_embedding = model.encode([current_skill])
        new_address_embedding_np = np.array(new_address_embedding)
        # Perform nearest neighbor search
        k = 100  # Number of nearest neighbors to retrieve
        distances, indices = faiss_index.search(new_address_embedding_np, k)

        for idx, distance in zip(indices.flatten(), distances.flatten()):
                #print(emp_skill_matrix[idx],distance)
                if emp_skill_matrix[idx] in emp_skills_list and distance <= 80:
                        print(emp_skill_matrix[idx],distance)

                        return emp_skill_matrix[idx]


def main(path,requisition_id):
    candidates_df, jd_df = read_files(path)
    ###########################
    new_candidates_df = candidates_df.copy()
    new_candidates_df.Technology = new_candidates_df.Technology.apply(lambda x:x.lower())
    emp_skill_matrix = new_candidates_df.Technology.unique().tolist()
    ###########################

    experience = jd_df['Years_of_Experience_required'].iloc[0]

    candidates_df.Technology = candidates_df.Technology.apply(lambda x:x.lower())
    jd_df.Technology = jd_df.Technology.apply(lambda x:x.lower())

    required_technology = jd_df.Technology.to_list()
    candidates_df = candidates_df[candidates_df.Technology.str.contains('|'.join(required_technology))].reset_index(drop=True)

    experience_filter_df = candidates_df[candidates_df['Total_Experience_in_years'] >= experience]

    all_data = pd.DataFrame()
    for group_name, group_data in experience_filter_df.groupby(['Name', 'Total_Experience_in_years']):
        op = group_data[['Technology','Rating']].T
        op.columns = op.iloc[0]
        op.drop('Technology',inplace=True)
        op['Name'] = group_name[0]
        all_data = pd.concat([all_data,op],ignore_index=True)
        
    ddd = pd.DataFrame(columns=['Name'] + required_technology)
    res = pd.concat([ddd,all_data]).fillna(0)[['Name'] + required_technology]
        
    top_N_skills = 5
    filtered_df = res.iloc[:, :top_N_skills+1]
    ###################################

    skills_embeddings = model.encode(emp_skill_matrix)
    skills_embeddings_np = np.array(skills_embeddings)

    # Set up FAISS for nearest neighbor search
    dimension = skills_embeddings_np.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(skills_embeddings_np)

    ndf = filtered_df.copy()
    cdf = new_candidates_df.copy()
    final_final_df = pd.DataFrame()
    for emp in ndf.Name.unique():
        print(emp)
        # find emp from final skill matrx 
        gau_df = ndf[ndf['Name']==emp]

        # find columns with 0
        mask = gau_df == 0
        columns_with_zero = gau_df.columns[mask.any()]

        # find emp skills from overall emp skill matrix
        emp_skills_df = cdf[cdf['Name']==emp]
        emp_skills_list = emp_skills_df.Technology.to_list()

        # iterate through employee skill which is 0 
        for i in columns_with_zero.to_list():
            # find the similar skill
            sim_skill = find_similar_skill_2(i,emp_skills_list,faiss_index,emp_skill_matrix)
            # if similar skill value is not 0 and is present in employee skills then go in if loop
            if sim_skill not in columns_with_zero and sim_skill in emp_skills_list:
                # replace with similar skill rating
                gau_df[i]=emp_skills_df[emp_skills_df.Technology==sim_skill]['Rating'].values[0]
        final_final_df = pd.concat([final_final_df,gau_df])

    print(final_final_df)
    numeric_columns = final_final_df.select_dtypes(include=['number']).columns
    final_final_df[numeric_columns] = final_final_df[numeric_columns] / 2

    #final_final_df['Overall'] = final_final_df.iloc[:,1:].mean(axis=1).round(2)/10*top_N_skills
    final_final_df['Overall'] = final_final_df.iloc[:,1:].mean(axis=1).round(2)#/10*top_N_skills
    final_final_df.sort_values('Overall',ascending=False,ignore_index=True,inplace=True)
    filtered_df = final_final_df.copy()
    # capitalize
    filtered_df['Name'] = filtered_df['Name'].apply(lambda x: ' '.join(word.capitalize() for word in x.split()))
    filtered_df.columns = [col.capitalize() for col in filtered_df.columns]

    #print(filtered_df)
    filtered_df.to_excel(f'./static/Skill_matrix_as_per_JD/Skill_matrix_as_per_JD_{requisition_id}.xlsx',index=False)
    ###################################
    data_dict = []
    for i in range(len(filtered_df)):
        one_dict = {}
        one_dict = filtered_df.iloc[i].to_dict()
        one_dict['id'] = i+1
        data_dict.append(one_dict)
    print('data sent to ui')
    return data_dict

In [46]:
zz = main('/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/jd_skills/Top_Skills_Of_JD_sre01.xlsx','sre01')

ankit dagadu
skill not in emp sk list -> linux systems
linux 19.855038
skill not in emp sk list -> ansible
skill not in emp sk list -> git
kubernetes 78.78024
beri kiran kumar
skill not in emp sk list -> kubernetes
skill not in emp sk list -> linux systems
linux 19.855038
skill not in emp sk list -> git
github 36.383198
manoj kumar ch
skill not in emp sk list -> linux systems
shell scripting 70.963036
vijay ram katam
skill not in emp sk list -> linux systems
linux 19.855038
skill not in emp sk list -> ansible
skill not in emp sk list -> git
gitlab 30.977205
               Name  kubernetes  linux systems  ansible  terraform  git
0      ankit dagadu           8              7        0          8    8
1  beri kiran kumar           0              8        7          8    7
2    manoj kumar ch           8              7        8          8    8
3   vijay ram katam           9              8        0          8    8
data sent to ui


In [48]:
print(zz)

[{'Name': 'Manoj Kumar Ch', 'Kubernetes': 4.0, 'Linux systems': 3.5, 'Ansible': 4.0, 'Terraform': 4.0, 'Git': 4.0, 'Overall': 3.9, 'id': 1}, {'Name': 'Vijay Ram Katam', 'Kubernetes': 4.5, 'Linux systems': 4.0, 'Ansible': 0.0, 'Terraform': 4.0, 'Git': 4.0, 'Overall': 3.3, 'id': 2}, {'Name': 'Ankit Dagadu', 'Kubernetes': 4.0, 'Linux systems': 3.5, 'Ansible': 0.0, 'Terraform': 4.0, 'Git': 4.0, 'Overall': 3.1, 'id': 3}, {'Name': 'Beri Kiran Kumar', 'Kubernetes': 0.0, 'Linux systems': 4.0, 'Ansible': 3.5, 'Terraform': 4.0, 'Git': 3.5, 'Overall': 3.0, 'id': 4}]


In [49]:
path = '/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/jd_skills/Top_Skills_Of_JD_sre01.xlsx'
requisition_id = 'sre01'

In [123]:
candidates_df, jd_df = read_files(path)
###########################
new_candidates_df = candidates_df.copy()
new_candidates_df.Technology = new_candidates_df.Technology.apply(lambda x:x.lower())
emp_skill_matrix = new_candidates_df.Technology.unique().tolist()
###########################

experience = jd_df['Years_of_Experience_required'].iloc[0]

candidates_df.Technology = candidates_df.Technology.apply(lambda x:x.lower())
jd_df.Technology = jd_df.Technology.apply(lambda x:x.lower())

required_technology = jd_df.Technology.to_list()
candidates_df = candidates_df[candidates_df.Technology.str.contains('|'.join(required_technology))].reset_index(drop=True)

experience_filter_df = candidates_df[candidates_df['Total_Experience_in_years'] >= experience]

all_data = pd.DataFrame()
for group_name, group_data in experience_filter_df.groupby(['Name', 'Total_Experience_in_years']):
    op = group_data[['Technology','Rating']].T
    op.columns = op.iloc[0]
    op.drop('Technology',inplace=True)
    op['Name'] = group_name[0]
    all_data = pd.concat([all_data,op],ignore_index=True)
    
ddd = pd.DataFrame(columns=['Name'] + required_technology)
res = pd.concat([ddd,all_data]).fillna(0)[['Name'] + required_technology]
    
top_N_skills = 5
filtered_df = res.iloc[:, :top_N_skills+1]
###################################

skills_embeddings = model.encode(emp_skill_matrix)
skills_embeddings_np = np.array(skills_embeddings)

# Set up FAISS for nearest neighbor search
dimension = skills_embeddings_np.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(skills_embeddings_np)

ndf = filtered_df.copy()
cdf = new_candidates_df.copy()
final_final_df = pd.DataFrame()
for emp in ndf.Name.unique():
    print(emp)
    # find emp from final skill matrx 
    gau_df = ndf[ndf['Name']==emp]

    # find columns with 0
    mask = gau_df == 0
    columns_with_zero = gau_df.columns[mask.any()]

    # find emp skills from overall emp skill matrix
    emp_skills_df = cdf[cdf['Name']==emp]
    emp_skills_list = emp_skills_df.Technology.to_list()

    # iterate through employee skill which is 0 
    for i in columns_with_zero.to_list():
        # find the similar skill
        sim_skill = find_similar_skill_2(i,emp_skills_list,faiss_index,emp_skill_matrix)
        # if similar skill value is not 0 and is present in employee skills then go in if loop
        if sim_skill not in columns_with_zero and sim_skill in emp_skills_list:
            # replace with similar skill rating
            gau_df[i]=emp_skills_df[emp_skills_df.Technology==sim_skill]['Rating'].values[0]
    final_final_df = pd.concat([final_final_df,gau_df])

print(final_final_df)
numeric_columns = final_final_df.select_dtypes(include=['number']).columns
final_final_df[numeric_columns] = final_final_df[numeric_columns] / 2

#final_final_df['Overall'] = final_final_df.iloc[:,1:].mean(axis=1).round(2)/10*top_N_skills
final_final_df['Overall'] = final_final_df.iloc[:,1:].mean(axis=1).round(2)#/10*top_N_skills
final_final_df.sort_values('Overall',ascending=False,ignore_index=True,inplace=True)
filtered_df = final_final_df.copy()

###### code to add summary in top capdidates #######
top_candidates_list  =filtered_df.Name.to_list()[:3]
#top_candidates_list

top_candidates_df = candidates_df[candidates_df.Name.isin(top_candidates_list)][['Name','Resume_Title']].drop_duplicates().reset_index(drop=True)
#top_candidates_df

for i in range(len(top_candidates_df)):
    print(top_candidates_df.Name[i])

    resume_title = top_candidates_df.Resume_Title[i]
    print(resume_title)
    jd_title_name = 'SRE Cloud Native.pdf'



    top_resume_pdf_path = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/Data_Resumes_PDF/{resume_title}'
    jd_path_new = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/pdf/{jd_title_name}'

    top_cand_text = get_text(top_resume_pdf_path)
    jd_text = get_text(jd_path_new)
    summ_response = get_response(System_Prompt_summary,f'This is the Job Description Text:\n\n{jd_text}\nThis is the text of resume:{top_cand_text}')
    sum = json.loads(summ_response)
    cand_summary = sum['Summary']

    top_candidates_df.loc[top_candidates_df['Name'] == top_candidates_df.Name[i], 'Summary'] = str(cand_summary)

filtered_df = filtered_df.merge(top_candidates_df[['Name','Summary']],'left','Name')

###########################


# capitalize
filtered_df['Name'] = filtered_df['Name'].apply(lambda x: ' '.join(word.capitalize() for word in x.split()))
filtered_df.columns = [col.capitalize() for col in filtered_df.columns]

#print(filtered_df)
filtered_df.to_excel(f'./static/Skill_matrix_as_per_JD/Skill_matrix_as_per_JD_{requisition_id}.xlsx',index=False)
###################################
data_dict = []
for i in range(len(filtered_df)):
    one_dict = {}
    one_dict = filtered_df.iloc[i].to_dict()
    one_dict['id'] = i+1
    data_dict.append(one_dict)
print('data sent to ui')


ankit dagadu
skill not in emp sk list -> linux systems
linux 19.855038
skill not in emp sk list -> ansible
skill not in emp sk list -> git
kubernetes 78.78024
beri kiran kumar
skill not in emp sk list -> kubernetes
skill not in emp sk list -> linux systems
linux 19.855038
skill not in emp sk list -> git
github 36.383198
manoj kumar ch
skill not in emp sk list -> linux systems
shell scripting 70.963036
vijay ram katam
skill not in emp sk list -> linux systems
linux 19.855038
skill not in emp sk list -> ansible
skill not in emp sk list -> git
gitlab 30.977205
               Name  kubernetes  linux systems  ansible  terraform  git
0      ankit dagadu           8              7        0          8    8
1  beri kiran kumar           0              8        7          8    7
2    manoj kumar ch           8              7        8          8    8
3   vijay ram katam           9              8        0          8    8
manoj kumar ch
Manoj Kumar Ch (1).pdf
vijay ram katam
Katam Vijay Ram_Cloud 

In [122]:
best_candidates

Unnamed: 0,Name,kubernetes,linux systems,ansible,terraform,git,Overall,Summary
0,manoj kumar ch,4.0,3.5,4.0,4.0,4.0,3.9,['Manoj Kumar Ch has over 7 years of experienc...
1,vijay ram katam,4.5,4.0,0.0,4.0,4.0,3.3,['Vijay Ram Katam has over 5.8 years of experi...
2,ankit dagadu,4.0,3.5,0.0,4.0,4.0,3.1,['Ankit Dagadu has over 8 years of experience ...
3,beri kiran kumar,0.0,4.0,3.5,4.0,3.5,3.0,


In [124]:
filtered_df

Unnamed: 0,Name,Kubernetes,Linux systems,Ansible,Terraform,Git,Overall,Summary
0,Manoj Kumar Ch,4.0,3.5,4.0,4.0,4.0,3.9,['Manoj Kumar Ch has over 7 years of experienc...
1,Vijay Ram Katam,4.5,4.0,0.0,4.0,4.0,3.3,['Vijay Ram Katam has over 5.8 years of experi...
2,Ankit Dagadu,4.0,3.5,0.0,4.0,4.0,3.1,['Ankit Dagadu has over 8 years of experience ...
3,Beri Kiran Kumar,0.0,4.0,3.5,4.0,3.5,3.0,


In [52]:
candidates_df

Unnamed: 0,Technology,Rating,Name,Total_Experience_in_years,Job Role,Resume_Title
0,aws,9,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
1,kubernetes,8,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
2,jenkins,9,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
3,ansible,8,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
4,git,8,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
5,python,7,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
6,terraform,8,manoj kumar ch,7.0,DevOps Engineer,Manoj Kumar Ch (1).pdf
7,azure,9,vijay ram katam,5.8,SRE,Katam Vijay Ram_Cloud Native (1).pdf
8,kubernetes,9,vijay ram katam,5.8,SRE,Katam Vijay Ram_Cloud Native (1).pdf
9,terraform,8,vijay ram katam,5.8,SRE,Katam Vijay Ram_Cloud Native (1).pdf


In [56]:
final_final_df

Unnamed: 0,Name,kubernetes,linux systems,ansible,terraform,git,Overall
0,manoj kumar ch,4.0,3.5,4.0,4.0,4.0,3.9
1,vijay ram katam,4.5,4.0,0.0,4.0,4.0,3.3
2,ankit dagadu,4.0,3.5,0.0,4.0,4.0,3.1
3,beri kiran kumar,0.0,4.0,3.5,4.0,3.5,3.0


In [65]:
top_candidates_list  =final_final_df.Name.to_list()[:3]
#top_candidates_list

top_candidates_df = candidates_df[candidates_df.Name.isin(top_candidates_list)][['Name','Resume_Title']].drop_duplicates().reset_index(drop=True)
#top_candidates_df

for i in range(len(top_candidates_df)):
    print(top_candidates_df.Name[i])

    resume_title = top_candidates_df.Resume_Title[i]
    print(resume_title)
    jd_title_name = 'SRE Cloud Native.pdf'



    top_resume_pdf_path = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/Data_Resumes_PDF/{resume_title}'
    jd_path_new = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/pdf/{jd_title_name}'

    top_cand_text = get_text(top_resume_pdf_path)
    jd_text = get_text(jd_path_new)
    summ_response = get_response(System_Prompt_summary,f'This is the Job Description Text:\n\n{jd_text}\nThis is the text of resume:{top_cand_text}')
    sum = json.loads(summ_response)
    cand_summary = sum['Summary']

    top_candidates_df.loc[top_candidates_df['Name'] == top_candidates_df.Name[i], 'Summary'] = str(cand_summary)

final_final_df.merge(top_candidates_df[['Name','Summary']],'left','Name')


['manoj kumar ch', 'vijay ram katam', 'ankit dagadu']

In [132]:
top_candidates_list  =final_final_df.Name.to_list()[:3]
#top_candidates_list

top_candidates_df = candidates_df[candidates_df.Name.isin(top_candidates_list)][['Name','Resume_Title']].drop_duplicates().reset_index(drop=True)

In [133]:
top_candidates_df

Unnamed: 0,Name,Resume_Title
0,manoj kumar ch,Manoj Kumar Ch (1).pdf
1,vijay ram katam,Katam Vijay Ram_Cloud Native (1).pdf
2,ankit dagadu,NS_AnkitDagadu_SRE (1).pdf


In [134]:
import json

In [138]:
ourpath = '/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/jd_skills/Top_Skills_Of_JD_datascience.xlsx'

In [148]:
def generate_summary(top_candidates_df,i):
    print(top_candidates_df.Name[i])

    resume_title = top_candidates_df.Resume_Title[i]
    print(resume_title)
    jd_title_name = 'SRE Cloud Native.pdf'

    top_resume_pdf_path = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/Data_Resumes_PDF/{resume_title}'
    #jd_path_new = path
    #print(path)
    current_jd = pd.read_excel(ourpath)
    #print(current_jd)
    jd_title_name = current_jd['JD_PDF_Title'][0]
    jd_path_new = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/pdf/{jd_title_name}'

    top_cand_text = get_text(top_resume_pdf_path)
    jd_text = get_text(jd_path_new)
    summ_response = get_response(System_Prompt_summary,f'This is the Job Description Text:\n\n{jd_text}\nThis is the text of resume:{top_cand_text}')
    sum = json.loads(summ_response)
    cand_summary = sum['Summary']
    #print('\n'.join(cand_summary)) 
    #top_candidates_df.loc[top_candidates_df['Name'] == top_candidates_df.Name[i], 'Summary'] = '\n'.join(cand_summary)
    res_df = pd.DataFrame([top_candidates_df.Name[i],'\n'.join(cand_summary)]).T
    res_df.columns = ['Name','Comments']
    return res_df



In [154]:
from joblib import Parallel, delayed


In [155]:
delayed_funcs = [delayed(generate_summary)(top_candidates_df,i) for i in range(len(top_candidates_df))]
parallel_pool = Parallel(n_jobs=joblib.cpu_count())
output_response = parallel_pool(delayed_funcs)
final_df1 = pd.concat(output_response)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

ankit dagadu
manoj kumar chvijay ram katam

NS_AnkitDagadu_SRE (1).pdf
Manoj Kumar Ch (1).pdf
Katam Vijay Ram_Cloud Native (1).pdf


In [158]:
filtered_df

Unnamed: 0,Name,Kubernetes,Linux systems,Ansible,Terraform,Git,Overall,Summary
0,Manoj Kumar Ch,4.0,3.5,4.0,4.0,4.0,3.9,['Manoj Kumar Ch has over 7 years of experienc...
1,Vijay Ram Katam,4.5,4.0,0.0,4.0,4.0,3.3,['Vijay Ram Katam has over 5.8 years of experi...
2,Ankit Dagadu,4.0,3.5,0.0,4.0,4.0,3.1,['Ankit Dagadu has over 8 years of experience ...
3,Beri Kiran Kumar,0.0,4.0,3.5,4.0,3.5,3.0,


In [156]:
final_df1

Unnamed: 0,Name,Comments
0,manoj kumar ch,Manoj Kumar Ch has a strong programming abilit...
0,vijay ram katam,Vijay Ram Katam has over 5.8 years of experien...
0,ankit dagadu,ANKIT DAGADU has around 8 years of experience ...


In [159]:
filtered_df.merge(final_df1,'left','Name')

Unnamed: 0,Name,Kubernetes,Linux systems,Ansible,Terraform,Git,Overall,Summary,Comments
0,Manoj Kumar Ch,4.0,3.5,4.0,4.0,4.0,3.9,['Manoj Kumar Ch has over 7 years of experienc...,
1,Vijay Ram Katam,4.5,4.0,0.0,4.0,4.0,3.3,['Vijay Ram Katam has over 5.8 years of experi...,
2,Ankit Dagadu,4.0,3.5,0.0,4.0,4.0,3.1,['Ankit Dagadu has over 8 years of experience ...,
3,Beri Kiran Kumar,0.0,4.0,3.5,4.0,3.5,3.0,,


In [160]:
filtered_df

Unnamed: 0,Name,Kubernetes,Linux systems,Ansible,Terraform,Git,Overall,Summary
0,Manoj Kumar Ch,4.0,3.5,4.0,4.0,4.0,3.9,['Manoj Kumar Ch has over 7 years of experienc...
1,Vijay Ram Katam,4.5,4.0,0.0,4.0,4.0,3.3,['Vijay Ram Katam has over 5.8 years of experi...
2,Ankit Dagadu,4.0,3.5,0.0,4.0,4.0,3.1,['Ankit Dagadu has over 8 years of experience ...
3,Beri Kiran Kumar,0.0,4.0,3.5,4.0,3.5,3.0,


In [149]:
zz = generate_summary(top_candidates_df,0)

manoj kumar ch
Manoj Kumar Ch (1).pdf


In [151]:
zz

Unnamed: 0,Name,Comments
0,manoj kumar ch,Manoj Kumar Ch has a strong programming abilit...


In [152]:
zz = generate_summary(top_candidates_df,1)
zz

vijay ram katam
Katam Vijay Ram_Cloud Native (1).pdf


Unnamed: 0,Name,Comments
0,vijay ram katam,Vijay Ram Katam has over 5.8 years of experien...


In [153]:
zz = generate_summary(top_candidates_df,2)
zz

ankit dagadu
NS_AnkitDagadu_SRE (1).pdf


Unnamed: 0,Name,Comments
0,ankit dagadu,ANKIT DAGADU has around 8 years of experience ...


In [93]:


pdf_path = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/Data_Resumes_PDF/{na}'
jd_path = f'/Users/omkarg/Downloads/flask_ui_working/Flask_App2/static/pdf/{jd}'

top_cand_text = get_text(pdf_path)
jd_text = get_text(jd_path)
response = get_response(System_Prompt_summary,f'This is the Job Description Text:\n\n{jd_text}\nThis is the text of resume:{top_cand_text}')
sum = json.loads(response)
cand_summary = sum['Summary']

top_candidates_df.loc[top_candidates_df['Name'] == 'manoj kumar ch', 'Summary'] = str(cand_summary)


In [129]:
top_candidates_df.loc[top_candidates_df['Name'] == 'manoj kumar ch', 'Summary'] = cand_summary


ValueError: Must have equal len keys and value when setting with an iterable

In [128]:
cand_summary

['Ankit Dagadu has over 8 years of experience in IT, including hands-on experience as an SRE with a focus on cloud native technologies such as Azure, Kubernetes, and Terraform, which aligns with the job requirement.',
 'He has a strong understanding of networking concepts and experience in configuring and administering Linux systems in cloud/SaaS production environments, which is a mandatory qualification for the job role.',
 'Ankit has experience in deploying, managing, and troubleshooting Kubernetes clusters, which is a key requirement for the job role.',
 'He has demonstrated experience in delivering infrastructure as code using Terraform, Git, and Azure DevOps, which are preferred qualifications for the job role.',
 'Ankit has experience with monitoring and logging systems such as Prometheus and Grafana, and he has knowledge of cloud computing technologies like Azure, which are preferred qualifications for the job role.']

In [None]:

top_candidates_df

Unnamed: 0,Name,Resume_Title,Summary
0,manoj kumar ch,Manoj Kumar Ch (1).pdf,Vijay Ram Katam has over 5.8 years of experien...
1,vijay ram katam,Katam Vijay Ram_Cloud Native (1).pdf,
2,ankit dagadu,NS_AnkitDagadu_SRE (1).pdf,


In [102]:
top_candidates_df

Unnamed: 0,Name,Resume_Title,Summary
0,manoj kumar ch,Manoj Kumar Ch (1).pdf,['Manoj Kumar Ch has over 7 years of experienc...
1,vijay ram katam,Katam Vijay Ram_Cloud Native (1).pdf,['Vijay Ram Katam has over 5.8 years of experi...
2,ankit dagadu,NS_AnkitDagadu_SRE (1).pdf,['Ankit Dagadu has over 8 years of experience ...


In [103]:
final_final_df

Unnamed: 0,Name,kubernetes,linux systems,ansible,terraform,git,Overall
0,manoj kumar ch,4.0,3.5,4.0,4.0,4.0,3.9
1,vijay ram katam,4.5,4.0,0.0,4.0,4.0,3.3
2,ankit dagadu,4.0,3.5,0.0,4.0,4.0,3.1
3,beri kiran kumar,0.0,4.0,3.5,4.0,3.5,3.0


In [106]:
final_final_df.merge(top_candidates_df[['Name','Summary']],'left','Name')

Unnamed: 0,Name,kubernetes,linux systems,ansible,terraform,git,Overall,Summary
0,manoj kumar ch,4.0,3.5,4.0,4.0,4.0,3.9,['Manoj Kumar Ch has over 7 years of experienc...
1,vijay ram katam,4.5,4.0,0.0,4.0,4.0,3.3,['Vijay Ram Katam has over 5.8 years of experi...
2,ankit dagadu,4.0,3.5,0.0,4.0,4.0,3.1,['Ankit Dagadu has over 8 years of experience ...
3,beri kiran kumar,0.0,4.0,3.5,4.0,3.5,3.0,


Unnamed: 0,Name,Kubernetes,Linux systems,Ansible,Terraform,Git,Overall,Summary
0,Manoj Kumar Ch,4.0,3.5,4.0,4.0,4.0,3.9,
1,Vijay Ram Katam,4.5,4.0,0.0,4.0,4.0,3.3,
2,Ankit Dagadu,4.0,3.5,0.0,4.0,4.0,3.1,
3,Beri Kiran Kumar,0.0,4.0,3.5,4.0,3.5,3.0,


In [125]:
ls = ['Mohsin has a strong background in machine learning and text analytics/NLP, as evidenced by his work on AI/ML frameworks for Fintech and HR Tech domains, and his experience with NLP models like GPT3 and RoBERTA.', "He has a strong programming ability in Python and has used Python's data science ecosystem extensively in his work, including libraries like pandas, numpy, and nltk.", 'Mohsin has good knowledge of database query languages like SQL and has experience processing large-scale data using Python and other database management tools.', 'He has excellent analytical and problem-solving skills, demonstrated by his work on complex business problems, such as improving existing NLP models, developing statistical models, and extracting important information from unstructured data.', 'Mohsin has a degree in Engineering and a Post Graduation Diploma in Machine Learning, and he has proven experience as a Data Scientist, having worked as a Consultant-Data Science Tech Specialist in AI/ML.']

In [130]:
print('\n'.join(cand_summary))

Ankit Dagadu has over 8 years of experience in IT, including hands-on experience as an SRE with a focus on cloud native technologies such as Azure, Kubernetes, and Terraform, which aligns with the job requirement.
He has a strong understanding of networking concepts and experience in configuring and administering Linux systems in cloud/SaaS production environments, which is a mandatory qualification for the job role.
Ankit has experience in deploying, managing, and troubleshooting Kubernetes clusters, which is a key requirement for the job role.
He has demonstrated experience in delivering infrastructure as code using Terraform, Git, and Azure DevOps, which are preferred qualifications for the job role.
Ankit has experience with monitoring and logging systems such as Prometheus and Grafana, and he has knowledge of cloud computing technologies like Azure, which are preferred qualifications for the job role.


In [161]:
import pdfplumber

In [184]:
!pip install textract

Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
Collecting argcomplete~=1.10.0 (from textract)
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl.metadata (16 kB)
Collecting beautifulsoup4~=4.8.0 (from textract)
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl.metadata (4.1 kB)
Collecting chardet==3.* (from textract)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting docx2txt~=0.8 (from textract)
  Using cached docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting extract-msg<=0.29.* (from textract)
  Downloading extract_msg-0.28.7-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting pdfminer.six==20191110 (from textract)
  Downloading pdfminer.six-20191110-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting python-pptx~=0.6.18 (from textract)
  Using cached python_pptx-0.6.23-py3-none-any.whl.metadata (18 kB)
Collecting six~=1.12.0 (from textract)
  Downloading six-1.12.0-py2.py3-non

In [195]:
import textract
text1 = textract.process("NS_Prachi-Bhoj.docx")

In [200]:
text1

b'Prachi Bhoj\n\n\xe2\x80\xa2 Country: India \xe2\x80\xa2 Tel: +918983188895 \xe2\x80\xa2 Email: prachideore19@gmail.com\n\n \xe2\x80\xa2 LinkedIn: https://www.linkedin.com/in/prachi-bhoj/\n\n\n\n Summary:\n\n_______________________________________________________________________________________________\n\n\n\nResults-driven and highly skilled DevOps plus Certified Cloud Infrastructure engineer with 4 + years of experience in designing, implementing, and managing robust continuous integration, delivery, and deployment pipelines. \n\nProven expertise in automating, optimizing, and streamlining complex workflows, resulting in increased operational efficiency and reduced time-to-market. \n\nProficient in leveraging a wide range of DevOps tools and technologies, including but not limited to Jenkins, Docker, Kubernetes, Ansible, and Git. \n\nAdept at collaborating with cross-functional teams to bridge the gap between development and operations, fostering a culture of continuous improvement,

In [203]:
dir_path

NameError: name 'dir_path' is not defined

In [210]:
[f for f in os.listdir() if f.endswith('.pdf') or f.endswith('.docx')]

['NS_Prachi-Bhoj.docx', 'Milind Gharat_SRE- AI Infrastructure _Ellicium.pdf']

In [206]:
for f in os.listdir():
    if f.endswith('.pdf') or f.endswith('.docx'):
        print(f)

NS_Prachi-Bhoj.docx
Milind Gharat_SRE- AI Infrastructure _Ellicium.pdf


In [193]:
type(text)

bytes

In [202]:
!pip install docx2txt

[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [217]:
z.d

<function docx2txt.docx2txt.process(docx, img_dir=None)>

In [199]:
import docx2txt
my_text = docx2txt.process("NS_Prachi-Bhoj.docx")
print(my_text)

Prachi Bhoj

• Country: India • Tel: +918983188895 • Email: prachideore19@gmail.com

 • LinkedIn: https://www.linkedin.com/in/prachi-bhoj/



 Summary:

_______________________________________________________________________________________________



Results-driven and highly skilled DevOps plus Certified Cloud Infrastructure engineer with 4 + years of experience in designing, implementing, and managing robust continuous integration, delivery, and deployment pipelines. 

Proven expertise in automating, optimizing, and streamlining complex workflows, resulting in increased operational efficiency and reduced time-to-market. 

Proficient in leveraging a wide range of DevOps tools and technologies, including but not limited to Jenkins, Docker, Kubernetes, Ansible, and Git. 

Adept at collaborating with cross-functional teams to bridge the gap between development and operations, fostering a culture of continuous improvement, Commitment to staying current with industry trends and best pract

In [191]:
f = open('NS_Prachi-Bhoj.docx',encoding='utf-8')
f.read()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x90 in position 14: invalid start byte

In [175]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [178]:
!pip install exceptions

[31mERROR: Could not find a version that satisfies the requirement exceptions (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for exceptions[0m[31m
[0m

In [174]:
import docx2text

ModuleNotFoundError: No module named 'docx2text'

In [177]:
import docx

def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

ModuleNotFoundError: No module named 'exceptions'

In [164]:
with pdfplumber.open('static/Data_Resumes_PDF/SHIVANI_ MOZE_RESUME (3) (1).pdf') as pdf:
    print(pdf)
    for page in pdf.pages:
        extracted_text = page.extract_text()
        print(extracted_text)

<pdfplumber.pdf.PDF object at 0x301fde150>
4/83 Ganesh Nagar Navi Khadki
SHIVANI Yerwada Pune:-411006
9075898407
MOZE Shivani27moze@gmail.com
https:/www.linkedin.com/in/shivani-
moze-586240223
OBJECTIVE
To work in environment where I can apply my knowledge to improve my practical
approach and put my efforts for the prosperity of the industry.
EDUCATION
Qualification University College Pass Percentage/
/Board Name/Branch Year CGPA
Name
B.E AISSMS IOIT, 2023 9.32
T.E SPPU Pune-01 2022 9.55
S.E ,Pune Instrumentation 2021 9.30
Engineering
F.E 2020 7.68
HSC SS PM Day School 2019 61.54%
SSC State Board &Jr.College 2017 77%
Pune:-01
PROJECT
• DATA ANALYSIS USING HOSPITAL DATASET
Conducted exploratory analysis on a hospital dataset sourced from Kaggle using
Python, emphasizing data visualization with Matplotlib.
Skills Utilized: Python, Data Visualization (Matplotlib).
• T.E MINI PROJECT:-SATELLITE IMAGERY CLASSIFICATION USING
PYTHON AND QGIS SOFTWARE
Classified satellite images obtained from 

In [18]:
pdf_p = 'static/jd_skills'
[f.split('.')[0].split('_')[-1] for f in os.listdir(pdf_p) if f.startswith('Top_Skills_Of_JD')]

['ds01', 'sre01']

In [15]:
selected_model = 'gpt-4'

In [16]:
#def get_response(System_Prompt: str, final_resume_text: str, selected_model="gpt-4"):
"""
Function used for generating response form OpenAI model
Here we are Passing the System Prompt and Extracted text from resume.
"""

client = OpenAI(api_key="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")
#time.sleep(1)

if selected_model in ['gpt-4-turbo-preview',
                        'gpt-3.5-turbo',
                        'gpt-4-0125-preview',
                        'gpt-4-1106-preview',
                        'gpt-3.5-turbo-0125',
                        'gpt-3.5-turbo-1106']:
    response_format = {"type": "json_object"}
else:
    response_format = None

try:
    response = client.chat.completions.create(
        model=selected_model,
        messages=[
            {"role": "system", "content": 'you are a question answer bot'},
            {"role": "user", "content": 'what is pythyon'}],
        response_format=response_format,
        temperature=0
        )
except Exception as e:
    print(f"Error creating completion request for model '{selected_model}'")
    raise e

#return response.choices[0].message.content

In [17]:
response.choices[0].message.content

"Python is a high-level, interpreted programming language created by Guido van Rossum and first released in 1991. It's known for its clear syntax and readability, which reduces the cost of program maintenance. Python supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It is widely used for web development, data analysis, artificial intelligence, scientific computing, and more."