In [2]:
# Install all the required libraries
!pip install -U -q pdfplumber tiktoken openai chromadb sentence-transformers

In [3]:
# Import all the required libraries

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

In [4]:
# Define PDF path
pdf_path = "99425-Sravanthi_Java-Lead_13-yrs_.pdf"

In [5]:
# Open the PDF file and print the text
with pdfplumber.open(pdf_path) as pdf:

    # Get one of the pages from the PDF and examine it
    single_page = pdf.pages[5]

    # Extract text from the first page
    text = single_page.extract_text()

    # Extract tables from the first page
    tables = single_page.extract_tables()

    # Print the extracted text
    print(text)

Client Standard Chartered Bank
Project Title CEMS – ICM
Period APR 2014 to APR 2016
Position Associate-Projects
Description
CEMS - ICM project is being developed for the standard chartered bank’s
branches and customer service teams in countries like Singapore and Malaysia
to address the request of the customers related to banking. This application is
developed for the new approach, which the standard chartered is adapting to
increase the customer experience with bank by giving customer a facility to get
all the work done with one request number instead of making multiple request
and multiple request numbers.
Technology Java, J2ee, Spring MVC, Hibernate, JSP
Responsibilities
• Played as an Associate Projects and involved in development.
• Handled requirement gathering by coordinating with the team
lead and confirmed the architecture structure.
• Involved in the design, coding, unit testing and system testing
of the modules, preparation of test cases, peer to peer code
review.
• Develope

In [6]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [7]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [8]:

# Initialize an empty list to store the extracted texts and document names
data = []

# Process the PDF file
print(f"...Processing {pdf_path}")

# Call the function to extract the text from the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Convert the extracted list to a PDF, and add a column to store document names
extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])

# Append the extracted text and document name to the list
data.append(extracted_text_df)

# Print a message to indicate progress
print(f"Finished processing {pdf_path}")

# Print a message to indicate all PDFs have been processed
print("PDF have been processed.")

...Processing 99425-Sravanthi_Java-Lead_13-yrs_.pdf
Finished processing 99425-Sravanthi_Java-Lead_13-yrs_.pdf
PDF have been processed.


In [9]:
# print the text
data

[  Page No.                                          Page_Text
 0   Page 1  Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...
 1   Page 2  CAREER PROFILE [["", "Dates", "", "", "Organiz...
 2   Page 3  [["", "\u2022 Developed the API using Restful ...
 3   Page 4  [["", "\u2022 Developed the API using Restful ...
 4   Page 5  [["", "\u2022 Collaborate with cross-functiona...
 5   Page 6  [["Client", "Standard Chartered Bank"], ["Proj...]

In [10]:
resume_data = pd.concat(data, ignore_index=True)

In [11]:
resume_data

Unnamed: 0,Page No.,Page_Text
0,Page 1,Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...
1,Page 2,"CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz..."
2,Page 3,"[["""", ""\u2022 Developed the API using Restful ..."
3,Page 4,"[["""", ""\u2022 Developed the API using Restful ..."
4,Page 5,"[["""", ""\u2022 Collaborate with cross-functiona..."
5,Page 6,"[[""Client"", ""Standard Chartered Bank""], [""Proj..."


In [12]:
resume_data['Metadata'] = resume_data.apply(lambda x: {'Page_No.': x['Page No.']}, axis=1)
resume_data

Unnamed: 0,Page No.,Page_Text,Metadata
0,Page 1,Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,{'Page_No.': 'Page 1'}
1,Page 2,"CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...",{'Page_No.': 'Page 2'}
2,Page 3,"[["""", ""\u2022 Developed the API using Restful ...",{'Page_No.': 'Page 3'}
3,Page 4,"[["""", ""\u2022 Developed the API using Restful ...",{'Page_No.': 'Page 4'}
4,Page 5,"[["""", ""\u2022 Collaborate with cross-functiona...",{'Page_No.': 'Page 5'}
5,Page 6,"[[""Client"", ""Standard Chartered Bank""], [""Proj...",{'Page_No.': 'Page 6'}


In [13]:
resume_data.Page_Text[2]

'[["", "\\u2022 Developed the API using Restful Web Services.\\n\\u2022 Responsibilities resource allocation, requirements\\ngathering and definition, high and low- level design and\\nsoftware architecture, development, implementation,\\ntesting and support.\\n\\u2022 Collaborate with cross-functional team leads, business\\nanalysts, Customers and other key stakeholders throughout\\nall phases of the projects.\\n\\u2022 Created high and low-level system design/impact\\ndocuments using MS Word\\n\\u2022 Involved peer to peer code review and check the coding\\nstandards.\\n\\u2022 Deployment of Web application on test and production\\nenvironment.\\n\\u2022 Organizing and motivating a project team\\n\\u2022 Controlling time management and Ensuring customer\\nsatisfaction\\n\\u2022 Working experience on Agile-Scrum development\\nMethodology."]] Project 2 – PRD [["Client", "FedEx"], ["Project Title", "Pilot Record Data"], ["Period", "MAR 2022 to FEB 2023"], ["Position", "Technical Lead"], 

In [14]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop
resume_data['Text_Length'] = resume_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [15]:
resume_data['Text_Length']

0    172
1    263
2    224
3    224
4    206
5    257
Name: Text_Length, dtype: int64

In [16]:
resume_data

Unnamed: 0,Page No.,Page_Text,Metadata,Text_Length
0,Page 1,Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,{'Page_No.': 'Page 1'},172
1,Page 2,"CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...",{'Page_No.': 'Page 2'},263
2,Page 3,"[["""", ""\u2022 Developed the API using Restful ...",{'Page_No.': 'Page 3'},224
3,Page 4,"[["""", ""\u2022 Developed the API using Restful ...",{'Page_No.': 'Page 4'},224
4,Page 5,"[["""", ""\u2022 Collaborate with cross-functiona...",{'Page_No.': 'Page 5'},206
5,Page 6,"[[""Client"", ""Standard Chartered Bank""], [""Proj...",{'Page_No.': 'Page 6'},257


In [17]:
resume_data['Page_Text']

0    Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...
1    CAREER PROFILE [["", "Dates", "", "", "Organiz...
2    [["", "\u2022 Developed the API using Restful ...
3    [["", "\u2022 Developed the API using Restful ...
4    [["", "\u2022 Collaborate with cross-functiona...
5    [["Client", "Standard Chartered Bank"], ["Proj...
Name: Page_Text, dtype: object

In [18]:
page_nos = resume_data["Page No."]
page_nos

0    Page 1
1    Page 2
2    Page 3
3    Page 4
4    Page 5
5    Page 6
Name: Page No., dtype: object

In [19]:
# Function to split text into fixed-size chunks
def split_text_into_chunks(text, chunk_size):
    chunks = []
    words = text.split()  # Split the text into words

    current_chunk = []  # Store words for the current chunk
    current_chunk_word_count = 0  # Count of words in the current chunk

    for word in words:
        if current_chunk_word_count + len(word) + 1 <= chunk_size:
            current_chunk.append(word)
            current_chunk_word_count += len(word) + 1
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_chunk_word_count = len(word)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks## 3. <font color = 'red'> Generating Embeddings </font>

#### Encoding Pipeline

In [20]:
def process_page(page_no):
    page = resume_data[resume_data['Page No.'] == page_no].Page_Text.values[0]
    metadata = resume_data[resume_data['Page No.'] == page_no].Metadata.values[0]

    if page is not None:
        # setting chunk size as 500
        chunk_size = 500
        text_chunks = split_text_into_chunks(page, chunk_size)

        # Creating a DataFrame to store the chunks, page title and page metadata
        data = {'Title': [], 'Chunk Text': [], 'Metadata': []}

        for index, chunk in enumerate(text_chunks):
            data['Title'].append(page_no)
            data['Chunk Text'].append(chunk)
            # adding chunk no as part of metadata
            metadata['Chunk_No.'] = index
            data['Metadata'].append(metadata)

        return pd.DataFrame(data)

In [21]:
# creating a dataframe after calling process
all_dfs = []
for page_no in page_nos:
    df = process_page(page_no)
    if df is not None:
        all_dfs.append(df)

fixed_chunk_df = pd.concat(all_dfs, ignore_index=True)
fixed_chunk_df

Unnamed: 0,Title,Chunk Text,Metadata
0,Page 1,Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,"{'Page_No.': 'Page 1', 'Chunk_No.': 2}"
1,Page 1,skills and Inter-personal abilities. EDUCATION...,"{'Page_No.': 'Page 1', 'Chunk_No.': 2}"
2,Page 1,"Angular10, NodeJS and TypeScript.""], [""Web Ser...","{'Page_No.': 'Page 1', 'Chunk_No.': 2}"
3,Page 2,"CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...","{'Page_No.': 'Page 2', 'Chunk_No.': 3}"
4,Page 2,"""Senior Consultant"", null, null], [""APR 2014 t...","{'Page_No.': 'Page 2', 'Chunk_No.': 3}"
5,Page 2,developed for the standard chartered bank\u201...,"{'Page_No.': 'Page 2', 'Chunk_No.': 3}"
6,Page 2,"Spring Boot, Rest API, Spring data JPA, Angula...","{'Page_No.': 'Page 2', 'Chunk_No.': 3}"
7,Page 3,"[["""", ""\u2022 Developed the API using Restful ...","{'Page_No.': 'Page 3', 'Chunk_No.': 3}"
8,Page 3,Involved peer to peer code review and check th...,"{'Page_No.': 'Page 3', 'Chunk_No.': 3}"
9,Page 3,"""Pilot Record Database is a product. It suppor...","{'Page_No.': 'Page 3', 'Chunk_No.': 3}"


In [22]:
# Install the sentence transformers library
!pip install -q -u sentence-transformers


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -u


In [23]:
from sentence_transformers import SentenceTransformer, util

In [24]:
# Import the SentenceTransformer library
from sentence_transformers import SentenceTransformer

In [25]:
# Load the embedding model
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

In [26]:
# Function to generate embeddings for text
def generate_embeddings(texts):
    embeddings = embedder.encode(texts, convert_to_tensor=False)
    return embeddings

In [27]:
# function to generate embedding on dataframe
def generate_embeddings_on_df(df):
  df['Embeddings'] = df['Chunk Text'].apply(lambda x: generate_embeddings([x])[0])

In [28]:
generate_embeddings_on_df(fixed_chunk_df)

In [29]:
fixed_chunk_df

Unnamed: 0,Title,Chunk Text,Metadata,Embeddings
0,Page 1,Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,"{'Page_No.': 'Page 1', 'Chunk_No.': 2}","[-0.078678355, 0.028704114, -0.04743235, -0.04..."
1,Page 1,skills and Inter-personal abilities. EDUCATION...,"{'Page_No.': 'Page 1', 'Chunk_No.': 2}","[-0.097121954, -0.029366612, 0.007973225, -0.0..."
2,Page 1,"Angular10, NodeJS and TypeScript.""], [""Web Ser...","{'Page_No.': 'Page 1', 'Chunk_No.': 2}","[-0.060519606, -0.0723971, -0.0044179205, -0.0..."
3,Page 2,"CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...","{'Page_No.': 'Page 2', 'Chunk_No.': 3}","[-0.03987785, -0.010995714, -0.054029126, 0.04..."
4,Page 2,"""Senior Consultant"", null, null], [""APR 2014 t...","{'Page_No.': 'Page 2', 'Chunk_No.': 3}","[-0.002874897, 0.0085472725, -0.036403563, 0.0..."
5,Page 2,developed for the standard chartered bank\u201...,"{'Page_No.': 'Page 2', 'Chunk_No.': 3}","[-0.054679748, 0.02421834, -0.075535975, -0.08..."
6,Page 2,"Spring Boot, Rest API, Spring data JPA, Angula...","{'Page_No.': 'Page 2', 'Chunk_No.': 3}","[-0.07667003, 0.00047007622, -0.04085703, -0.0..."
7,Page 3,"[["""", ""\u2022 Developed the API using Restful ...","{'Page_No.': 'Page 3', 'Chunk_No.': 3}","[-0.057539728, 0.068540014, -0.03772124, -0.04..."
8,Page 3,Involved peer to peer code review and check th...,"{'Page_No.': 'Page 3', 'Chunk_No.': 3}","[-0.0042178044, 0.03810776, -0.083652265, -0.0..."
9,Page 3,"""Pilot Record Database is a product. It suppor...","{'Page_No.': 'Page 3', 'Chunk_No.': 3}","[-0.027975693, 0.022102037, -0.13101754, -0.06..."


In [30]:
!pip install chromadb



In [31]:
# Define the path where chroma collections will be stored
chroma_data_path = 'b./ChromaDB_Data/'

In [32]:
import chromadb

# Call PersistentClient()
client = chromadb.PersistentClient(path=chroma_data_path)

In [33]:
# Create a collection to store the embeddings. Collections in Chroma are where you can store your embeddings, documents, and any additional metadata.
collection = client.get_or_create_collection(name="insurance-collection")

In [34]:
collection.add(
    embeddings = fixed_chunk_df['Embeddings'].to_list(),
    documents = fixed_chunk_df['Chunk Text'].to_list(),
    metadatas = fixed_chunk_df['Metadata'].to_list(),
    ids = [str(i) for i in range(0, len(fixed_chunk_df['Embeddings']))]
)

In [35]:
# get few of data by ids from collection
collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-0.07867835,  0.02870411, -0.04743235, ..., -0.07027438,
         -0.06663264,  0.04997705],
        [-0.09712195, -0.02936661,  0.00797322, ...,  0.00042017,
         -0.06090006,  0.00988888],
        [-0.06051961, -0.0723971 , -0.00441792, ..., -0.02867442,
          0.02420143,  0.00107546]]),
 'documents': ['Resume M.SRAVANTHI Phone: +91- 9360487656 Mail Id: sravanthim1311@gmail.com ________________________________________________________________________________ PROFESSIONAL SUMMARY • Overall 13+ years of IT experience in software development in creating solutions for Web based applications using MVC architecture for catering to industry standard methodologies and techniques. • Extensive knowledge in applying different technologies and Frameworks. • Good knowledge on AWS. • Good Communication',
  'skills and Inter-personal abilities. EDUCATIONAL QUALIFICATION • Completed MCA. At 2009 in Sengunthar Engineering College from Anna Univer

In [36]:
# create a cache collection
cache_collection = client.get_or_create_collection(name='insurance-collection-cache')

In [37]:
# peek few of elements from cache collection
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [38]:
# Read the user query
query = input()

 total years of experience


In [40]:
# Searh the Cache collection first
# Query the collection against the user query and return the top 20 results
cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [41]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [42]:
# get result from main collection
results = collection.query(
query_texts=query,
n_results=10
)
print("Result size is : " + str(len(results.items())))
results.items()

Result size is : 8


dict_items([('ids', [['1', '3', '4', '14', '17', '12', '6', '10', '0', '8']]), ('embeddings', None), ('documents', [['skills and Inter-personal abilities. EDUCATIONAL QUALIFICATION • Completed MCA. At 2009 in Sengunthar Engineering College from Anna University, Tamil Nadu. • Completed Java course from AUG 2009 to FEB 2011. SKILLS PROFILE [["Languages used", "Java, J2EE"], ["Operating Systems", "Windows7, Windows10"], ["J2EE Technologies", "Servlets and Java Server Page, DevOps, Safe Agile"], ["Web Technologies &\\nFrameworks", "Spring MVC, Spring Boot, Micro Services, Struts, JSP, HTML,CSS,\\nJava Script,', 'CAREER PROFILE [["", "Dates", "", "", "Organization", "", "", "Role", ""], ["MAY 2023 to OCT 2024", null, null, "Luxoft India Pvt (Ltd)", null, null, "Lead Engineer", null, null], ["FEB 2022 to MAR 2023", null, null, "Wipro Technologies", null, null, "Technical Lead", null, null], ["AUG 2019 to FEB 2022", null, null, "TATA Consultancy Services\\nLimited", null, null, "Assistant Con

In [43]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()


# If the distance is greater than the threshold, then return the results from the main collection.

if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      size = len(results.items())

      for key, val in results.items():
        if val is None:
          continue
        for i in range(size):
          Keys.append(str(key)+str(i))
          Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })


Not found in cache. Found in main collection.


In [44]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",skills and Inter-personal abilities. EDUCATION...,1.291612,1
1,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...",1.404284,3
2,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","""Senior Consultant"", null, null], [""APR 2014 t...",1.415564,4
3,"{'Chunk_No.': 3, 'Page_No.': 'Page 4'}",development activities.\n\u2022 Lead the devel...,1.434458,14
4,"{'Chunk_No.': 3, 'Page_No.': 'Page 5'}","Consultant"", """"], [""Technology"", ""Java, J2ee, ...",1.44708,17
5,"{'Chunk_No.': 3, 'Page_No.': 'Page 4'}",customer\nsatisfaction\n\u2022 Working experie...,1.448344,12
6,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","Spring Boot, Rest API, Spring data JPA, Angula...",1.463269,6
7,"{'Chunk_No.': 3, 'Page_No.': 'Page 3'}","Angular10,\nPostgreeSQL.""], [""Responsibilities...",1.478196,10
8,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,1.541003,0
9,"{'Chunk_No.': 3, 'Page_No.': 'Page 3'}",Involved peer to peer code review and check th...,1.577903,8


In [45]:
from sentence_transformers import CrossEncoder, util

In [46]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [47]:
cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [48]:
cross_rerank_scores

array([-10.8323345, -11.204613 , -11.351702 , -11.066553 , -11.34433  ,
       -10.669049 , -11.098743 , -11.017324 ,  -5.2462463, -11.169449 ],
      dtype=float32)

In [49]:
results_df['Reranked_scores'] = cross_rerank_scores

In [50]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",skills and Inter-personal abilities. EDUCATION...,1.291612,1,-10.832335
1,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...",1.404284,3,-11.204613
2,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","""Senior Consultant"", null, null], [""APR 2014 t...",1.415564,4,-11.351702
3,"{'Chunk_No.': 3, 'Page_No.': 'Page 4'}",development activities.\n\u2022 Lead the devel...,1.434458,14,-11.066553
4,"{'Chunk_No.': 3, 'Page_No.': 'Page 5'}","Consultant"", """"], [""Technology"", ""Java, J2ee, ...",1.44708,17,-11.34433
5,"{'Chunk_No.': 3, 'Page_No.': 'Page 4'}",customer\nsatisfaction\n\u2022 Working experie...,1.448344,12,-10.669049
6,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","Spring Boot, Rest API, Spring data JPA, Angula...",1.463269,6,-11.098743
7,"{'Chunk_No.': 3, 'Page_No.': 'Page 3'}","Angular10,\nPostgreeSQL.""], [""Responsibilities...",1.478196,10,-11.017324
8,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,1.541003,0,-5.246246
9,"{'Chunk_No.': 3, 'Page_No.': 'Page 3'}",Involved peer to peer code review and check th...,1.577903,8,-11.169449


In [51]:
top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",skills and Inter-personal abilities. EDUCATION...,1.291612,1,-10.832335
1,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","CAREER PROFILE [["""", ""Dates"", """", """", ""Organiz...",1.404284,3,-11.204613
2,"{'Chunk_No.': 3, 'Page_No.': 'Page 2'}","""Senior Consultant"", null, null], [""APR 2014 t...",1.415564,4,-11.351702


In [52]:
top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
8,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,1.541003,0,-5.246246
5,"{'Chunk_No.': 3, 'Page_No.': 'Page 4'}",customer\nsatisfaction\n\u2022 Working experie...,1.448344,12,-10.669049
0,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}",skills and Inter-personal abilities. EDUCATION...,1.291612,1,-10.832335


In [53]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]
top_3_RAG

Unnamed: 0,Documents,Metadatas
8,Resume M.SRAVANTHI Phone: +91- 9360487656 Mail...,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}"
5,customer\nsatisfaction\n\u2022 Working experie...,"{'Chunk_No.': 3, 'Page_No.': 'Page 4'}"
0,skills and Inter-personal abilities. EDUCATION...,"{'Chunk_No.': 2, 'Page_No.': 'Page 1'}"


In [54]:
!ollama pull llama3.2


[?25lpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¸ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ´ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest â § [?25h[?25l[2K[1Gpulling manifest â ‡ [?25h[?25l[2K[1Gpulling manifest â � [?25h[?25l[2K[1Gpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¸ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ´ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest 
pulling dde5aa3fc5ff... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 2.0 GB                         
pulling 966de95ca8a6... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 1.4 KB                         
pulling fcc5a6bec9da... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ

In [55]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [56]:
import requests
import json
import gradio as gr

OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}
MODEL = "llama3.2"

# Initialize the OpenAI client for Ollama integration
ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

In [71]:
def generate_response(query, resume_df):
    """
    Generate a response based on the user's query and information from the uploaded resume.
    
    Args:
    query (str): The user's question regarding the resume.
    resume_df (pd.DataFrame): A dataframe containing the resume's content. The dataframe has columns 'sections' and 'content'.
                               'sections' indicates the section of the resume (e.g., Education, Work Experience, Skills),
                               and 'content' contains the actual text from those sections.
    
    Returns:
    str: A detailed response answering the user's query based on the resume.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant who can effectively answer user queries about resumes. Your job is to provide insights from the uploaded resume based on the questions asked."},
        {"role": "user", "content": f"""You are a helpful assistant who can effectively answer user queries about resumes. You have a question asked by the user in '{query}' and you have some information extracted from the resume in the dataframe '{resume_df}'.

        The dataframe contains two columns: 'sections' and 'content'. The 'sections' column refers to different sections of the resume, such as 'Education', 'Work Experience', 'Skills', and others, while the 'content' column contains the actual content from these sections.

        Use the sections and content to answer the user's query '{query}'. If the answer involves multiple sections, consider the relevant information from all those sections. If the resume contains structured data like dates, job titles, or skills, provide the information in a clear and organized format.

        Follow these guidelines:
        1. If specific information (like education or experience) is requested, ensure to extract and present that.
        2. If a section contains relevant structured data (e.g., work experience with dates and roles), reformat and present it clearly.
        3. If the question cannot be answered from the resume, clearly mention that the information is not available.
        4. Always try to be concise, direct, and clear in your response.

        Provide your complete response first, followed by any relevant section or data references that may help the user locate the information in the resume.
        """},
    ]

    response = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=messages
    )

    return response.choices[0].message.content.split('\n')

In [72]:
response = generate_response(query, top_3_RAG)

In [73]:
print("\n".join(response))

Based on the provided resume, the total years of experience is not explicitly mentioned. However, we can analyze the 'Work Experience' section to gather relevant information.

The 'Work Experience' section starts at section 5 with the content: "customer\nsatisfaction\n\u2022 Working experie...". Unfortunately, it seems that the given data doesn't mention a clear start or end date for this work experience. Therefore, we cannot accurately determine the total years of experience from this section.

However, there is another 'Work Experience' section at row 0 with content: "skills and Inter-personal abilities. EDUCATION...   \t Metadatas   \'Chunk_No.\': 2, 'Page_No.': 'Page 1\'". But since it follows an 'EDUCATION' section, the content seems out of context due to a punctuation error in your prompt.

Since we cannot calculate a clear total years of experience based on the available data, I can suggest that you review and ensure accuracy when filling in such sections.


In [74]:
def search(query):

  # Set a threshold for cache search
  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        size = len(results.items())

        for key, val in results.items():
          if val is None:
            continue
          for i in range(size):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        #print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        return results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        #print("Found in cache!")

        # Create a DataFrame
        return pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })

In [75]:
def apply_cross_encoder(query, df):
  cross_inputs = [[query, response] for response in df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)
  df['Reranked_scores'] = cross_rerank_scores
  return df

In [76]:
def get_topn(n, df):
  top_3_rerank = df.sort_values(by='Reranked_scores', ascending=False)
  return top_3_rerank[["Documents", "Metadatas"]][:n]

In [77]:
query = 'what are the projects in the resume'
df = search(query)
df = apply_cross_encoder(query, df)
df = get_topn(3, df)
response = generate_response(query, df)
print("\n".join(response))

**Project Information from Resume**

After analyzing the provided resume sections, I found information about projects in the following sections:

### Work Experience - Project Associate
The resume mentions the role of a "Position Associate-Projects" with relevant work experience. Unfortunately, the specific details about this project are not fully provided in the given content.

However, there is an additional section that provides more explicit information:

### Projects
One section explicitly states:
`• Played as an Associate Projects and involved...`

Unfortunately, the text is truncated, but it appears to be a brief description of one or possibly multiple projects. The project details are limited due to textual truncation in the provided content.

While there are no fully detailed project descriptions with dates, job titles, or specific achievements available from the resume sections, the information mentioned suggests that there were relevant projects involved during this role.


In [78]:
import gradio as gr
import pandas as pd

def chatbot_response(query):
    df = search(query)
    df = apply_cross_encoder(query, df)
    df = get_topn(3, df)
    response = generate_response(query, df)
    return "\n".join(response)

# Create the Gradio interface
iface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(lines=2, placeholder="Ask about the resume..."),
    outputs=gr.Textbox(),
    title="Resume Insights Chatbot",
    description="Ask me anything about the resume, and I'll provide answers based on the extracted document!"
)

# Launch the chatbot with a shareable link
iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7874
* Running on public URL: https://64a33218037fc9c4dd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


