# Resume Semantic Search Project

In [1]:
# Install pinned PyTorch, torchvision and torchaudio builds from the CUDA 11.8 wheel index.
# The explicit %pip magic targets the running kernel's environment and does not
# depend on IPython automagic being enabled for this cell.
%pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

# Pick the compute device: the CUDA GPU when one is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Install the packages this notebook imports.
# - %pip (rather than a bare `pip`) is used because the bare form relies on
#   automagic, which is not applied to a cell that also contains other lines.
# - python-dotenv provides the `dotenv` module imported further down.
# NOTE(review): the `langchain.vectorstores` / `langchain.llms` imports below need
# the `langchain` package, which is not installed here - add and pin it if those
# imports are kept.

%pip install -q langchain-openai faiss-cpu streamlit pdfplumber python-dotenv

In [16]:
# Importing required libraries

import os
import faiss
import numpy as np
import streamlit as st
import pdfplumber
from langchain_openai import OpenAIEmbeddings
# from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI

In [8]:
# Load environment variables
#
# find_dotenv() locates a .env file and load_dotenv() reads it into the process
# environment; override=True is passed so values from the file take precedence
# over variables that are already set.
# NOTE(review): presumably this is how the OpenAI API key reaches
# OpenAIEmbeddings() - confirm the .env file defines it.

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

True

In [9]:
# Function to load resumes from a folder and extract text

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: The text of all pages that yielded any text, in page order,
        separated by a newline. An empty string when no page has text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without a text layer (e.g. a scanned image) can yield None or
        # an empty string; normalise both to "" so joining never raises.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(text for text in page_texts if text)

def load_resumes(resume_folder):
    """Read every PDF in a folder and return its extracted text.

    Args:
        resume_folder: Directory to scan (not recursive).

    Returns:
        list[tuple[str, str]]: ``(filename, text)`` pairs, ordered by filename.

    Raises:
        OSError: If ``resume_folder`` cannot be listed.
    """
    resumes = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # position of each resume stable between runs.
    for filename in sorted(os.listdir(resume_folder)):
        file_path = os.path.join(resume_folder, filename)
        # Match the extension case-insensitively (".PDF" is common on Windows)
        # and skip directories whose name merely ends in ".pdf".
        if not filename.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue
        resumes.append((filename, extract_text_from_pdf(file_path)))
    return resumes


In [10]:
# Function to generate embeddings for each resume

def generate_embeddings(resumes, embedding_model):
    """Embed the text of every resume.

    Args:
        resumes: Iterable of ``(filename, text)`` pairs.
        embedding_model: LangChain-style embedding model exposing
            ``embed_documents(list_of_texts)``.

    Returns:
        list[tuple[str, list[float]]]: ``(filename, embedding)`` pairs in the
        same order as ``resumes``. Empty when ``resumes`` is empty.
    """
    resumes = list(resumes)
    if not resumes:
        # Nothing to embed; avoid calling the model with an empty batch.
        return []

    filenames = [filename for filename, _ in resumes]
    texts = [text for _, text in resumes]
    # Resumes are the corpus being searched, so they go through the document
    # API in one batch; embed_query is reserved for the search query itself.
    vectors = embedding_model.embed_documents(texts)
    return list(zip(filenames, vectors))

# Function to create a FAISS index from the embeddings

def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding every embedding vector.

    Args:
        embeddings: Sequence of ``(filename, vector)`` pairs; all vectors must
            have the same length. Row ``i`` of the index corresponds to
            ``embeddings[i]``.

    Returns:
        faiss.IndexFlatL2: Index containing one float32 row per embedding.

    Raises:
        ValueError: If ``embeddings`` is empty or the vectors do not form a
            2-D matrix.
    """
    vectors = [embed for _, embed in embeddings]
    if not vectors:
        # Without this check the dimension lookup below fails with an
        # unhelpful IndexError on shape[1].
        raise ValueError("Cannot build a FAISS index: no embeddings were provided.")

    embedding_vectors = np.array(vectors, dtype='float32')
    if embedding_vectors.ndim != 2:
        raise ValueError("Embeddings must all be vectors of the same length.")

    index = faiss.IndexFlatL2(embedding_vectors.shape[1])
    index.add(embedding_vectors)
    return index

In [11]:
# Function to search for resumes based on a query

def search_resumes(query, embedding_model, index, embeddings, top_k=5):
    """Return the resumes closest to a free-text query.

    Args:
        query: Search text.
        embedding_model: Model exposing ``embed_query(text)``.
        index: FAISS index whose rows line up with ``embeddings``.
        embeddings: Sequence of ``(filename, vector)`` pairs used to map index
            rows back to filenames.
        top_k: Maximum number of results to return.

    Returns:
        list[tuple[str, float]]: ``(filename, score)`` pairs, nearest first,
        where ``score = 1 / (1 + distance)``. At most
        ``min(top_k, len(embeddings))`` entries.
    """
    # Never ask for more neighbours than there are resumes: FAISS pads the
    # missing slots with label -1, which would index embeddings[-1] and report
    # the last resume as an extra, bogus hit.
    k = min(int(top_k), len(embeddings))
    if k <= 0:
        return []

    query_embedding = np.array([embedding_model.embed_query(query)]).astype('float32')
    distances, indices = index.search(query_embedding, k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # Skip padding labels (-1) and rows that have no matching filename.
        if idx < 0 or idx >= len(embeddings):
            continue
        score = 1.0 / (1.0 + float(distance))
        results.append((embeddings[idx][0], score))
    return results

In [12]:
# Console-based search function that keeps asking for queries

def console_search(resume_folder, embedding_model, index, embeddings):
    """Prompt for queries on the console until a blank one is entered.

    Each non-blank query is passed to ``search_resumes`` and the returned
    ``(filename, score)`` pairs are printed, one per line.

    Args:
        resume_folder: Not used by this function.
        embedding_model: Forwarded to ``search_resumes``.
        index: Forwarded to ``search_resumes``.
        embeddings: Forwarded to ``search_resumes``.
    """
    prompt = "\nEnter your search query (or press Enter to exit): "

    query = input(prompt)
    while query.strip():
        matches = search_resumes(query, embedding_model, index, embeddings)

        print("\nSearch Results:")
        if not matches:
            print("No matching resumes found.")
        for name, similarity in matches:
            print(f"Filename: {name}, Similarity Score: {similarity:.4f}")

        query = input(prompt)

    # A blank (or whitespace-only) query ends the session.
    print("Exiting the program.")


In [13]:
# Streamlit-based search function for a web interface

def streamlit_search(resume_folder, embedding_model, index, embeddings):
    """Render a Streamlit page that searches the indexed resumes.

    Shows a query box, a result-count slider (1-20, default 5) and a Search
    button; on click the matches returned by ``search_resumes`` are written to
    the page.

    Args:
        resume_folder: Not used by this function.
        embedding_model: Forwarded to ``search_resumes``.
        index: Forwarded to ``search_resumes``.
        embeddings: Forwarded to ``search_resumes``.
    """
    st.title("Resume Semantic Search")

    query = st.text_input("Enter your query:")
    top_k = st.slider("Number of results:", 1, 20, 5)

    if st.button("Search"):
        # Same rule as console_search: a blank query is never sent to the
        # embedding model.
        if not query or not query.strip():
            st.warning("Please enter a search query.")
            return

        results = search_resumes(query, embedding_model, index, embeddings, top_k)

        st.write("Search Results:")
        if results:
            for filename, score in results:
                st.write(f"**{filename}** - Similarity Score: {score:.4f}")
        else:
            st.write("No matching resumes found.")

In [14]:
# Entry point: load the resumes, build the index and start the console search

if __name__ == "__main__":
    # Folder holding the PDF resumes. Set the RESUME_FOLDER environment
    # variable to use another location; the original path is the fallback.
    resume_folder = os.environ.get(
        "RESUME_FOLDER",
        r"C:\Users\rahul\OneDrive\Desktop\DS RESUME\Bangalore",
    )

    # Load the embedding model
    embedding_model = OpenAIEmbeddings()  # or any other LangChain-compatible embedding model

    # Load resumes and generate embeddings
    resumes = load_resumes(resume_folder)
    if not resumes:
        # Fail here with a clear message instead of deeper inside index creation.
        raise FileNotFoundError(f"No PDF resumes found in {resume_folder!r}")
    embeddings = generate_embeddings(resumes, embedding_model)

    # Create a FAISS index from the embeddings
    index = create_faiss_index(embeddings)

    # For console-based search
    console_search(resume_folder, embedding_model, index, embeddings)
    


Enter your search query (or press Enter to exit):  power bi



Search Results:
Filename: RAHULKISHORE_Data Analyst.pdf, Similarity Score: 0.6596
Filename: RAHULKISHORE_AI Engineer.pdf, Similarity Score: 0.6465
Filename: RAHULKISHORE_Data Scientist.pdf, Similarity Score: 0.6461
Filename: RAHULKISHORE_ML Engineer.pdf, Similarity Score: 0.6416
Filename: RAHULKISHORE_Python Developer.pdf, Similarity Score: 0.6383



Enter your search query (or press Enter to exit):  


Exiting the program.


In [None]:
# For Streamlit-based search
# Uses resume_folder, embedding_model, index and embeddings created by the main
# cell above, so that cell must have run first.
# NOTE(review): Streamlit apps are normally started with `streamlit run <script>.py`;
# calling this from a notebook cell presumably does not render the UI - confirm.

streamlit_search(resume_folder, embedding_model, index, embeddings)