# Resume Semantic Search Project

In [None]:
pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118

In [None]:
import torch

# Pick the compute device: CUDA when a GPU is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
#Installing required packages 

pip install pandas faiss-cpu streamlit sentence-transformers requests pdfplumber tqdm -q

In [None]:
# Importing required libraries

import os
import re

import faiss
import numpy as np
import pandas as pd
import pdfplumber
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer

In [None]:
def download_file_from_google_drive(file_id, destination):
    """Download one Google Drive file and write it to ``destination``.

    The file is requested from Drive's direct-download endpoint. If the first
    response carries a ``download_warning*`` cookie (see ``get_confirm_token``),
    the request is repeated with that value as the ``confirm`` parameter before
    the body is saved.

    Args:
        file_id: Id of the individual Drive file to fetch (not a folder id).
        destination: Local path the response body is written to.

    Returns:
        None. The response body is written to ``destination`` as a side effect.
    """
    # Direct-download endpoint. The previous value pointed at a Drive *folder*
    # page, so the saved "PDF" was that folder's HTML rather than the file.
    URL = "https://drive.google.com/uc"
    params = {'export': 'download', 'id': file_id}

    # The session carries cookies from the first request to the second and is
    # closed once the body has been fully written.
    with requests.Session() as session:
        response = session.get(URL, params=params, stream=True)
        token = get_confirm_token(response)

        if token:
            # Release the unread first response, then retry with the confirmation token.
            response.close()
            params['confirm'] = token
            response = session.get(URL, params=params, stream=True)

        # Must run inside the ``with`` block: the body is streamed lazily.
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, or ``None``.

    Args:
        response: Object exposing a ``cookies`` mapping (``.items()`` is used).

    Returns:
        The cookie value whose name starts with ``download_warning``, or
        ``None`` when no such cookie is present.
    """
    warning_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(warning_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    Args:
        response: Object whose ``iter_content(chunk_size)`` yields byte chunks.
        destination: Path of the file to create or overwrite (opened in ``"wb"`` mode).
    """
    chunk_size = 32768
    with open(destination, "wb") as out_file:
        # ``filter(None, ...)`` drops empty chunks before they are written.
        out_file.writelines(filter(None, response.iter_content(chunk_size)))

def _extract_drive_file_id(link):
    """Return the Google Drive file id embedded in ``link``, or ``None`` if none is found."""
    # Share links of the form ``.../file/d/<id>`` or ``.../file/d/<id>/view?...``.
    match = re.search(r"/d/([A-Za-z0-9_-]+)", link)
    if match is None:
        # Links that carry the id as a query parameter, e.g. ``...open?id=<id>``.
        match = re.search(r"[?&]id=([A-Za-z0-9_-]+)", link)
    return match.group(1) if match else None


def load_resumes_from_links(links):
    """Download every Google Drive file referenced by ``links``.

    Each file is saved in the current working directory as ``<file_id>.pdf``.

    Args:
        links: Iterable of Google Drive *file* links (``/d/<id>`` or ``id=<id>``).

    Returns:
        List of local file paths, one per link a file id could be extracted
        from, in input order. Links without a recognisable file id (for example
        folder links) are reported on stdout and skipped, rather than being
        downloaded under a bogus id.
    """
    resumes = []
    for link in links:
        file_id = _extract_drive_file_id(link)
        if file_id is None:
            print(f"Skipping link without a Google Drive file id: {link}")
            continue

        destination = f"{file_id}.pdf"
        download_file_from_google_drive(file_id, destination)
        resumes.append(destination)
    return resumes


In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded text, joined with newlines so words
        at a page boundary are not fused together. Returns ``""`` when no page
        yields text or when the file cannot be read (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Drop pages with no text (``None`` or ``""``) so a PDF without any
        # extractable text still produces an empty, falsy result.
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        # Best effort: report the unreadable file and let the caller skip it.
        print(f"Error reading {pdf_path}: {e}")
        return ""

def embed_resumes(resumes):
    """Embed the text of each resume PDF with the ``all-MiniLM-L6-v2`` model.

    Args:
        resumes: Iterable of PDF file paths.

    Returns:
        List with one embedding vector per resume whose text could be
        extracted, in input order. Resumes with no extractable text are
        skipped, so the result can be shorter than ``resumes`` and position
        ``i`` of the result does not necessarily belong to ``resumes[i]``.
        Returns ``[]`` when no resume yields text.
    """
    texts = []
    for resume in resumes:
        text = extract_text_from_pdf(resume)
        if text:  # Empty text means extraction failed or the PDF has no text layer.
            texts.append(text)

    if not texts:
        # Nothing to embed: skip loading the model altogether.
        return []

    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Encode all texts in one batched call; list() splits the result back into
    # one vector per resume.
    return list(model.encode(texts))


In [None]:
def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index over ``embeddings``.

    Args:
        embeddings: Non-empty sequence of equal-length embedding vectors.

    Returns:
        A ``faiss.IndexFlatL2`` containing the vectors in input order, so
        index row ``i`` corresponds to ``embeddings[i]``.

    Raises:
        ValueError: If ``embeddings`` is empty or does not form a 2-D matrix.
    """
    # FAISS expects a C-contiguous float32 matrix; convert explicitly so
    # float64 or list inputs are accepted too.
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty sequence of equal-length vectors")

    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)

    return index


In [None]:
def search_resumes(query, index, resumes, k=10):
    """Return the resumes closest to ``query`` according to the FAISS index.

    Args:
        query: Free-text search query.
        index: FAISS index whose row ``i`` corresponds to ``resumes[i]``.
        resumes: Sequence of resume identifiers aligned with the index rows.
        k: Maximum number of results to return (default 10).

    Returns:
        Up to ``k`` entries of ``resumes`` in the order returned by
        ``index.search`` (nearest first for an L2 index). Returns ``[]`` when
        the index is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, which would silently pick ``resumes[-1]``.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query)
    # FAISS expects a 2-D float32 array: one row per query vector.
    query_matrix = np.asarray([query_embedding], dtype=np.float32)
    _, neighbor_ids = index.search(query_matrix, k=k)

    # Guard against padding labels and against rows that have no matching resume.
    return [resumes[i] for i in neighbor_ids[0] if 0 <= i < len(resumes)]


In [None]:
def display_results_console(query, results):
    """Print the query followed by each search result, one per line, to stdout.

    Args:
        query: The search query that produced ``results``.
        results: Iterable of results; each one is printed on its own line.
    """
    output_lines = [f"Query: {query}"]
    output_lines.extend(str(entry) for entry in results)
    print("\n".join(output_lines))

In [None]:
def main():
    """Render the Streamlit search page and run a search when requested.

    Shows a title, a query box and a "Search" button. On click, the resumes
    are downloaded, embedded and indexed, and the matches for the query are
    written both to the page and to the console.
    """
    st.title("Resume Semantic Search")
    query = st.text_input("Enter your query:")

    # Nothing to do until the user presses the button.
    if not st.button("Search"):
        return

    # A blank query has no meaning to search for; ask for input instead.
    if not query or not query.strip():
        st.write("Please enter a search query.")
        return

    # NOTE(review): this is a Drive *folder* link, while the download helpers
    # work on individual file ids - replace with per-file share links.
    links = ["https://drive.google.com/drive/folders/1A0GPGrH7rLlAFNJ-RftYzLp28fJ0kKSU"]
    resumes = load_resumes_from_links(links)

    # embed_resumes() skips resumes without extractable text, so FAISS row i
    # only maps to the i-th resume if those are removed up front. This parses
    # each PDF twice, which keeps the reported file names correct.
    searchable_resumes = [resume for resume in resumes if extract_text_from_pdf(resume)]
    embeddings = embed_resumes(searchable_resumes)

    if not embeddings:
        st.write("No valid resumes found or embeddings were not created.")
        return

    index = build_faiss_index(embeddings)
    results = search_resumes(query, index, searchable_resumes)

    # Show the matches in the page, then mirror them to the console.
    for result in results:
        st.write(result)
    display_results_console(query, results)

# Entry point: call main() when this code runs as the top-level module
# (``__name__`` is "__main__"), but not when it is imported from elsewhere.
if __name__ == "__main__":
    main()