<a href="https://colab.research.google.com/github/Ujjwalpreet-Singh/sih-googlecolab/blob/main/Similarity_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# `%pip` is the explicit IPython magic; a bare `pip install` only works through
# automagic, which stops applying once the cell holds more than one line.
# sentence-transformers and scikit-learn are added because later cells import
# `sentence_transformers` and `sklearn`.
%pip install pypdf python-docx pytesseract Pillow google-generativeai ipywidgets sentence-transformers scikit-learn

In [None]:
# Mount Google Drive at /content/drive. The resume and job folders read by the
# following cell live under /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json # Added import for json

# Extensions accepted as resume / job input. The leading dot matters: a bare
# 'json' suffix would also match names such as "notesjson", which the reading
# steps (they test for '.json') would then treat as plain text.
SUPPORTED_EXTENSIONS = ('.txt', '.pdf', '.json')


def list_supported_files(folder_path, label):
    """Return the sorted names of supported files directly inside *folder_path*.

    Prints the status lines this step reports: the list of files found, a
    "no files" notice, or an error when the folder does not exist.

    Args:
        folder_path: Directory to scan (not recursive).
        label: Word used in the "no files" message, e.g. ``"resume"``.

    Returns:
        File names (not full paths) ending in one of SUPPORTED_EXTENSIONS,
        sorted so that "the first file" is the same on every run. An empty
        list when the folder is missing or holds no matching file.
    """
    # isdir (rather than exists) so that a plain file at this path is reported
    # as "not found" instead of making os.listdir raise.
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' not found.")
        return []

    matches = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith(SUPPORTED_EXTENSIONS)
    )
    if matches:
        print(matches)
    else:
        print(f"No {label} files found or folder is empty.")
    return matches


# 2. Construct the full paths to the 'processed resumes' and 'processed jobs' subfolders
resumes_folder_path = "/content/drive/MyDrive/SIH-Project/processed_resumes"
jobs_folder_path = "/content/drive/MyDrive/SIH-Project/processed-jobs"

print(f"\nAttempting to access resume folder: {resumes_folder_path}")
print(f"Attempting to access jobs folder: {jobs_folder_path}")

# 3. List the contents of both subfolders to confirm that files are present
print("\nListing contents of 'processed resumes' folder:")
resume_files = list_supported_files(resumes_folder_path, "resume")

print("\nListing contents of 'processed jobs' folder:")
job_files = list_supported_files(jobs_folder_path, "job")

# 4. From the 'processed resumes' folder, select one resume file and read its content
selected_resume_content = ""
# Always define resume_data, so later cells that inspect it do not fail with a
# NameError when the selected resume is not JSON or could not be read.
resume_data = {}
if resume_files:
    # Only the first listed resume is used.
    selected_resume_file = os.path.join(resumes_folder_path, resume_files[0])
    try:
        with open(selected_resume_file, 'r', encoding='utf-8') as f:
            if selected_resume_file.endswith('.json'):
                parsed_resume = json.load(f)
                # The whole document, re-serialised, is the text used for the
                # full-resume embedding; field-level extraction happens later.
                selected_resume_content = json.dumps(parsed_resume)
                if isinstance(parsed_resume, dict):
                    resume_data = parsed_resume
                else:
                    # Later cells look fields up by key, which needs a JSON object.
                    print(f"Warning: {resume_files[0]} is not a JSON object; structured resume fields are unavailable.")
            else:
                # Non-JSON files are read as UTF-8 text. A binary .pdf will
                # normally fail to decode and be reported by the handler below.
                selected_resume_content = f.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: invalid JSON or bytes
        # that are not valid UTF-8 (both error types derive from ValueError).
        print(f"Error reading resume file {selected_resume_file}: {e}")
    else:
        print(f"\nSuccessfully read resume file: {resume_files[0]}")
        print("First few lines of resume content:")
        print('\n'.join(selected_resume_content.splitlines()[:5]))
else:
    print("\nNo resume files available to read.")

def job_embedding_text(job_obj):
    """Return the text embedded for one job record: its skills, comma-separated.

    Args:
        job_obj: A job dictionary; only its 'skills' entry is read.

    Returns:
        The skills joined with ", " (surrounding whitespace stripped), or an
        empty string when 'skills' is missing or is not a list.
    """
    skills = job_obj.get('skills')
    if not isinstance(skills, list):
        return ''
    # str() keeps one non-string skill from aborting the whole file.
    return ', '.join(str(skill) for skill in skills).strip()


# 5. From the 'processed jobs' folder, iterate through all files and read their content
job_postings_content = []   # text to embed, one entry per posting
job_postings_raw_data = []  # parsed job dicts (JSON postings only)
if job_files:
    for job_file_name in job_files:
        job_file_path = os.path.join(jobs_folder_path, job_file_name)
        is_json = job_file_name.endswith('.json')
        try:
            with open(job_file_path, 'r', encoding='utf-8') as f:
                payload = json.load(f) if is_json else f.read()
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes
            # (e.g. a binary .pdf opened as UTF-8 text).
            print(f"Error reading job file {job_file_path}: {e}")
            continue

        if not is_json:
            # NOTE: plain-text postings have no structured record, so they are
            # added to job_postings_content only; the two lists can therefore
            # differ in length when such files are present.
            job_postings_content.append(payload)
            continue

        # A file may hold a list of job objects or one job object on its own.
        records = payload if isinstance(payload, list) else [payload]
        for job_obj in records:
            if not isinstance(job_obj, dict):
                # Later cells call .get() on every stored record.
                print(f"Skipping non-object entry in {job_file_path}: {type(job_obj).__name__}")
                continue
            job_postings_content.append(job_embedding_text(job_obj))
            job_postings_raw_data.append(job_obj)  # Store the original job dict

    print(f"\nSuccessfully read {len(job_postings_content)} individual job postings.")
    if job_postings_content:
        print("First job posting content snippet:")
        print(job_postings_content[0][:200] + '...')  # Print first 200 chars
else:
    print("\nNo job files available to read.")

print(job_postings_content)


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Echo the inputs produced by the file-reading cell before embedding them
print(job_postings_content)
print(selected_resume_content)

# 3. Load a pre-trained Sentence-BERT model
# The name lives in one constant so the log line always names the model that
# is actually loaded (the message previously named a different checkpoint).
# NOTE(review): the "dot" in this checkpoint's name suggests it was tuned for
# dot-product scoring, while later cells rank with cosine similarity; confirm
# against the model card that this pairing is intended.
EMBEDDING_MODEL_NAME = 'multi-qa-mpnet-base-dot-v1'
print(f"Loading Sentence-BERT model '{EMBEDDING_MODEL_NAME}'...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model loaded successfully.")

# 4. Generate the embedding for the selected resume content
resume_embedding = None  # stays None when there is no resume text
if selected_resume_content:
    print("Generating embedding for the selected resume...")
    resume_embedding = model.encode(selected_resume_content)
    print(f"Resume embedding generated with shape: {resume_embedding.shape}")
else:
    print("Warning: No resume content found. Please ensure the file reading step was successful (cell ID cde87e8e).")

# 5. Generate embeddings for all job postings
job_embeddings = []  # stays an empty list when there are no postings
if job_postings_content:
    print(f"Generating embeddings for {len(job_postings_content)} job postings...")
    job_embeddings = model.encode(job_postings_content)
    print(f"Job embeddings generated with shape: {job_embeddings.shape}")
else:
    print("Warning: No job postings content found. Please ensure the file reading step was successful (cell ID cde87e8e).")

In [None]:
import json

def collect_text(values):
    """Return the string items of *values*, or [] when *values* is not a list.

    Non-string items are dropped because the result is later passed to
    ``' '.join``, which rejects anything that is not a string.
    """
    if not isinstance(values, list):
        return []
    return [item for item in values if isinstance(item, str)]


# 1. Extract and combine focused content from resume_data
# Key lookups below need a dict; any other JSON shape yields no focused text.
resume_record = resume_data if isinstance(resume_data, dict) else {}

# Skills first, then each experience entry's description
focused_resume_content = collect_text(resume_record.get('skills'))

experience_entries = resume_record.get('experience')
if isinstance(experience_entries, list):
    for exp in experience_entries:
        if isinstance(exp, dict) and isinstance(exp.get('description'), str):
            focused_resume_content.append(exp['description'])

focused_resume_content_str = ' '.join(focused_resume_content).strip()

print(f"\nGenerated focused resume content (first 200 chars): {focused_resume_content_str[:200]}...")

# 2. Extract and combine focused content from job_postings_raw_data
# Exactly one entry is produced per job (possibly an empty string) so that
# positions stay aligned with job_postings_raw_data for the ranking cells.
focused_job_contents = []
for job_obj in job_postings_raw_data:
    job_fields = job_obj if isinstance(job_obj, dict) else {}
    job_focused_text = (
        collect_text(job_fields.get('skills'))
        + collect_text(job_fields.get('responsibilities'))
    )
    focused_job_contents.append(' '.join(job_focused_text).strip())

if focused_job_contents:
    print(f"Generated {len(focused_job_contents)} focused job contents. First one (first 200 chars): {focused_job_contents[0][:200]}...")
else:
    # Guarded: indexing [0] on an empty list would raise IndexError.
    print("Generated 0 focused job contents.")

# 3. Generate embedding for focused_resume_content
focused_resume_embedding = None  # stays None when there is nothing to embed
if focused_resume_content_str:
    print("\nGenerating embedding for focused resume content...")
    focused_resume_embedding = model.encode(focused_resume_content_str)
    print(f"Focused Resume embedding generated with shape: {focused_resume_embedding.shape}")
else:
    print("Warning: No focused resume content found.")

# 4. Generate embeddings for focused_job_contents
focused_job_embeddings = []  # stays an empty list when there are no jobs
if focused_job_contents:
    print(f"\nGenerating embeddings for {len(focused_job_contents)} focused job postings...")
    focused_job_embeddings = model.encode(focused_job_contents)
    print(f"Focused Job embeddings generated with shape: {focused_job_embeddings.shape}")
else:
    print("Warning: No focused job postings content found.")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the focused resume embedding with every focused job embedding.
# Result: focused_similarity_scores with one row (the resume) and one column
# per job posting.
if focused_resume_embedding is None or len(focused_job_embeddings) == 0:
    # The embedding cell leaves these as None / [] when it had no text to
    # embed; cosine_similarity cannot run on either, so report it and expose
    # an empty 1 x 0 score matrix instead of raising.
    print("Warning: Missing focused resume or job embeddings; no similarity scores computed.")
    focused_resume_embedding_reshaped = None
    focused_similarity_scores = np.empty((1, 0))
else:
    # cosine_similarity expects 2-D inputs; promote a 1-D resume vector to a
    # single-row matrix (an already 2-D array is passed through unchanged).
    focused_resume_embedding_reshaped = np.atleast_2d(focused_resume_embedding)

    # Calculate cosine similarity
    focused_similarity_scores = cosine_similarity(focused_resume_embedding_reshaped, focused_job_embeddings)

print("Cosine Similarity Scores (Focused Resume vs. Focused Job Postings):")
print(focused_similarity_scores)

In [None]:
num_top_matches = 5  # Display top 5 matches, or fewer if less are available
# Only postings whose similarity lies inside [low, high] are displayed
similarity_threshold_low = 0.55   # lower bound of the accepted similarity range
similarity_threshold_high = 0.95  # upper bound of the accepted similarity range

# Defined up front so that later cells can still iterate over it when the
# consistency check below fails.
scored_focused_jobs = []

# Pair each job with its score; both sequences must describe the same jobs
if len(job_postings_raw_data) == focused_similarity_scores.shape[1]:
    # (score, job dict) tuples, best match first. sorted() is stable, so jobs
    # with equal scores keep their original relative order.
    scored_focused_jobs = sorted(
        zip(focused_similarity_scores[0], job_postings_raw_data),
        key=lambda pair: pair[0],
        reverse=True,
    )

    print(f"\nTop {min(num_top_matches, len(scored_focused_jobs))} Matching Job Postings (Focused Embeddings) around {similarity_threshold_low}-{similarity_threshold_high} similarity mark:\n")

    filtered_jobs_count = 0
    for score, job_data in scored_focused_jobs:
        if filtered_jobs_count >= num_top_matches:
            break  # Enough matches have been displayed
        if not (similarity_threshold_low <= score <= similarity_threshold_high):
            continue

        filtered_jobs_count += 1
        print(f"--- Rank {filtered_jobs_count} ---")
        print(f"Similarity Score: {score:.4f}")
        print(f"Job Title: {job_data.get('job_title', 'N/A')}")
        print(f"Company: {job_data.get('company_name', 'N/A')}")

        # Display skills if available
        skills = job_data.get('skills', 'N/A')
        if isinstance(skills, list):
            print(f"Skills: {', '.join(str(skill) for skill in skills)}")
        else:
            print(f"Skills: {skills}")

        # Display a snippet of the original description. The value is coerced
        # to text first because len() and slicing fail on e.g. a JSON null.
        description_snippet = job_data.get('original_description', 'N/A')
        if description_snippet is None:
            description_snippet = 'N/A'
        description_snippet = str(description_snippet)
        if len(description_snippet) > 200:
            description_snippet = description_snippet[:200] + '...'
        print(f"Description Snippet: {description_snippet}")
        print("---------------------\n")

    if filtered_jobs_count == 0:
        print("No job postings found within the specified similarity range.")

else:
    print("Error: Mismatch between number of job postings and focused similarity scores.")
    print(f"Jobs found: {len(job_postings_raw_data)}, Scores found: {focused_similarity_scores.shape[1]}")

In [None]:
# 1. Collect every distinct skill named by the resume or by any job posting.
# A dict is used as an insertion-ordered set: unlike a plain set of strings,
# its order does not change between interpreter sessions, so the list built
# below (and anything indexed by position in it) is reproducible.
# Only string skills are kept, since the list is later handed to the
# embedding model as text.
unique_skills_in_order = {}

# 2. Extract skills from resume_data
resume_skills = resume_data.get('skills') if isinstance(resume_data, dict) else None
if isinstance(resume_skills, list):
    unique_skills_in_order.update(
        dict.fromkeys(skill for skill in resume_skills if isinstance(skill, str))
    )
    print(f"Added {len(resume_skills)} skills from resume_data.")
else:
    print("No skills found in resume_data or skills not in list format.")

# 3. Iterate through job_postings_raw_data and extract skills
job_skills_count = 0  # counts every listed skill, duplicates included
for job_obj in job_postings_raw_data:
    job_skills = job_obj.get('skills') if isinstance(job_obj, dict) else None
    if isinstance(job_skills, list):
        # update() leaves already-seen skills at their first position
        unique_skills_in_order.update(
            dict.fromkeys(skill for skill in job_skills if isinstance(skill, str))
        )
        job_skills_count += len(job_skills)
print(f"Added {job_skills_count} skills (including duplicates) from all job postings.")

# 4. Expose the result as both a set and a list, then print a summary
all_unique_skills = set(unique_skills_in_order)
all_unique_skills_list = list(unique_skills_in_order)
print(f"\nTotal number of unique skills collected: {len(all_unique_skills_list)}")
print(f"First 10 unique skills: {all_unique_skills_list[:10]}")

In [None]:
# Embed every unique skill with a single encode() call.
# Later code finds a skill's vector through its position in
# all_unique_skills_list, so that list must not be reordered after this point.
print(f"Generating embeddings for {len(all_unique_skills_list)} unique skills...")
skill_embeddings = model.encode(all_unique_skills_list, show_progress_bar=True)
print(f"Skill embeddings generated with shape: {skill_embeddings.shape}")

In [None]:
# Embed the applicant's own skills. When the resume has no 'skills' list an
# empty list is encoded instead.
resume_has_skill_list = 'skills' in resume_data and isinstance(resume_data['skills'], list)
applicant_skills_for_embedding = resume_data['skills'] if resume_has_skill_list else []

print(f"Generating embeddings for {len(applicant_skills_for_embedding)} applicant skills...")
applicant_skill_embeddings = model.encode(
    applicant_skills_for_embedding,
    show_progress_bar=True,
)
print(f"Applicant skill embeddings generated with shape: {applicant_skill_embeddings.shape}")

In [None]:
semantic_similarity_threshold = 0.5  # Define a semantic similarity threshold for individual skills
# Overall resume-vs-job similarity window a posting must fall into to be
# analysed. Named here so the filter and the final message cannot disagree.
overall_similarity_low = 0.55
overall_similarity_high = 0.70


def find_semantically_missing_skills(job_skills, skill_rows, all_skill_embeddings,
                                     applicant_embeddings, threshold):
    """Return the job skills that no applicant skill is semantically close to.

    Args:
        job_skills: Skills listed by one job posting.
        skill_rows: Mapping from skill to its row in *all_skill_embeddings*.
        all_skill_embeddings: 2-D array holding one embedding per known skill.
        applicant_embeddings: Array of the applicant's skill embeddings; may
            be empty.
        threshold: A job skill counts as covered when its best cosine
            similarity to any applicant skill is at least this value.

    Returns:
        The missing skills, in the order they appear in *job_skills*. A skill
        without an embedding is always reported as missing, and every skill
        is missing when the applicant has no skill embeddings.
    """
    has_applicant_skills = applicant_embeddings.size > 0
    missing = []
    for job_skill in job_skills:
        try:
            row = skill_rows[job_skill]
        except (KeyError, TypeError):
            # Not in the embedded vocabulary (or not hashable): nothing to
            # compare against, so treat it as missing.
            missing.append(job_skill)
            continue

        if not has_applicant_skills:
            missing.append(job_skill)
            continue

        # cosine_similarity needs 2-D input, hence the single-row reshape
        similarities = cosine_similarity(
            all_skill_embeddings[row].reshape(1, -1), applicant_embeddings
        )
        # Best match against any applicant skill decides coverage
        if np.max(similarities) < threshold:
            missing.append(job_skill)
    return missing


# Build the skill -> row lookup once instead of scanning the list with
# list.index() for every job skill. setdefault keeps the first row if a skill
# were ever listed twice, which is what list.index() returned.
skill_row_by_name = {}
for row_number, skill_name in enumerate(all_unique_skills_list):
    skill_row_by_name.setdefault(skill_name, row_number)

print(f"\nIdentifying semantically missing skills for relevant job postings (semantic similarity threshold: {semantic_similarity_threshold}):\n")

filtered_jobs_count = 0
for score, job_data in scored_focused_jobs:
    if filtered_jobs_count >= num_top_matches:
        break  # Enough postings have been analysed
    if not (overall_similarity_low <= score <= overall_similarity_high):
        continue

    filtered_jobs_count += 1
    print(f"--- Rank {filtered_jobs_count} ---")
    print(f"Overall Similarity Score: {score:.4f}")
    print(f"Job Title: {job_data.get('job_title', 'N/A')}")

    job_skills_list = job_data.get('skills', [])
    if isinstance(job_skills_list, list) and job_skills_list:
        semantically_missing_skills = find_semantically_missing_skills(
            job_skills_list,
            skill_row_by_name,
            skill_embeddings,
            applicant_skill_embeddings,
            semantic_similarity_threshold,
        )
        if semantically_missing_skills:
            print(f"Semantically missing skills for this role: {', '.join(str(skill) for skill in semantically_missing_skills)}")
        else:
            print("No semantically missing skills identified for this role.")
    else:
        print("No skills listed for this job posting or format is incorrect.")
    print("---------------------\n")

if filtered_jobs_count == 0:
    # Report the range that was actually applied above
    print(f"No job postings found within the specified overall similarity range ({overall_similarity_low:.2f}-{overall_similarity_high:.2f}).")

In [None]:
# Add /content to the module search path so that resume_parser can be
# imported from there, then call its process() function.
import sys
sys.path.append('/content')
from resume_parser import process
# NOTE(review): "resume.txt" is presumably a file path; being relative, it
# would resolve against the kernel's working directory. Confirm against
# resume_parser.process.
process("resume.txt")