In [1]:
# Cell 1: Install Dependencies
%pip install pdfplumber pymongo google-generativeai dnspython pytesseract Pillow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Cell 2: Import Libraries and Setup
import os
import json
import logging
import re
from datetime import datetime
from pymongo import MongoClient
from PIL import Image

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Hardcode credentials
GOOGLE_API_KEY = "AIzaSyBvRnSojVCuojgtGI7RisnW6-S4VpBYJWo"  # Your provided Gemini API key
MONGODB_URI = "mongodb+srv://shashi:VSXV9WDNmRvYnA7p@clusterskillgapanalysis.vnbcnju.mongodb.net/skillgapanalysis?retryWrites=true&w=majority"

# Import dependencies
try:
    import pdfplumber
except ImportError:
    raise ImportError("pdfplumber is not installed. Run: %pip install pdfplumber")

try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("google-generativeai is not installed. Run: %pip install google-generativeai")

try:
    import pytesseract
    # Explicitly set Tesseract path (adjust if installed elsewhere)
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
except ImportError:
    raise ImportError("pytesseract or PIL is not installed. Run: %pip install pytesseract Pillow")

In [3]:
# Cell 3: MongoDB Connection
def connect_to_mongodb():
    try:
        client = MongoClient(MONGODB_URI)
        db = client["skillgapanalysis"]
        collection = db["jobrole_skill"]
        collection.create_index("Job_Role")
        logging.info("Connected to MongoDB Atlas")
        return collection
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {str(e)}")
        raise ValueError(f"Failed to connect to MongoDB: {str(e)}")

In [4]:

# Cell 4: Enhanced CV Text Extraction
def extract_cv_text(cv_path):
    try:
        with pdfplumber.open(cv_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n\n"
                    logging.info(f"Extracted text from page {page.page_number} using pdfplumber")
                else:
                    logging.warning(f"No text extracted from page {page.page_number}. Using OCR.")
                    try:
                        img = page.to_image().original
                        ocr_text = pytesseract.image_to_string(img)
                        text += ocr_text + "\n\n"
                        logging.info(f"Extracted OCR text from page {page.page_number}")
                    except Exception as ocr_e:
                        logging.error(f"OCR failed for page {page.page_number}: {str(ocr_e)}")
                        text += "\n\n"
            if not text.strip():
                raise ValueError("No text extracted from CV. Check PDF format or Tesseract installation.")
            logging.info(f"Extracted text from CV: {text[:100]}...")
            return text
    except pdfplumber.pdfminer.pdfdocument.PDFPasswordIncorrect:
        logging.error("PDF is password-protected. Provide the password or use an unprotected PDF.")
        raise ValueError("PDF is password-protected")
    except Exception as e:
        logging.error(f"Error reading CV: {str(e)}")
        raise ValueError(f"Error reading CV: {str(e)}")

# Cell 5: Skills Section Extraction
def extract_skills_section(cv_text):
    try:
        match = re.search(
            r"(skills|technical skills|key skills|core competencies):?\s*(.*?)(?=\n\s*(experience|education|projects|contact|certifications|references|$|\n\n))",
            cv_text, re.IGNORECASE | re.DOTALL
        )
        if match:
            skills_text = match.group(2).strip()
            skills_lines = [line.strip() for line in skills_text.split('\n') if line.strip() and not re.match(r'^\s*$', line)]
            cleaned_skills = ' '.join(skills_lines).strip()
            logging.info(f"Extracted skills section: {cleaned_skills}")
            return cleaned_skills if cleaned_skills else cv_text
        logging.warning("No explicit skills section found. Using full CV text for Gemini.")
        return cv_text
    except Exception as e:
        logging.error(f"Error extracting skills section: {str(e)}")
        return cv_text

# Cell 6: Gemini Skill Extraction
def extract_skills_with_gemini(cv_text):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "You are an expert in CV analysis. Extract all skills from the following CV text, including: "
            "1. Explicit skills listed in a 'Skills', 'Technical Skills', or similar section (e.g., 'Python, TensorFlow'). "
            "2. Implicit skills inferred from 'Projects', 'Experience', or similar sections (e.g., 'Built a fraud detection model using scikit-learn' implies 'scikit-learn'). "
            "Return a plain JSON array of clean, distinct skills (e.g., capitalize 'PyTorch', 'AWS SageMaker'). "
            "Combine similar skills (e.g., 'ML' and 'Machine Learning' as 'Machine Learning'). "
            "Return [] if no skills are found or input is empty. "
            "Output must be a valid JSON array without ```json, backticks, or any other formatting. "
            "Example: Input: 'Skills: Python, ML\nProjects: Built a model using TensorFlow' "
            "Output: [\"Python\", \"Machine Learning\", \"TensorFlow\"]"
            "\n\nCV Text:\n" + cv_text
        )
        response = model.generate_content(prompt)
        raw_response = response.text.strip()
        logging.info(f"Gemini raw response: {raw_response}")
        cleaned_response = raw_response
        if cleaned_response.startswith("```json\n"):
            cleaned_response = cleaned_response[8:].strip()
        elif cleaned_response.startswith("```json"):
            cleaned_response = cleaned_response[7:].strip()
        if cleaned_response.endswith("\n```"):
            cleaned_response = cleaned_response[:-4].strip()
        elif cleaned_response.endswith("```"):
            cleaned_response = cleaned_response[:-3].strip()
        logging.info(f"Cleaned Gemini response: {cleaned_response}")
        try:
            skills = json.loads(cleaned_response)
            if not isinstance(skills, list):
                logging.error("Gemini response is not a JSON array")
                return []
            logging.info(f"Extracted skills: {skills}")
            return skills
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse cleaned Gemini response as JSON: {cleaned_response}, Error: {str(e)}")
            return []
    except Exception as e:
        logging.error(f"Error in Gemini skill extraction: {str(e)}")
        return []

# Cell 7: MongoDB Query
def get_required_skills(job_collection, job_role):
    try:
        job_doc = job_collection.find_one({"Job_Role": job_role})
        if not job_doc:
            raise ValueError(f"Job role '{job_role}' not found in database")
        required_skills = [skill.strip() for skill in job_doc["Required_Skills"].split(",")]
        logging.info(f"Required skills for {job_role}: {required_skills}")
        return required_skills
    except Exception as e:
        logging.error(f"Error retrieving required skills: {str(e)}")
        raise ValueError(f"Error retrieving required skills: {str(e)}")

# Cell 8: Skill Gap Analysis with Gemini Semantic Similarity
def skill_gap_analysis(cv_skills, required_skills):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "You are an expert in skill gap analysis. Given two lists of skills: "
            "1. CV skills (from a candidate's CV). "
            "2. Required skills (for a job role). "
            "Identify which required skills are missing from the CV skills, considering semantic similarity and synonyms. "
            "Treat skills as equivalent if they have the same or similar meaning (e.g., 'ML' ≈ 'Machine Learning', "
            "'SQL' ≈ 'Database Management', 'Statistical Analysis' ≈ 'Statistics', 'Deep Learning' ≈ 'Neural Networks'). "
            "Return a plain JSON array of the missing required skills, preserving their original names from the required skills list. "
            "Output must be a valid JSON array without ```json or backticks. "
            "Example: "
            "CV skills: ['Python', 'SQL', 'Deep Learning'] "
            "Required skills: ['Database Management', 'Neural Networks', 'Data Pipelines'] "
            "Output: ['Data Pipelines'] "
            "\n\nCV Skills:\n" + json.dumps(cv_skills) +
            "\nRequired Skills:\n" + json.dumps(required_skills)
        )
        response = model.generate_content(prompt)
        raw_response = response.text.strip()
        logging.info(f"Gemini raw response for skill gap: {raw_response}")
        cleaned_response = raw_response
        if cleaned_response.startswith("```json\n"):
            cleaned_response = cleaned_response[8:].strip()
        elif cleaned_response.startswith("```json"):
            cleaned_response = cleaned_response[7:].strip()
        if cleaned_response.endswith("\n```"):
            cleaned_response = cleaned_response[:-4].strip()
        elif cleaned_response.endswith("```"):
            cleaned_response = cleaned_response[:-3].strip()
        logging.info(f"Cleaned Gemini response for skill gap: {cleaned_response}")
        try:
            missing_skills = json.loads(cleaned_response)
            if not isinstance(missing_skills, list):
                logging.error("Gemini response is not a JSON array")
                return []
            logging.info(f"Missing skills: {missing_skills}")
            return missing_skills
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse Gemini response: {cleaned_response}, Error: {str(e)}")
            return []
    except Exception as e:
        logging.error(f"Error in skill gap analysis: {str(e)}")
        return []



In [5]:
# Cell 9: Experience, Skill Weighting, and Ranking
def extract_experience(cv_text):
    try:
        # Regex for explicit years of experience
        year_patterns = [
            r"(\d+)\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|professional experience)",
            r"(\d+)\s*\+?\s*years?\s*(?:of\s*)?(?:experience|professional experience)",
            r"experience\s*:\s*(\d+)\s*years?"
        ]
        for pattern in year_patterns:
            match = re.search(pattern, cv_text, re.IGNORECASE)
            if match:
                years = int(match.group(1))
                logging.info(f"Extracted experience: {years} years")
                return years

        # Fallback to Gemini for ambiguous cases (e.g., "Senior Data Scientist since 2010")
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "Extract the total years of professional experience from the following CV text. "
            "Look for explicit mentions (e.g., '10 years of experience') or infer from job history (e.g., 'Senior Data Scientist since 2010'). "
            "Return a single integer representing total years of experience. If no experience is found, return 0. "
            "Output only the integer, nothing else. "
            "Current year is 2025. "
            "\n\nCV Text:\n" + cv_text[:2000]  # Limit to avoid token limits
        )
        response = model.generate_content(prompt)
        years = int(response.text.strip())
        logging.info(f"Gemini extracted experience: {years} years")
        return years
    except Exception as e:
        logging.error(f"Error extracting experience: {str(e)}")
        return 0

def assign_experience_weight(years):
    if years > 15:
        return 3  # High
    elif 8 <= years <= 15:
        return 2  # Medium
    elif 1 <= years <= 7:
        return 1  # Low
    return 0  # None or <1 year

def assign_skill_weights(job_role, required_skills):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        num_skills = len(required_skills)
        # Dynamic weights: highest is num_skills, lowest is 1
        prompt = (
            f"You are an expert in recruitment for the job role '{job_role}'. "
            f"Given the following required skills: {json.dumps(required_skills)}, "
            f"assign a weight to each skill based on its relevance to the job role. "
            f"Use integers from 1 (least relevant) to {num_skills} (most relevant), ensuring each skill gets a unique weight. "
            "Consider technical skills (e.g., 'Python' for Data Scientist) as more relevant than soft or unrelated skills (e.g., 'Hiking'). "
            "Return a JSON object mapping each skill to its weight. "
            "Output must be a valid JSON object without ```json or backticks. "
            "Example: "
            "Job role: Data Scientist "
            "Required skills: ['Python', 'Report Writing', 'Hiking'] "
            "Output: {\"Python\": 3, \"Report Writing\": 2, \"Hiking\": 1}"
            "\n\nRequired Skills:\n" + json.dumps(required_skills)
        )
        response = model.generate_content(prompt)
        raw_response = response.text.strip()
        logging.info(f"Gemini raw response for skill weights: {raw_response}")
        cleaned_response = raw_response
        if cleaned_response.startswith("```json\n"):
            cleaned_response = cleaned_response[8:].strip()
        elif cleaned_response.startswith("```json"):
            cleaned_response = cleaned_response[7:].strip()
        if cleaned_response.endswith("\n```"):
            cleaned_response = cleaned_response[:-4].strip()
        elif cleaned_response.endswith("```"):
            cleaned_response = cleaned_response[:-3].strip()
        logging.info(f"Cleaned Gemini response for skill weights: {cleaned_response}")
        try:
            weights = json.loads(cleaned_response)
            if not isinstance(weights, dict):
                logging.error("Gemini response is not a JSON object")
                return {skill: 1 for skill in required_skills}  # Fallback: equal weights
            logging.info(f"Skill weights: {weights}")
            return weights
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse Gemini response: {cleaned_response}, Error: {str(e)}")
            return {skill: 1 for skill in required_skills}
    except Exception as e:
        logging.error(f"Error assigning skill weights: {str(e)}")
        return {skill: 1 for skill in required_skills}

def rank_cvs(cv_folder, job_role):
    try:
        job_collection = connect_to_mongodb()
        required_skills = get_required_skills(job_collection, job_role)
        skill_weights = assign_skill_weights(job_role, required_skills)
        cv_results = []

        # Process each PDF CV in the folder
        for cv_file in os.listdir(cv_folder):
            if cv_file.lower().endswith('.pdf'):
                cv_path = os.path.join(cv_folder, cv_file)
                try:
                    # Extract text and skills
                    cv_text = extract_cv_text(cv_path)
                    skills_text = extract_skills_section(cv_text)
                    cv_skills = extract_skills_with_gemini(skills_text)
                    if not cv_skills:
                        logging.warning(f"No skills extracted from {cv_file}")
                        continue

                    # Extract experience
                    years = extract_experience(cv_text)
                    exp_weight = assign_experience_weight(years)

                    # Skill gap analysis
                    missing_skills = skill_gap_analysis(cv_skills, required_skills)

                    # Calculate score
                    skill_score = sum(skill_weights.get(skill, 0) for skill in cv_skills if skill in required_skills)
                    total_score = exp_weight + skill_score

                    cv_results.append({
                        "cv_file": cv_file,
                        "years_experience": years,
                        "experience_weight": exp_weight,
                        "cv_skills": cv_skills,
                        "required_skills": required_skills,
                        "missing_skills": missing_skills,
                        "skill_score": skill_score,
                        "total_score": total_score
                    })
                    logging.info(f"Processed {cv_file}: Score = {total_score}")
                except Exception as e:
                    logging.error(f"Error processing {cv_file}: {str(e)}")
                    continue

        # Rank CVs by total_score
        ranked_cvs = sorted(cv_results, key=lambda x: x["total_score"], reverse=True)
        logging.info(f"Ranked {len(ranked_cvs)} CVs")
        return ranked_cvs
    except Exception as e:
        logging.error(f"Error in CV ranking: {str(e)}")
        return []


In [6]:
# Cell 10: Run CV Ranking
cv_folder = r"E:\Individual Projects\Skill_gap_system\testcvs"  # Adjust to your CV folder
job_role = "Data Scientist"
ranked_cvs = rank_cvs(cv_folder, job_role)
print("Ranked CVs:")
for i, cv in enumerate(ranked_cvs, 1):
    print(f"Rank {i}: {cv['cv_file']}")
    print(f"  Years of Experience: {cv['years_experience']} (Weight: {cv['experience_weight']})")
    print(f"  CV Skills: {cv['cv_skills']}")
    print(f"  Missing Skills: {cv['missing_skills']}")
    print(f"  Skill Score: {cv['skill_score']}")
    print(f"  Total Score: {cv['total_score']}")
    print()

2025-04-17 21:25:42,380 - INFO - Connected to MongoDB Atlas
2025-04-17 21:25:42,688 - INFO - Required skills for Data Scientist: ['ETL', 'Excel', 'A/B Testing', 'Data Visualization', 'Big Data']
2025-04-17 21:25:43,814 - INFO - Gemini raw response for skill weights: {
  "Big Data": 5,
  "ETL": 4,
  "Data Visualization": 3,
  "A/B Testing": 2,
  "Excel": 1
}
2025-04-17 21:25:43,815 - INFO - Cleaned Gemini response for skill weights: {
  "Big Data": 5,
  "ETL": 4,
  "Data Visualization": 3,
  "A/B Testing": 2,
  "Excel": 1
}
2025-04-17 21:25:43,816 - INFO - Skill weights: {'Big Data': 5, 'ETL': 4, 'Data Visualization': 3, 'A/B Testing': 2, 'Excel': 1}
2025-04-17 21:25:44,106 - INFO - Extracted text from page 1 using pdfplumber
2025-04-17 21:25:44,108 - INFO - Extracted text from CV: K A N DA C E L O U D O R
DATA SCIENTIST
CONTACT WORK EXPERIENCE
kloudor@email.com Data Scientist
(12...
2025-04-17 21:25:44,110 - INFO - Extracted skills section: Spectrix Analytical Services March 2016 - Jun

Ranked CVs:
Rank 1: data-scientist-resume-example.pdf
  Years of Experience: 7 (Weight: 1)
  CV Skills: ['Python', 'NumPy', 'Pandas', 'Scikit-learn', 'Keras', 'Flask', 'SQL', 'MySQL', 'Postgres', 'Git', 'Time Series Forecasting', 'Productionizing Models', 'Recommendation Engines', 'AWS', 'Customer Segmentation', 'Cohort Analysis', 'Tableau', 'A/B Testing']
  Missing Skills: ['ETL', 'Excel', 'Data Visualization', 'Big Data']
  Skill Score: 2
  Total Score: 3

Rank 2: Nimantha_Kankanamge_Resume.pdf
  Years of Experience: 1 (Weight: 1)
  CV Skills: ['Statistical Analysis', 'R Studio', 'Teamwork', 'Collaboration', 'Data Analysis', 'Photography', 'Editing', 'Project Management', 'Environmental Science']
  Missing Skills: ['ETL', 'Excel', 'A/B Testing', 'Data Visualization', 'Big Data']
  Skill Score: 0
  Total Score: 1

Rank 3: Shashiprabha_Senanayake_Resume (3).pdf
  Years of Experience: 0 (Weight: 0)
  CV Skills: ['LSTM', 'Streamlit', 'Wix', 'Python', 'Machine Learning', 'Scikit-learn', '

In [None]:
##______________________________(2)Correct

In [7]:
# Cell 1: Install Dependencies
#%pip install pdfplumber pymongo google-generativeai dnspython pytesseract Pillow

# Cell 2: Import Libraries and Setup
import os
import json
import logging
import re
from datetime import datetime
from pymongo import MongoClient
from PIL import Image

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Hardcode credentials
GOOGLE_API_KEY = "AIzaSyBvRnSojVCuojgtGI7RisnW6-S4VpBYJWo"  # Your provided Gemini API key
MONGODB_URI = "mongodb+srv://shashi:VSXV9WDNmRvYnA7p@clusterskillgapanalysis.vnbcnju.mongodb.net/skillgapanalysis?retryWrites=true&w=majority"

# Import dependencies
try:
    import pdfplumber
except ImportError:
    raise ImportError("pdfplumber is not installed. Run: %pip install pdfplumber")

try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("google-generativeai is not installed. Run: %pip install google-generativeai")

try:
    import pytesseract
    # Explicitly set Tesseract path (adjust if installed elsewhere)
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
except ImportError:
    raise ImportError("pytesseract or PIL is not installed. Run: %pip install pytesseract Pillow")

# Cell 3: MongoDB Connection
def connect_to_mongodb():
    try:
        client = MongoClient(MONGODB_URI)
        db = client["skillgapanalysis"]
        collection = db["jobrole_skill"]
        collection.create_index("Job_Role")
        logging.info("Connected to MongoDB Atlas")
        return collection
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {str(e)}")
        raise ValueError(f"Failed to connect to MongoDB: {str(e)}")

# Cell 4: Enhanced CV Text Extraction
def extract_cv_text(cv_path):
    try:
        with pdfplumber.open(cv_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n\n"
                    logging.info(f"Extracted text from page {page.page_number} using pdfplumber")
                else:
                    logging.warning(f"No text extracted from page {page.page_number}. Using OCR.")
                    try:
                        img = page.to_image().original
                        ocr_text = pytesseract.image_to_string(img)
                        text += ocr_text + "\n\n"
                        logging.info(f"Extracted OCR text from page {page.page_number}")
                    except Exception as ocr_e:
                        logging.error(f"OCR failed for page {page.page_number}: {str(ocr_e)}")
                        text += "\n\n"
            if not text.strip():
                raise ValueError("No text extracted from CV. Check PDF format or Tesseract installation.")
            logging.info(f"Extracted text from CV: {text[:100]}...")
            return text
    except pdfplumber.pdfminer.pdfdocument.PDFPasswordIncorrect:
        logging.error("PDF is password-protected. Provide the password or use an unprotected PDF.")
        raise ValueError("PDF is password-protected")
    except Exception as e:
        logging.error(f"Error reading CV: {str(e)}")
        raise ValueError(f"Error reading CV: {str(e)}")

# Cell 5: Skills Section Extraction
def extract_skills_section(cv_text):
    try:
        match = re.search(
            r"(skills|technical skills|key skills|core competencies):?\s*(.*?)(?=\n\s*(experience|education|projects|contact|certifications|references|$|\n\n))",
            cv_text, re.IGNORECASE | re.DOTALL
        )
        if match:
            skills_text = match.group(2).strip()
            skills_lines = [line.strip() for line in skills_text.split('\n') if line.strip() and not re.match(r'^\s*$', line)]
            cleaned_skills = ' '.join(skills_lines).strip()
            logging.info(f"Extracted skills section: {cleaned_skills}")
            return cleaned_skills if cleaned_skills else cv_text
        logging.warning("No explicit skills section found. Using full CV text for Gemini.")
        return cv_text
    except Exception as e:
        logging.error(f"Error extracting skills section: {str(e)}")
        return cv_text

# Cell 6: Gemini Skill Extraction
def extract_skills_with_gemini(cv_text):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "You are an expert in CV analysis. Extract all skills from the following CV text, including: "
            "1. Explicit skills listed in a 'Skills', 'Technical Skills', or similar section (e.g., 'Python, TensorFlow'). "
            "2. Implicit skills inferred from 'Projects', 'Experience', or similar sections (e.g., 'Built a fraud detection model using scikit-learn' implies 'scikit-learn'). "
            "Return a plain JSON array of clean, distinct skills (e.g., capitalize 'PyTorch', 'AWS SageMaker'). "
            "Combine similar skills (e.g., 'ML' and 'Machine Learning' as 'Machine Learning'). "
            "Return [] if no skills are found or input is empty. "
            "Output must be a valid JSON array without ```json, backticks, or any other formatting. "
            "Example: Input: 'Skills: Python, ML\nProjects: Built a model using TensorFlow' "
            "Output: [\"Python\", \"Machine Learning\", \"TensorFlow\"]"
            "\n\nCV Text:\n" + cv_text
        )
        response = model.generate_content(prompt)
        raw_response = response.text.strip()
        logging.info(f"Gemini raw response: {raw_response}")
        cleaned_response = raw_response
        if cleaned_response.startswith("```json\n"):
            cleaned_response = cleaned_response[8:].strip()
        elif cleaned_response.startswith("```json"):
            cleaned_response = cleaned_response[7:].strip()
        if cleaned_response.endswith("\n```"):
            cleaned_response = cleaned_response[:-4].strip()
        elif cleaned_response.endswith("```"):
            cleaned_response = cleaned_response[:-3].strip()
        logging.info(f"Cleaned Gemini response: {cleaned_response}")
        try:
            skills = json.loads(cleaned_response)
            if not isinstance(skills, list):
                logging.error("Gemini response is not a JSON array")
                return []
            logging.info(f"Extracted skills: {skills}")
            return skills
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse cleaned Gemini response as JSON: {cleaned_response}, Error: {str(e)}")
            return []
    except Exception as e:
        logging.error(f"Error in Gemini skill extraction: {str(e)}")
        return []

# Cell 7: MongoDB Query
def get_required_skills(job_collection, job_role):
    try:
        job_doc = job_collection.find_one({"Job_Role": job_role})
        if not job_doc:
            raise ValueError(f"Job role '{job_role}' not found in database")
        required_skills = [skill.strip() for skill in job_doc["Required_Skills"].split(",")]
        logging.info(f"Required skills for {job_role}: {required_skills}")
        return required_skills
    except Exception as e:
        logging.error(f"Error retrieving required skills: {str(e)}")
        raise ValueError(f"Error retrieving required skills: {str(e)}")

# Cell 8: Skill Gap Analysis with Gemini Semantic Similarity
def skill_gap_analysis(cv_skills, required_skills):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "You are an expert in skill gap analysis. Given two lists of skills: "
            "1. CV skills (from a candidate's CV). "
            "2. Required skills (for a job role). "
            "Identify which required skills are missing from the CV skills, considering semantic similarity and synonyms. "
            "Treat skills as equivalent if they have the same or similar meaning (e.g., 'ML' ≈ 'Machine Learning', "
            "'SQL' ≈ 'Database Management', 'Statistical Analysis' ≈ 'Statistics', 'Deep Learning' ≈ 'Neural Networks'). "
            "Return a plain JSON array of the missing required skills, preserving their original names from the required skills list. "
            "Output must be a valid JSON array without ```json or backticks. "
            "Example: "
            "CV skills: ['Python', 'SQL', 'Deep Learning'] "
            "Required skills: ['Database Management', 'Neural Networks', 'Data Pipelines'] "
            "Output: ['Data Pipelines'] "
            "\n\nCV Skills:\n" + json.dumps(cv_skills) +
            "\nRequired Skills:\n" + json.dumps(required_skills)
        )
        response = model.generate_content(prompt)
        raw_response = response.text.strip()
        logging.info(f"Gemini raw response for skill gap: {raw_response}")
        cleaned_response = raw_response
        if cleaned_response.startswith("```json\n"):
            cleaned_response = cleaned_response[8:].strip()
        elif cleaned_response.startswith("```json"):
            cleaned_response = cleaned_response[7:].strip()
        if cleaned_response.endswith("\n```"):
            cleaned_response = cleaned_response[:-4].strip()
        elif cleaned_response.endswith("```"):
            cleaned_response = cleaned_response[:-3].strip()
        logging.info(f"Cleaned Gemini response for skill gap: {cleaned_response}")
        try:
            missing_skills = json.loads(cleaned_response)
            if not isinstance(missing_skills, list):
                logging.error("Gemini response is not a JSON array")
                return []
            logging.info(f"Missing skills: {missing_skills}")
            return missing_skills
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse Gemini response: {cleaned_response}, Error: {str(e)}")
            return []
    except Exception as e:
        logging.error(f"Error in skill gap analysis: {str(e)}")
        return []

# Cell 9: Functions for Experience, Skill Weighting, and Ranking
def extract_experience(cv_text):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "You are an expert in CV analysis. Extract the total years of professional experience from the following CV text by analyzing the 'Experience' or 'Work Experience' section. "
            "Identify all job entries and their date ranges (e.g., 'March 2016 - June 2018'). "
            "For each job, calculate the duration in months: (end_year - start_year)*12 + (end_month - start_month). "
            "If the end date is 'Present', use April 2025 as the end date. "
            "If a date is ambiguous (e.g., only year provided), assume January for start month and December for end month. "
            "If a job has no clear date range, estimate 1 year if it seems like a short-term role (e.g., intern), else skip it. "
            "Sum the durations of all jobs, convert to years (divide by 12, round to nearest integer), and return a single integer. "
            "If no experience or dates are found, return 0. "
            "Output only the integer, nothing else. "
            "Example: "
            "Input: 'Data Scientist, March 2016 - June 2018\nAnalyst, April 2015 - March 2016' "
            "Calculation: (2018-2016)*12 + (6-3) = 27 months; (2016-2015)*12 + (3-4) = 11 months; Total = 38/12 ≈ 3 years "
            "Output: 3 "
            "\n\nCV Text:\n" + cv_text
        )
        response = model.generate_content(prompt)
        years = int(response.text.strip())
        logging.info(f"Gemini extracted experience: {years} years")
        return years
    except Exception as e:
        logging.error(f"Error extracting experience: {str(e)}")
        return 0

def assign_experience_weight(years):
    if years > 15:
        return 3  # High
    elif 8 <= years <= 15:
        return 2  # Medium
    elif 1 <= years <= 7:
        return 1  # Low
    return 0  # None or <1 year

def assign_skill_weights(job_role, required_skills):
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel("gemini-1.5-flash")
        num_skills = len(required_skills)
        prompt = (
            f"You are an expert in recruitment for the job role '{job_role}'. "
            f"Given the following required skills: {json.dumps(required_skills)}, "
            f"assign a weight to each skill based on its relevance to the job role. "
            f"Use integers from 1 (least relevant) to {num_skills} (most relevant), ensuring each skill gets a unique weight. "
            "Consider technical skills (e.g., 'Python' for Data Scientist) as more relevant than soft or unrelated skills (e.g., 'Hiking'). "
            "Return a JSON object mapping each skill to its weight. "
            "Output must be a valid JSON object without ```json or backticks. "
            "Example: "
            "Job role: Data Scientist "
            "Required skills: ['Python', 'Report Writing', 'Hiking'] "
            "Output: {\"Python\": 3, \"Report Writing\": 2, \"Hiking\": 1}"
            "\n\nRequired Skills:\n" + json.dumps(required_skills)
        )
        response = model.generate_content(prompt)
        raw_response = response.text.strip()
        logging.info(f"Gemini raw response for skill weights: {raw_response}")
        cleaned_response = raw_response
        if cleaned_response.startswith("```json\n"):
            cleaned_response = cleaned_response[8:].strip()
        elif cleaned_response.startswith("```json"):
            cleaned_response = cleaned_response[7:].strip()
        if cleaned_response.endswith("\n```"):
            cleaned_response = cleaned_response[:-4].strip()
        elif cleaned_response.endswith("```"):
            cleaned_response = cleaned_response[:-3].strip()
        logging.info(f"Cleaned Gemini response for skill weights: {cleaned_response}")
        try:
            weights = json.loads(cleaned_response)
            if not isinstance(weights, dict):
                logging.error("Gemini response is not a JSON object")
                return {skill: 1 for skill in required_skills}  # Fallback: equal weights
            logging.info(f"Skill weights: {weights}")
            return weights
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse Gemini response: {cleaned_response}, Error: {str(e)}")
            return {skill: 1 for skill in required_skills}
    except Exception as e:
        logging.error(f"Error assigning skill weights: {str(e)}")
        return {skill: 1 for skill in required_skills}

def rank_cvs(cv_folder, job_role):
    try:
        job_collection = connect_to_mongodb()
        required_skills = get_required_skills(job_collection, job_role)
        skill_weights = assign_skill_weights(job_role, required_skills)
        cv_results = []

        for cv_file in os.listdir(cv_folder):
            if cv_file.lower().endswith('.pdf'):
                cv_path = os.path.join(cv_folder, cv_file)
                try:
                    cv_text = extract_cv_text(cv_path)
                    skills_text = extract_skills_section(cv_text)
                    cv_skills = extract_skills_with_gemini(skills_text)
                    if not cv_skills:
                        logging.warning(f"No skills extracted from {cv_file}")
                        continue

                    years = extract_experience(cv_text)
                    exp_weight = assign_experience_weight(years)
                    missing_skills = skill_gap_analysis(cv_skills, required_skills)
                    skill_score = sum(skill_weights.get(skill, 0) for skill in cv_skills if skill in required_skills)
                    total_score = exp_weight + skill_score

                    cv_results.append({
                        "cv_file": cv_file,
                        "years_experience": years,
                        "experience_weight": exp_weight,
                        "cv_skills": cv_skills,
                        "required_skills": required_skills,
                        "missing_skills": missing_skills,
                        "skill_score": skill_score,
                        "total_score": total_score
                    })
                    logging.info(f"Processed {cv_file}: Score = {total_score}")
                except Exception as e:
                    logging.error(f"Error processing {cv_file}: {str(e)}")
                    continue

        ranked_cvs = sorted(cv_results, key=lambda x: x["total_score"], reverse=True)
        logging.info(f"Ranked {len(ranked_cvs)} CVs")
        return ranked_cvs
    except Exception as e:
        logging.error(f"Error in CV ranking: {str(e)}")
        return []



In [8]:
# Cell 10: Run CV Ranking
cv_folder = r"E:\Individual Projects\Skill_gap_system\testcvs"  # Adjust to your CV folder
job_role = "Data Scientist"
ranked_cvs = rank_cvs(cv_folder, job_role)
print("Ranked CVs:")
for i, cv in enumerate(ranked_cvs, 1):
    print(f"Rank {i}: {cv['cv_file']}")
    print(f"  Years of Experience: {cv['years_experience']} (Weight: {cv['experience_weight']})")
    print(f"  CV Skills: {cv['cv_skills']}")
    print(f"  Missing Skills: {cv['missing_skills']}")
    print(f"  Skill Score: {cv['skill_score']}")
    print(f"  Total Score: {cv['total_score']}")
    print()

2025-04-17 21:39:47,848 - INFO - Connected to MongoDB Atlas
2025-04-17 21:39:48,155 - INFO - Required skills for Data Scientist: ['ETL', 'Excel', 'A/B Testing', 'Data Visualization', 'Big Data']
2025-04-17 21:39:49,383 - INFO - Gemini raw response for skill weights: {
  "Big Data": 5,
  "ETL": 4,
  "Data Visualization": 3,
  "A/B Testing": 2,
  "Excel": 1
}
2025-04-17 21:39:49,384 - INFO - Cleaned Gemini response for skill weights: {
  "Big Data": 5,
  "ETL": 4,
  "Data Visualization": 3,
  "A/B Testing": 2,
  "Excel": 1
}
2025-04-17 21:39:49,385 - INFO - Skill weights: {'Big Data': 5, 'ETL': 4, 'Data Visualization': 3, 'A/B Testing': 2, 'Excel': 1}
2025-04-17 21:39:49,662 - INFO - Extracted text from page 1 using pdfplumber
2025-04-17 21:39:49,663 - INFO - Extracted text from CV: K A N DA C E L O U D O R
DATA SCIENTIST
CONTACT WORK EXPERIENCE
kloudor@email.com Data Scientist
(12...
2025-04-17 21:39:49,665 - INFO - Extracted skills section: Spectrix Analytical Services March 2016 - Jun

Ranked CVs:
Rank 1: data-scientist-resume-example.pdf
  Years of Experience: 4 (Weight: 1)
  CV Skills: ['Python', 'NumPy', 'Pandas', 'Scikit-learn', 'Keras', 'Flask', 'SQL', 'MySQL', 'Postgres', 'Git', 'Time Series Forecasting', 'Productionizing Models', 'Recommendation Engines', 'AWS', 'Tableau', 'Cohort Analysis', 'A/B Testing', 'Pricing Experiments', 'Customer Lifetime Value', 'Data Analysis', 'Random Forest', 'Customer Segmentation']
  Missing Skills: ['ETL', 'Excel', 'Data Visualization', 'Big Data']
  Skill Score: 2
  Total Score: 3

Rank 2: Shashiprabha_Senanayake_Resume (3).pdf
  Years of Experience: 1 (Weight: 1)
  CV Skills: ['LSTM', 'Streamlit', 'Wix', 'Python', 'scikit-learn', 'TensorFlow', 'OpenCV', 'Machine Learning', 'Deep Learning', 'Power BI', 'R', 'R Shiny', 'SPSS', 'Excel', 'NLTK', 'Natural Language Toolkit', 'Data Analysis', 'Statistics', 'Mathematics', 'Computer Science', 'IT']
  Missing Skills: ['ETL', 'A/B Testing', 'Data Visualization', 'Big Data']
  Skill Scor