In [1]:
import os
import requests
import pymongo
import re

# MongoDB connection
# The connection string embeds the database username and password, so it is
# read from the MONGODB_URI environment variable instead of being written in
# the source. Any credential that was ever committed in this file should be
# rotated.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )
mongo_client = pymongo.MongoClient(mongo_uri)
db = mongo_client["Resume"]  # Replace with your database name
collection = db["link"]  # Replace with your collection name

# Function to download the PDF file from Google Drive link
def download_drive_file(file_id, filename, timeout=30):
    """Download a Google Drive file to ``filename`` given its file ID.

    Args:
        file_id: Identifier of the file, sent as the ``id`` query parameter
            of the Drive ``uc?export=download`` endpoint.
        filename: Path of the local file to write (opened in binary mode).
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the whole batch.

    Returns:
        True if the response had status 200 and was written to ``filename``;
        False if the status was different or the request failed.
    """
    try:
        # Passing params lets requests URL-encode the ID; the context manager
        # releases the streamed connection even if writing fails.
        with requests.get(
            "https://drive.google.com/uc",
            params={"export": "download", "id": file_id},
            stream=True,
            timeout=timeout,
        ) as response:
            if response.status_code != 200:
                print(f"Failed to download file. Status code: {response.status_code}")
                return False

            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.RequestException as exc:
        # A single unreachable link should not abort the remaining downloads.
        print(f"Failed to download file. Error: {exc}")
        return False

    print(f"File {filename} downloaded successfully!")
    return True

# Create a directory to save downloaded resumes (if it doesn't exist)
output_dir = "downloaded_resumes"
os.makedirs(output_dir, exist_ok=True)


def _extract_drive_file_id(link):
    """Return the Google Drive file ID embedded in ``link``.

    Recognizes an ``id=<file_id>`` query parameter (stopping at the next
    ``&``) and the ``/d/<file_id>/`` path form. If neither pattern is
    present, falls back to everything after the last ``id=``.
    """
    match = re.search(r"[?&]id=([A-Za-z0-9_-]+)", link) or re.search(
        r"/d/([A-Za-z0-9_-]+)", link
    )
    if match:
        return match.group(1)
    return link.split("id=")[-1]


# Fetch documents from MongoDB
documents = collection.find({}, {"Attach Your CV": 1, "Name": 1})  # Fetch the "Attach Your CV" and "Name" fields

# Loop through the MongoDB documents and download the resumes
for doc in documents:
    file_link = doc.get("Attach Your CV")  # Get the Google Drive file link
    name = doc.get("Name")  # Get the name field to be used for the filename

    # Skip records that are missing either the link or the name
    if not (file_link and name):
        continue

    # Extract file ID from the Google Drive link
    file_id = _extract_drive_file_id(str(file_link))

    # Clean the name to be used as the filename (every character outside
    # letters, digits, "_" and "-" becomes an underscore)
    filename = f"{re.sub(r'[^a-zA-Z0-9_-]', '_', str(name))}.pdf"
    file_path = os.path.join(output_dir, filename)

    # Download the file
    download_drive_file(file_id, file_path)
import os
import re
import logging
import pdfplumber
from docx import Document

# Setup logging for error tracking
# WARNING (not ERROR) so that warnings emitted via logging.warning, such as
# reports of unsupported file types, are written to the log file as well.
logging.basicConfig(filename="resume_parser_errors.log", level=logging.WARNING)

# Define internship roles
# Each entry maps a role key to its display title ("jobTitle"), the degree
# tokens accepted for it ("education"), and the lower-case skill keywords
# that are searched for in resume text ("skills").
internshipRoles = dict(
    GlobalBusinessAnalyst=dict(
        jobTitle="Global Business Analyst",
        education=["MBA"],
        skills=[
            "business analysis",
            "data interpretation",
            "financial modeling",
            "market strategy",
            "excel",
            "power bi",
        ],
    ),
    MBAIntern=dict(
        jobTitle="Global Business Developer Intern",
        education=["MBA"],
        skills=[
            "market research",
            "sales",
            "business development",
            "excel",
            "google analytics",
        ],
    ),
    CyberSecurity=dict(
        jobTitle="Junior Pentester",
        education=["B.Sc", "CSE", "BCA", "M.Sc", "MCA"],
        skills=[
            "penetration testing",
            "ethical hacking",
            "networking security",
            "python",
            "bash",
        ],
    ),
    SoftwareDevelopment=dict(
        jobTitle="Software Developer Intern",
        education=["B.Sc", "CSE", "BCA", "M.Sc", "MCA"],
        skills=[
            "javascript",
            "react",
            "angular",
            "git",
            "c++",
            "java",
            "python",
        ],
    ),
    AIandML=dict(
        jobTitle="AI/ML Intern",
        education=["B.Sc", "CSE", "BCA", "M.Sc", "MCA"],
        skills=[
            "python",
            "machine learning",
            "tensorflow",
            "pytorch",
            "data preprocessing",
        ],
    ),
)

# Function to extract text from resumes
def extract_text(file_path):
    """Return the stripped text of a ``.pdf`` or ``.docx`` resume.

    The file type is chosen from the extension, compared case-insensitively
    so that names such as ``CV.PDF`` are handled like ``cv.pdf``.

    Args:
        file_path: Path of the resume file.

    Returns:
        The extracted text with surrounding whitespace removed, or an empty
        string when the extension is unsupported or extraction raises.
        Failures are logged rather than propagated so one bad file does not
        stop a batch.
    """
    extension_source = file_path.lower()
    try:
        if extension_source.endswith('.pdf'):
            with pdfplumber.open(file_path) as pdf:
                # extract_text() may yield None for a page; treat it as empty
                text = "\n".join(page.extract_text() or '' for page in pdf.pages)
            return text.strip()
        if extension_source.endswith('.docx'):
            doc = Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs).strip()

        logging.warning("Unsupported file type: %s", file_path)
        return ""
    except Exception as e:
        logging.error("Error extracting text from %s: %s", file_path, e)
        return ""

# Function to process resumes in a folder
def process_resumes(folder_path):
    """Extract text from every entry in ``folder_path``.

    Each directory entry is passed to ``extract_text``; entries whose text is
    empty after stripping whitespace are left out.

    Returns:
        A list of ``{"file_name": ..., "content": ...}`` dicts, in the order
        ``os.listdir`` yields the entries.
    """
    collected = []
    for entry in os.listdir(folder_path):
        text = extract_text(os.path.join(folder_path, entry))
        if not text.strip():
            continue
        collected.append({"file_name": entry, "content": text})
    return collected

# Function to parse resume content
def parse_resume_content(resume_content):
    """Pull name, email, skills, college and degrees out of resume text.

    All fields are found with regular-expression heuristics, so results are
    best-effort.

    Args:
        resume_content: Plain text of one resume.

    Returns:
        A dict with keys:
            "name", "email", "college": the matched string, or "Not Found".
            "skills": sorted list of the ``internshipRoles`` skill keywords
                (lower-case) that occur in the text as whole terms.
            "education": sorted list of degree tokens found, normalized to a
                single canonical spelling (e.g. "mba" -> "MBA").
    """
    # Canonical spelling for each degree token matched case-insensitively
    # below, so "mba", "Mba" and "MBA" all collapse to one value.
    canonical_degrees = {
        "mba": "MBA",
        "b.sc": "B.Sc",
        "m.sc": "M.Sc",
        "bca": "BCA",
        "mca": "MCA",
        "cse": "CSE",
        "b.tech": "B.Tech",
        "m.tech": "M.Tech",
        "ph.d": "Ph.D",
        "bachelor": "Bachelor",
        "master": "Master",
    }

    def extract_name(text):
        # NOTE(review): the patterns below are combined with re.IGNORECASE in
        # the first two steps, so "[A-Z]" also accepts lower-case letters
        # there; the heuristics are kept as they were.

        # Pattern 1: Labeled names (e.g., "Name: John Doe")
        name_label_matches = re.search(
            r"(?:Name|Full Name|Candidate Name|Applicant Name):?\s*(?:Mr\.|Ms\.|Mrs\.|Dr\.)?\s*([A-Z][a-zA-Z.' -]+(?:\s[A-Z][a-zA-Z.' -]+)*)",
            text, re.MULTILINE | re.IGNORECASE
        )
        if name_label_matches:
            return name_label_matches.group(1).strip()

        # Pattern 2: first of the leading 15 lines that looks like a bare name
        # (letters, dots, apostrophes, hyphens, optional title)
        for line in text.splitlines()[:15]:
            line = line.strip()
            if re.match(r"^(?:Mr\.|Ms\.|Mrs\.|Dr\.)?\s*[A-Z][a-zA-Z.' -]+(?:\s[A-Z][a-zA-Z.' -]+)*$", line, re.IGNORECASE):
                return line

        # Pattern 3: General name extraction from the entire text
        general_name_matches = re.findall(
            r"\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)?\s*[A-Z][a-zA-Z.' -]+(?:\s[A-Z][a-zA-Z.' -]+)*\b", text
        )
        if general_name_matches:
            return general_name_matches[0].strip()

        # Pattern 4: Fallback to any two or three consecutive capitalized words
        fallback_matches = re.findall(
            r"\b[A-Z][a-zA-Z.' -]+\s[A-Z][a-zA-Z.' -]+(?:\s[A-Z][a-zA-Z.' -]+)?\b", text
        )
        if fallback_matches:
            return fallback_matches[0].strip()

        return "Not Found"

    def extract_email(text):
        email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
        return email_match.group(0) if email_match else "Not Found"

    def extract_skills(text):
        lowered = text.lower()  # lower-case once instead of once per keyword
        skills_keywords = {skill.lower() for role in internshipRoles.values() for skill in role["skills"]}
        # Require that the keyword is not glued to other letters, so "excel"
        # does not fire on "excellent" nor "java" on "javascript". Digits and
        # punctuation are allowed as neighbours ("python3", "c++,").
        found_skills = [
            skill for skill in skills_keywords
            if re.search(r"(?<![a-z])" + re.escape(skill) + r"(?![a-z])", lowered)
        ]
        return sorted(found_skills)  # unique (from a set) and deterministic

    def extract_college(text):
        college_match = re.search(
            r"(?:Studied at|Graduated from|University of|College of|Education)\s*([A-Za-z .'-]+)",
            text, re.IGNORECASE
        )
        return college_match.group(1).strip() if college_match else "Not Found"

    def extract_education(text):
        education_matches = re.findall(
            r"\b(MBA|B\.Sc|M\.Sc|BCA|MCA|CSE|B\.Tech|M\.Tech|Ph\.D|Bachelor|Master)\b",
            text, re.IGNORECASE
        )
        # Normalize casing so later comparisons against role requirements
        # do not depend on how the resume spelled the degree.
        return sorted({canonical_degrees.get(match.lower(), match) for match in education_matches})

    # Call extraction functions
    return {
        "name": extract_name(resume_content),
        "email": extract_email(resume_content),
        "skills": extract_skills(resume_content),
        "college": extract_college(resume_content),
        "education": extract_education(resume_content),
    }

# Function to match resume to internship roles
def match_internship_role(parsed_info):
    """Rank the roles in ``internshipRoles`` against a parsed resume.

    A role scores one point per skill it shares with the resume, plus two
    points if any of its accepted degrees appears in the resume's education
    list. Skills and degrees are compared case-insensitively, because degree
    tokens may arrive in whatever casing the resume used.

    Args:
        parsed_info: Dict with "skills" and "education" lists of strings.

    Returns:
        Up to two ``(jobTitle, score)`` tuples with a positive score, highest
        first, or ``[("No suitable match", 0)]`` when no role scores.
    """
    candidate_skills = {skill.lower() for skill in parsed_info["skills"]}
    candidate_education = {edu.lower() for edu in parsed_info["education"]}

    matched_roles = []
    for details in internshipRoles.values():
        role_skills = {skill.lower() for skill in details["skills"]}
        skills_score = len(role_skills & candidate_skills)
        has_degree = any(edu.lower() in candidate_education for edu in details["education"])
        education_score = 2 if has_degree else 0
        total_score = skills_score + education_score

        if total_score > 0:
            matched_roles.append((details["jobTitle"], total_score))

    # Stable sort: roles with equal scores keep their declaration order
    matched_roles.sort(key=lambda x: x[1], reverse=True)
    return matched_roles[:2] if matched_roles else [("No suitable match", 0)]

# Function to calculate process accuracy (heuristic-based)
def calculate_process_accuracy(parsed_info):
    """Score how completely a resume was parsed, as a percentage.

    This is a field-presence heuristic, not a comparison against ground
    truth. Weights: name 0.3, email 0.3, skills 0.2, education 0.2.

    Args:
        parsed_info: Dict with "name", "email", "skills" and "education".

    Returns:
        The sum of the weights of the fields that look valid, times 100.
    """
    # (weight, passed) pairs, in the order the weights are accumulated
    field_checks = (
        (0.3, parsed_info["name"] != "Not Found"),
        (0.3, bool(re.match(r"[^@]+@[^@]+\.[^@]+", parsed_info["email"]))),
        (0.2, len(parsed_info["skills"]) > 0),
        (0.2, len(parsed_info["education"]) > 0),
    )

    weighted_total = 0.0
    for weight, passed in field_checks:
        weighted_total = weighted_total + (1 if passed else 0) * weight

    return weighted_total * 100  # Convert to percentage

# Main function to run the parser and calculate process accuracy
def main(folder_path):
    """Parse every resume in ``folder_path`` and print a report for each.

    For each resume the parsed fields, the matched roles and the
    per-resume process accuracy are printed; the mean accuracy over all
    resumes (0 when there are none) is printed at the end.
    """
    per_resume_scores = []

    for entry in process_resumes(folder_path):
        info = parse_resume_content(entry["content"])
        top_roles = match_internship_role(info)

        # Calculate process accuracy for this resume
        score = calculate_process_accuracy(info)
        per_resume_scores.append(score)

        # Assemble the report first, then emit it in a single print call
        report = [
            f"\nResume: {entry['file_name']}",
            f"Name: {info['name']}",
            f"Email: {info['email']}",
            f"Skills: {', '.join(info['skills'])}",
            f"College: {info['college']}",
            f"Education: {', '.join(info['education'])}",
            "Matched Roles:",
        ]
        report.extend(f" - {title} (Score: {points})" for title, points in top_roles)
        report.append(f"Process Accuracy: {score:.2f}%")
        print("\n".join(report))

    # Calculate overall process accuracy
    if per_resume_scores:
        overall_accuracy = sum(per_resume_scores) / len(per_resume_scores)
    else:
        overall_accuracy = 0
    print(f"\nOverall Process Accuracy: {overall_accuracy:.2f}%")

# Run the parser if the script is executed
# (the guard keeps the parser from running when this file is imported).
if __name__ == "__main__":
    # Same directory name that the download step uses as its output_dir.
    folder_path = "downloaded_resumes"  # Replace with the actual folder path
    main(folder_path)

File downloaded_resumes/A_Viswa_.pdf downloaded successfully!
File downloaded_resumes/VIGNASH_M_.pdf downloaded successfully!
File downloaded_resumes/Gopi_Santhosh_.pdf downloaded successfully!
File downloaded_resumes/Rajadurai_k.pdf downloaded successfully!
File downloaded_resumes/Mohamed_Farwas_.pdf downloaded successfully!
File downloaded_resumes/SURUTHI_S.pdf downloaded successfully!
File downloaded_resumes/SURYAGAYATRI_M_R.pdf downloaded successfully!
File downloaded_resumes/Manimegalai_T_.pdf downloaded successfully!
File downloaded_resumes/Vidhiyasri_S.pdf downloaded successfully!
File downloaded_resumes/Bavadarani_M.pdf downloaded successfully!
File downloaded_resumes/Hanlala_Ibrahim_M_G_.pdf downloaded successfully!
File downloaded_resumes/Mohamed_Yasar.pdf downloaded successfully!
File downloaded_resumes/Darshan_R.pdf downloaded successfully!
File downloaded_resumes/Arunkumar_.pdf downloaded successfully!
File downloaded_resumes/SAHANA_L.pdf downloaded successfully!
File down