In [1]:

from pymongo.mongo_client import MongoClient
from dotenv import load_dotenv
import os

load_dotenv()


uri = os.getenv("MONGODB_URI")
# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)


Pinged your deployment. You successfully connected to MongoDB!


In [2]:
# from db import db
from bson.objectid import ObjectId
db = client["ResumeParser"]
resumes = db["resumes"]

def insert_resume(email, resume_data):
    return resumes.insert_one({"email": email, "resume": resume_data}).inserted_id

def get_resumes_by_email(email):
    return list(resumes.find({"email": email}))

def get_resume_by_id(resume_id):
    return resumes.find_one({"_id": ObjectId(resume_id)})


In [12]:
insert_resume("praveentak715@gmail.com", "account123")

ObjectId('68734ea3aa1b389788fec498')

In [3]:

import bcrypt

users = db["users"]

def register_user(email, password):
    if users.find_one({"email": email}):
        return False, "Email already exists"
    hashed = bcrypt.hashpw(password.encode(), bcrypt.gensalt())
    users.insert_one({"email": email, "password": hashed})
    return True, "User registered"

def validate_login(email, password):
    user = users.find_one({"email": email})
    if user and bcrypt.checkpw(password.encode(), user["password"]):
        return True
    return False

def get_user_by_email(email):
    if users.find_one({"email": email}):
        return True, "Email already exists"
    else:
        return False

In [4]:
get_user_by_email("praveentak715@gmail.com")

(True, 'Email already exists')

In [5]:
register_user("praveentak715@gmail.com", "account123")

(False, 'Email already exists')

In [6]:
validate_login("praveentak715@gmail.com", "account123")

True

In [10]:
import bcrypt

def hash_password(password):
    return bcrypt.hashpw(password.encode(), bcrypt.gensalt())

def verify_password(password, hashed):
    return bcrypt.checkpw(password.encode(), hashed)

In [None]:
import pandas as pd
import csv
import os
import docx2txt
import PyPDF2
import json
import re
from nltk.tokenize import blankline_tokenize
import json

class ResumeParser:
    def __init__(self, file):
        self.SECTION_KEYWORDS = {
            "personal_info": [
                "Personal Information", "Contact Information", "Contact Details", "Profile", "About Me", "Bio"
            ],
            "objective": [
                "Objective", "Career Objective", "Professional Summary", "Summary", "Profile Summary", "Career Summary", "Personal Statement", "Executive Summary"
            ],
            "skills": [
                "Skills", "Key Skills", "Technical Skills", "Core Competencies", "Competencies", "Areas of Expertise", "Technical Proficiencies"
            ],
            "experience": [
                "Work Experience", "Professional Experience", "Employment History", "Experience", "Relevant Experience", "Work History", "Career History"
            ],
            "education": [
                "Education", "Educational Qualifications", "Academic Background", "Academic History", "Academic Qualifications", "Educational Background"
            ],
            "certifications": [
                "Certifications", "Certificates", "Professional Certifications", "Licenses", "Training"
            ],
            "projects": [
                "Projects", "Key Projects", "Project Experience", "Significant Projects", "Relevant Projects"
            ],
            "achievements": [
                "Achievements", "Accomplishments", "Key Achievements", "Awards and Honors", "Honors", "Recognitions"
            ],
            "languages": [
                "Languages", "Language Proficiency", "Language Skills"
            ],
            "interests": [
                "Interests", "Hobbies", "Personal Interests", "Extra-Curricular Activities"
            ],
            "references": [
                "References", "Referees"
            ],
            "publications": [
                "Publications", "Research Publications", "Papers", "Articles"
            ],
            "volunteer": [
                "Volunteer Experience", "Volunteering", "Community Involvement", "Social Service"
            ],
            "additional": [
                "Additional Information", "Other Information", "Miscellaneous"
            ],
            "development": [
                "Professional Development", "Continuing Education", "Workshops", "Seminars", "Conferences"
            ]
        }

        self.file = file

    def data_ingestion(self):
        # Check the file extension to determine how to read the file
        _, file_extension = os.path.splitext(self.file)

        if file_extension == '.txt':
            with open(self.file, "r", encoding="utf-8") as file:
                content = file.read()
        
        elif file_extension == '.docx':
            content = docx2txt.process(self.file)
        
        elif file_extension == '.pdf':
            content = ""
            with open(self.file, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                for page in reader.pages:
                    content += page.extract_text() + "\n"
        
        else:
            raise ValueError("Unsupported file format: {}".format(file_extension))

        return content



    def preprocess(self):
        content = self.data_ingestion()
        tokenized = blankline_tokenize(content)
        return tokenized

    def extract_resume_info(self):
       text = self.preprocess()
       # Join the tokenized content into a single string
       text = "\n".join(text)
       lines = [line.strip() for line in text.splitlines() if line.strip()]
       
       # ---------- 1. Extract Phone (10-digit, Indian style) ----------
       phone = re.findall(r'\b[6-9]\d{9}\b', text)
       phone = phone[0] if phone else None

       # ---------- 2. Extract Email ----------
       email = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
       email = email[0] if email else None

       # ---------- 3. Extract Links ----------
       raw_urls = re.findall(r'https?://[^\s)>\]]+', text)
       markdown_urls = re.findall(r'\[.*?\]\((https?://[^\s)]+)\)', text)
       html_urls = re.findall(r'href="(https?://[^\s"]+)"', text)
       links = list(set(raw_urls + markdown_urls + html_urls))

       # ---------- 4. Extract Name ----------
       # Assume name is first non-empty line and contains no @ or digits
       name = None
       for line in lines[:5]:
           if not any(char.isdigit() for char in line) and '@' not in line and len(line.split()) <= 5:
               name = line
               break

       return {
           "name": name,
           "phone": phone,
           "email": email,
           "links": links,
       }
   

    def section_identification(self):
        document = self.data_ingestion()
        tokenized = self.preprocess()

        sections = {}
        sections_list = []
        sections_index = {}
        previous_section = None

        resume_info = self.extract_resume_info()
        sections.update(resume_info)
            
        for num in range(len(tokenized)):
            section_name = tokenized[num].split("\n")[0].title()
            element = tokenized[num].split("\n")[0].strip().title()
            sections_list.append(element)
            for section, keywords in self.SECTION_KEYWORDS.items():
                if any(keyword in section_name for keyword in keywords):
                    section_name = section_name.strip()
                    sections_index[section_name] = sections_list.index(section_name)
                    current_section = section
                    sections[current_section] = []
                    if previous_section:
                        sections[ps] = (tokenized[sections_index[previous_section]:sections_index[section_name]])
                        previous_section = section_name
                        ps = current_section
                    else:
                        previous_section = section_name
                        ps = current_section

            for section, details in sections.items():
                sections[section] = str("".join(details))   

        return sections


    def csv_format(self):
        json_data = self.section_identification()
        
        # Convert JSON data to a dictionary if it's a JSON string
        if isinstance(json_data, str):
            json_data = json.loads(json_data)

        # Define the uploads directory
        uploads_dir = "uploads"
        os.makedirs(uploads_dir, exist_ok=True)  # Create the directory if it doesn't exist

        # Define the CSV file path
        csv_file_path = os.path.join(uploads_dir, "resume_data.csv")
        
        # Open the CSV file for writing
        with open(csv_file_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            
            # Write the header
            writer.writerow(["Section", "Content"])

            # Iterate through the JSON data and write to CSV
            for section, content in json_data.items():
                if isinstance(content, list):
                    for item in content:
                        writer.writerow([section, item])
                else:
                    writer.writerow([section, str(content)])

        # Simulate file download (in a real application, you would return the file)
        # For example, in a Flask app, you would use send_file(csv_file_path)

        # After the file is downloaded, delete it
        # os.remove(csv_file_path)


    def json_format(self):
        raw_text = self.section_identification()
        
        # Define the uploads directory
        uploads_dir = "uploads"
        os.makedirs(uploads_dir, exist_ok=True)  # Create the directory if it doesn't exist

        # Define the JSON file path
        json_file_path = os.path.join(uploads_dir, "resume_data.json")
        
        # Write the JSON data to the file
        with open(json_file_path, "w", encoding="utf-8") as f:
            json.dump(raw_text, f, indent=4, ensure_ascii=False)

        # Read the JSON data back from the file
        with open(json_file_path, "r", encoding="utf-8") as f:
            content = json.load(f)

        return content


    def excel_format(self):
        raw_text = self.section_identification()
        rows = []
        
        # Prepare the data for the DataFrame
        for section, items in raw_text.items():
            if isinstance(items, list):
                for item in items:
                    rows.append({"Section": section, "Content": item})
            else:
                rows.append({"Section": section, "Content": str(items)})

        # Define the uploads directory
        uploads_dir = "uploads"
        os.makedirs(uploads_dir, exist_ok=True)  # Create the directory if it doesn't exist

        # Define the Excel file path
        excel_file_path = os.path.join(uploads_dir, "resume_data.xlsx")
        
        # Create a DataFrame and save it to an Excel file
        df = pd.DataFrame(rows)
        df.to_excel(excel_file_path, index=False)

        # Read the Excel file back (if needed)
        # You can use pd.read_excel() if you want to read it back into a DataFrame
        # df_read = pd.read_excel(excel_file_path)

        # Delete the Excel file after reading
        os.remove(excel_file_path)

        return df  # Return the DataFrame if needed



In [None]:
def insert_resume(email, resume_data):
    inserted = resumes.insert_one({"email": email, "resume": resume_data})
    return "Inserted ID:", inserted.inserted_id

def get_format_resume(id, email, format):
    if format == "json":
        extracted = resumes.find_one({ "$and" : [{ "id": id }, { "email": email }]}, {"json": 1})
    elif format == "csv":
        extracted = resumes.find_one({ "$and" : [{ "id": id }, { "email": email }]}, {"csv": 1})
    elif format == "json":
        extracted = resumes.find_one({ "$and" : [{ "id": id }, { "email": email }]}, {"excel": 1})
    else:
        return "No data matched"
    return extracted

In [30]:
import uuid

rp = ResumeParser("Resume.docx") 
email = "praveentak715@gmail.com"

id = "1245"
fjson = rp.json_format()
fcsv = rp.csv_format()
fexcel = rp.excel_format()

In [None]:
fcsv

<Response 0 bytes [200 OK]>

In [7]:
insert_resume(id, email, "Resume.docx", fjson, fcsv, fexcel)

Inserted ID: 687fc588438f205d7e05fc4e
687fc588438f205d7e05fc4e,
id=1245,
email=praveentak715@gmail.com,
resume data=Resume.docx,
{'name': 'Praveen Tak', 'phone': '9462096002', 'email': 'praveentak715@gmail.com', 'links': '', 'personal_info': 'PROFILE', 'experience': 'WORK EXPERIENCEFeb 2025 – Apr 2025\tProject Trainee – Defence Research and Development Organisation (DRDO)\t\t\t\t\t Jodhpur, India\nDomain: Machine Learning | Computer Vision | Deep LearningKey Responsibilities & Learnings:Worked on the project “Painting Generation using CycleGANs” exploring advanced concepts in image-to-image translation using Generative Adversarial Networks.Implemented neural network models using Python, TensorFlow/PyTorch, and other ML libraries.Performed data preprocessing, augmentation, and evaluation of generated outputs.', 'projects': 'PROJECTSJan 2025  Feb 2025\tMushroom Classification — Machine Learning', 'skills': 'Aug 2023  Jan 2025\tFull Stack Data Science Pro — PW SkillsOct 2024\tCodemathon 2

In [44]:
get_format_resume(id, email, "csv")

{'_id': ObjectId('687faf83a4033d534d855263'), 'csv': None}