<a href="https://colab.research.google.com/github/Prince125047/college-project/blob/main/resume_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf docx2txt pandas

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3960 sha256=a600ad0093277ce38558145e81ad84225b4b894253eac63a46dd1a8df05d8597
  Stored in directory: /root/.cache/pip/wheels/0f/0e/7a/3094a4ceefe657bff7e12dd9592a9d5b6487ef4338ace0afa6
Successfully built docx2txt
Installing collected packages: docx2txt, pymupdf
Successfully installed docx2txt-0.8 pymupdf-1.25.3


In [5]:
import re
import fitz  # PyMuPDF for PDFs
import docx2txt  # For DOCX
import spacy
import pandas as pd
import json
import string

# Load pre-trained NLP model (spaCy)
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text

# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = " ".join(text.split())  # Remove extra spaces
    return text

# Function to extract technical skills dynamically
def extract_technical_skills(text):
    tech_skills = {
        "python", "java", "c++", "javascript", "sql", "html", "css", "react", "node.js",
        "machine learning", "deep learning", "tensorflow", "pytorch", "nlp", "opencv",
        "big data", "hadoop", "spark", "kafka", "data science", "cloud computing",
        "aws", "azure", "gcp", "cybersecurity", "penetration testing", "blockchain",
        "docker", "kubernetes", "ci/cd", "git", "devops", "linux", "bash scripting",
        "flask", "django", "fastapi", "ruby", "scala", "go", "rust", "typescript",
        "graphql", "postgresql", "mongodb", "firebase", "elasticsearch", "redis",
        "unity", "unreal engine", "computer vision", "llms", "prompt engineering"
    }
    extracted_skills = set()
    doc = nlp(text)
    for token in doc:
        if token.text.lower() in tech_skills:
            extracted_skills.add(token.text.lower())
    # Use NLP entity recognition for additional skills
    additional_skills = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT"]]
    extracted_skills.update(additional_skills)
    return list(extracted_skills)

# Function to extract interpersonal skills
def extract_interpersonal_skills(text):
    soft_skills = {
        "communication", "teamwork", "leadership", "problem-solving",
        "adaptability", "creativity", "collaboration", "empathy",
        "time management", "critical thinking", "negotiation", "work ethic",
        "decision-making", "conflict resolution", "attention to detail",
        "emotional intelligence", "resilience", "public speaking", "active listening",
        "persuasion", "networking", "mentoring", "stress management"
    }
    extracted_skills = set()
    doc = nlp(text)
    for token in doc:
        if token.text.lower() in soft_skills:
            extracted_skills.add(token.text.lower())
    return list(extracted_skills)

# Function to extract insights from projects and experience
def extract_project_insights(text):
    keywords = {
        "developed", "built", "created", "designed", "implemented",
        "analyzed", "optimized", "deployed", "engineered", "researched",
        "integrated", "evaluated", "refactored", "debugged", "automated",
        "tested", "configured", "launched", "prototyped", "constructed",
        "innovated", "streamlined", "refactored", "mentored", "led",
        "collaborated", "architected", "initiated", "pioneered"
    }
    insights = []
    sentences = text.split(". ")
    for sentence in sentences:
        for keyword in keywords:
            if keyword in sentence.lower():
                insights.append(sentence)
                break
    return insights

# Function to parse resume
def parse_resume(file_path, file_type="pdf"):
    text = extract_text_from_pdf(file_path) if file_type == "pdf" else extract_text_from_docx(file_path)
    text = clean_text(text)  # Clean text for better processing
    parsed_data = {
        "technical_skills": extract_technical_skills(text),
        "interpersonal_skills": extract_interpersonal_skills(text),
        "project_insights": extract_project_insights(text),
        "cleaned_text": text  # Fully cleaned text for Word2Vec model
    }

    # Convert to DataFrame
    df = pd.DataFrame([parsed_data])

    # Convert to JSON
    json_data = json.dumps(parsed_data, indent=4)

    return df, json_data

# Example Usage
df_resume, json_resume = parse_resume("/content/My Resume (1).pdf", "pdf")
print(df_resume)
print(json_resume)




                       technical_skills interpersonal_skills  \
0  [python, css, javascript, sql, java]                   []   

                                    project_insights  \
0  [rohit kumar software developer eager to solve...   

                                        cleaned_text  
0  rohit kumar software developer eager to solve ...  
{
    "technical_skills": [
        "python",
        "css",
        "javascript",
        "sql",
        "java"
    ],
    "interpersonal_skills": [],
    "project_insights": [
        "rohit kumar software developer eager to solve real world problem into feasible software solution rohitkr8568gmailcom 916206449606 patna india education btech cse bakhtiyarpur college of enginnering patna 122021 present 805 polytechnic state board of technical education sbte 062016 072019 83 matriculate dav high school patna 042014 062015 83 work experience salesforce developer intern salesforce 122023 022024 patna bihar during my virtual internship at sales