In [4]:
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import spacy
import re

In [14]:
# Load spaCy's pre-trained model
!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.2 MB/s eta 0:00:11
     -- ------------------------------------- 0.8/12.8 MB 1.2 MB/s eta 0:00:11
     --- ------------------------------------ 1.0/12.8 MB 1.2 MB/s eta 0:00:11
     ---- ----------------------------------- 1.3/12.8 MB 1.2 MB/s eta 0:00:10
     ---- ----------------------------------- 1.6/12.8 MB 1.1 MB/s eta 0:00:11
     ----- ---------------------------------- 1.8/12.8 MB 1.1 MB/s eta 0:00:10
     ------ --------------------------------- 2.1/12.8 MB 1.2 MB/s eta 0:00:10
     --------- ------------------------------ 2.9/12.8 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        # Try direct text extraction
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text

        if text.strip():
            return text.strip()
    except Exception as e:
        print(f"Direct text extraction failed: {e}")

    # Fallback to OCR for image-based PDFs
    print("Falling back to OCR for image-based PDF.")
    try:
        images = convert_from_path(pdf_path)
        for image in images:
            page_text = pytesseract.image_to_string(image)
            text += page_text + "\n"
    except Exception as e:
        print(f"OCR failed: {e}")

    return text.strip()

In [38]:
pdf_path = r"C:\Users\Admin\OneDrive\Desktop\codedot\backend\media\resume\bariankit_btech.pdf"
resume_text = extract_text_from_pdf(pdf_path)

print("\nExtracted Text from PDF:")
print(resume_text)


Extracted Text from PDF:
BARI ANKIT VINOD
github.com/OnlyCR7 | linkedin.com/in/mycr7/ | vbari8527@gmail.com | +91-7875618947
Portfolio Website : https://onlycr7.github.io/DataSci_Labs_Portfolio/
Skills
Languages: C/C++, Java, JavaScript, SQL, Python
Technologies & Tools: DBMS, Tableau, PowerBI, Docker, ML & DL, NLP, Visualization, Computer Vision, TensorFlow,
PyTorch, LLMs
Education
Theem College of Engg. 2021 - 2022 - 2023
Diploma in Computer Science and Engineering Percentage: 85%
Relevant Coursework: Object Oriented Programming, Databases, Data Structures and Algorithms, Operating Systems, Computer
Networks, Computer Graphics
Vidyavardhini’s College of Engg. And Tech. 2023 - Now
B.E. in Artificial Intelligence and Data Science
Relevant Coursework: Object Oriented Programming, Databases, Discrete Maths, Applied Maths, Data Structures and Analysis
of Algorithms, Operating Systems, Computer Networks, Machine Learning, Data Mining, Advance Data Structures and
Algorithms, Information Re

In [30]:
# Function to extract Name using SpaCy NER
def extract_name(text):
    # Process the text with SpaCy
    doc = nlp(text)
    
    # Loop through named entities detected by SpaCy
    for ent in doc.ents:
        # If the entity is a person, return the name
        if ent.label_ == "PERSON":
            return ent.text
    return None

In [31]:
# Function to extract emails using spaCy and regex
def extract_email(text):
    # Use spaCy to process the text
    doc = nlp(text)
    
    # Find all potential email matches using regex
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    
    # If emails are found, return the list, otherwise return None
    return emails if emails else None

In [32]:
# Function to clean the text using spaCy
def clean_text(text):
    # Process text with spaCy
    doc = nlp(text)
    
    # Remove non-alphanumeric characters (excluding spaces and dashes in phone numbers)
    cleaned_text = ' '.join([token.text for token in doc if token.is_alpha or token.is_space or token.text in ['-', '+']])
    
    return cleaned_text

# Function to extract phone number using Regex
def extract_phone(text):
    # Clean the text first
    cleaned_text = clean_text(text)
    
    # Use regex to find phone number patterns
    phone = re.search(r"\+?\d[\d\s\-\(\)]{9,}\d", cleaned_text)
    
    return phone.group(0) if phone else None

In [33]:
# Function to extract skills using NLP techniques
def extract_skills(text):
    # Apply spaCy NLP pipeline
    doc = nlp(text)
    
    # List of possible skill-related categories or contexts
    skill_keywords = ["programming", "language", "tool", "technology", "framework", "library", "platform"]
    
    # Extract noun phrases that might represent skills
    skills = set()  # Using a set to avoid duplicates
    
    for np in doc.noun_chunks:
        # We look for noun phrases that might represent skills (e.g., programming languages, tools, etc.)
        if any(keyword in np.text.lower() for keyword in skill_keywords):
            skills.add(np.text.strip())
    
    # Filter the list to make it more meaningful (optional)
    # You can add logic to further filter based on predefined skill patterns
    
    return list(skills)

In [34]:
def extract_education(text):
    # Use spaCy to process the text
    doc = nlp(text)
    education = []

    # Define regular expressions for patterns like degrees, years, etc.
    degree_keywords = r"\b(Bachelor|Master|PhD|Diploma|Degree|Engineering|Science|Arts)\b"
    institution_keywords = r"\b(College|University|Institute|Academy)\b"
    
    # Iterate over sentences
    for sent in doc.sents:
        # Extract sentences with keywords related to education
        if re.search(degree_keywords, sent.text, re.IGNORECASE) or re.search(institution_keywords, sent.text, re.IGNORECASE):
            education.append(sent.text.strip())

    # Further process education entries to refine results (like adding the institution, degree, and years)
    refined_education = []
    for entry in education:
        # Extract degree, institution, and year/duration using regex
        degree_match = re.search(degree_keywords, entry)
        institution_match = re.search(institution_keywords, entry)
        year_match = re.search(r"\d{4}(-\d{4})?", entry)

        # Build a structured education entry
        education_entry = {}

        if degree_match:
            education_entry["Degree"] = degree_match.group(0)
        if institution_match:
            education_entry["Institution"] = institution_match.group(0)
        if year_match:
            education_entry["Year"] = year_match.group(0)

        if education_entry:
            refined_education.append(education_entry)

    return refined_education

In [35]:
# Function to clean and extract Experience using NLP
def extract_experience(text):
    # Step 1: Preprocess the text
    text = text.replace("\n", " ")  # Replace newlines with space
    doc = nlp(text)  # Parse the text using spaCy NLP model
    
    # Step 2: Identify sentences with experience-related terms
    experience_sentences = []
    
    # We'll look for common phrases related to work experience, projects, and roles
    experience_keywords = ['project', 'work', 'experience', 'role', 'responsibilities', 'intern', 'development', 'managed']
    
    for sent in doc.sents:
        if any(keyword in sent.text.lower() for keyword in experience_keywords):
            experience_sentences.append(sent.text.strip())
    
    # Step 3: Further clean up by extracting relevant entities like organizations, roles, and dates
    experience_data = []
    for sent in experience_sentences:
        entities = []
        for ent in nlp(sent).ents:
            if ent.label_ in ["ORG", "DATE", "GPE", "PERSON", "WORK_OF_ART"]:  # Look for orgs, dates, roles
                entities.append(ent.text)
        
        # Only append sentences with some useful entities
        if entities:
            experience_data.append({"sentence": sent, "entities": entities})
    
    return experience_data

In [36]:
# Call functions and print the extracted details
name = extract_name(resume_text)
email = extract_email(resume_text)
phone = extract_phone(resume_text)
skills = extract_skills(resume_text)
experience = extract_experience(resume_text)
education = extract_education(resume_text)

In [37]:
# Print the results
print("Name:", name)
print("Email:", email)
print("Phone:", phone)
print("Skills:", skills)
print("Experience:", experience)
print("Education:", education)

Name: Quizz Website
Email: ['vbari8527@gmail.com']
Phone: None
Skills: ['programming languages', 'programming section', 'programming part']
Experience: [{'sentence': 'ANKIT BARI PROFILE EDUCATION BARI ANKIT VINOD Diploma in Computer Engineering-Theem College 2021 - 2023 CS Student I learn more about networking, programming languages and databases.', 'entities': ['ANKIT', 'Computer Engineering-Theem College', '2021 - 2023']}, {'sentence': 'Palghar Gungwada PROJECT EXPERIENCE Pin - 401601 Land Area Calculation in Python (I am performing programming part of our project.)', 'entities': ['Palghar Gungwada PROJECT']}, {'sentence': 'We are getting API key of Google Map and using some Machine Learning concepts we are performed this project.', 'entities': ['API', 'Machine Learning']}, {'sentence': 'Quizz Website in Html,CSS and JS (I am doing programming section of our project.)', 'entities': ['Quizz Website', 'Html', 'CSS']}, {'sentence': 'Python Programming Learn New Skills Attendance Managem