In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
from platform import python_version

# Print the version of the Python interpreter running this kernel.
print(python_version())

3.12.7


In [4]:
# Initialize and load spaCy's pretrained "en_core_web_sm" pipeline.
# Per the original note it includes a tokenizer, tagger, parser, NER and lemmatizer.
import spacy
nlp = spacy.load("en_core_web_sm")

In [20]:
!pip install pytesseract



In [5]:
# Step 1: helper functions for extracting raw text from resume files (PDF, DOCX, TXT, images)

from pdfminer.high_level import extract_text as extract_text_from_pdf
from PIL import Image
import pytesseract
import docx
from docx import Document
import os
import re

def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        file_path: Path to the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until garbage
    # collection.
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img)

def extract_text(file_path):
    """Return the text content of a file, choosing the reader by file extension.

    The extension is compared case-insensitively:
      * ``.pdf``  -> ``extract_text_from_pdf``
      * ``.docx`` -> paragraph texts of the document joined with newlines
      * ``.txt``  -> file contents read as UTF-8 (undecodable bytes ignored)
      * ``.png`` / ``.webp`` / ``.jpg`` / ``.jpeg`` -> ``extract_text_from_image``

    Args:
        file_path: Path to the input file.

    Returns:
        The extracted text as a string.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)

    if suffix == ".docx":
        paragraphs = Document(file_path).paragraphs
        return '\n'.join(paragraph.text for paragraph in paragraphs)

    if suffix == ".txt":
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()

    if suffix in ('.png', '.webp', '.jpg', '.jpeg'):
        return extract_text_from_image(file_path)

    raise ValueError(f"unsupported file type : {suffix}")

In [8]:
# Pass the raw extracted text through the spaCy pipeline to obtain the processed `doc` object.
# NOTE(review): `extracted_text` is assigned in the "step 2" save/log cell below, so on a
# fresh kernel that cell has to run before this one.
doc = nlp(extracted_text)

In [13]:
# Step 2: save the extracted text to a timestamped file and append an entry to the log
from datetime import datetime

# NOTE: `extract_text` is deliberately NOT re-imported from pdfminer here. Doing so
# rebinds the name and silently replaces the multi-format `extract_text` dispatcher
# defined in the step 1 cell for the rest of the session (pdfminer's function is
# already available there as `extract_text_from_pdf`).

# Step 1: Define the input file path
file_path = '../data/cv.pdf'  # Your input file

try:
    # Step 2: Extract text (dispatches on the file extension)
    extracted_text = extract_text(file_path)
    print("Extracted Text:\n", extracted_text)

    # Step 3: Prepare output directory
    output_dir = "../data/extracted_text_sample"
    os.makedirs(output_dir, exist_ok=True)

    # Step 4: Generate output filename based on input file
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"{base_name}_extracted_{timestamp}.txt"
    dynamic_txt_file = os.path.join(output_dir, output_filename)

    # Step 5: Save extracted text to file
    with open(dynamic_txt_file, "w", encoding="utf-8") as out_file:
        out_file.write(extracted_text)

    # Step 6: Log saved file (append mode keeps the history of earlier runs)
    log_file_path = os.path.join(output_dir, "log.txt")
    with open(log_file_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"{datetime.now()} - Saved: {output_filename}\n")

    print(f"✅ Text saved to: {dynamic_txt_file}")
    print(f"📝 Log updated at: {log_file_path}")

except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook run.
    # If this triggers, `dynamic_txt_file` may be unset for the cells that follow.
    print("❌ Error:", e)


Extracted Text:
 A s h o k   L a m s a l
Tech enthusiast, CS student at TU

Khairahani-6, Chitwan ,  Parsa

9865254615

ashoklamsal007@gmail.com

 @ https://www.linkedin.com/in/ashok-lamsal-8576311b9/

Passionate, dedicated sixth-semester BSC CSIT student with a passion for technology. I am actively delving into Python for machine
learning and have solid foundation in C, C++, and data structures with  basic knowledge of web development. I am currently
exploring  elds of ML and data science using numpy pandas, matplotlib, seaborne and scikitlearn

EX PER IENCE

School Teacher

Shree basic School Basyouli

Work as lower secondary level Science and Math teacher. 

Project lead

Code For Change

Khairahani

Jul, 2023 - Jul, 2024

Chitwan

Jan, 2025 - Present

Lead the team of CFC Chitwan, a nonpro t, nonpolitical student community actively organizing events and workshops in the
Chitwan region. 

Microsoft learnt student ambassador

Microsoft

Beta MLSA participated in multiple events and 

In [7]:
# Resume text preprocessing: normalize whitespace and section headers in the extracted text

def clean_resume_text(text):
    """Normalize whitespace and section headers in raw resume text.

    Steps:
      1. Collapse every run of whitespace (including newlines) into one space.
      2. For each known section header, rewrite any case-insensitive,
         optionally letter-spaced occurrence ("E X P E R I E N C E",
         "EX PER IENCE", "Experience") to the canonical upper-case header.

    Headers are only rewritten when they stand alone as a word. Without the
    word boundaries, ordinary words were mangled ("experienced" ->
    "EXPERIENCEd") and neighbouring words could be fused together
    ("risk ills" -> "riSKILLS").

    Args:
        text: Raw resume text.

    Returns:
        The cleaned, single-line text with canonical upper-case headers.
    """
    section_headers = (
        'EXPERIENCE', 'PROJECTS', 'EDUCATION', 'SKILLS',
        'CERTIFICATIONS', 'TRAINING', 'REFERENCES',
    )

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    for header in section_headers:
        # 'EXPERIENCE' -> 'E\s*X\s*P...' so letter-spaced headers also match;
        # the same pattern covers the plain word, so one pass both repairs
        # broken headers and upper-cases intact ones.
        letter_spaced = r'\s*'.join(header)
        text = re.sub(rf'\b{letter_spaced}\b', header, text, flags=re.IGNORECASE)

    return text



In [8]:
def extract_section(text, section):
    """
    Extracts the content of a section from resume text.

    The section starts right after the first case-insensitive occurrence of
    `section` and runs up to (not including) the next known section header,
    or to the end of the text when no further header follows.

    Args:
        text: Resume text to search.
        section: Section heading to look for. It is matched literally, so
            regex metacharacters in it (e.g. parentheses) are safe.

    Returns:
        The section content with surrounding whitespace stripped, or None if
        `section` does not occur in `text`.
    """
    # Escape the heading so it is treated as plain text, not as a regex fragment.
    heading = re.escape(section)
    next_header = r"(?:EXPERIENCE|PROJECTS|EDUCATION|SKILLS|CERTIFICATIONS|TRAINING|REFERENCES|$)"
    pattern = rf"{heading}\s*(.*?)(?={next_header})"
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else None


In [9]:

def _first_match(pattern, text):
    """Return the first substring of `text` that matches `pattern`, or None."""
    found = re.search(pattern, text)
    return found.group(0) if found else None


def extract_basic_fields(text):
    """Extract name and contact details from resume text.

    Args:
        text: Resume text; line breaks must still be present because the name
            is searched for in the first 10 lines.

    Returns:
        A dict with the keys "name", "email", "phone", "linkedin" and
        "github". Each value is the first match found, or None.
    """
    name = None

    # Clean text. Tabs and pipes become spaces (carriage returns are dropped)
    # so that tab-separated fields such as "a@b.com\t9800000000" are not fused
    # into a single token.
    cleaned_text = text.replace('\r', '').replace('\t', ' ').replace('|', ' ')
    lines = cleaned_text.split('\n')[:10]

    # Name extraction: first of the leading lines that looks like a name wins
    for line in lines:
        # Fix OCR-broken names ("A s h o k   L a m s a l") by removing all whitespace
        joined = ''.join(line.split())

        # Step 1: split the joined text on capital letters
        words = re.findall(r'[A-Z][a-z]{1,}', joined)
        if 2 <= len(words) <= 4:
            name = ' '.join(words)
            break

        # Step 2: fall back to a leading "Firstname Lastname" pattern
        match = re.match(r"^[A-Z][a-z]+ [A-Z][a-z]+", line.strip())
        if match:
            name = match.group().strip()
            break

    # Other basic detail fields: only the first occurrence of each is kept
    return {
        "name": name,
        "email": _first_match(r"[\w\.-]+@[\w\.-]+", cleaned_text),
        "phone": _first_match(r"\+?\d[\d \-\(\)]{8,14}\d", cleaned_text),
        "linkedin": _first_match(r"https?://[^\s]*linkedin\.com[^\s]*", cleaned_text),
        "github": _first_match(r"https?://[^\s]*github\.com[^\s]*", cleaned_text),
    }


In [10]:
#main parsing function
def parse_resume(text):
    """Parse raw resume text into a flat dict of basic fields and sections.

    The basic fields come from ``extract_basic_fields`` on the raw text, which
    applies the carriage-return / tab / pipe clean-up itself. The section
    fields come from ``extract_section`` on the output of
    ``clean_resume_text``.

    Args:
        text: Raw resume text.

    Returns:
        A dict holding the basic fields followed by the keys "education",
        "experience", "projects", "skills" and "certifications".
    """
    normalized = clean_resume_text(text)
    section_keys = ("education", "experience", "projects", "skills", "certifications")

    fields = dict(extract_basic_fields(text))
    for key in section_keys:
        # Section headings are looked up by their upper-case name
        fields[key] = extract_section(normalized, key.upper())

    return fields



In [14]:
# Check the structured output for the extracted text:
# re-read the file at `dynamic_txt_file` (set by the save/log cell) and parse it.
with open(dynamic_txt_file, "r", encoding="utf-8") as f:
    resume_text = f.read()

parsed_resume = parse_resume(resume_text)
# Bare last expression so the notebook displays the parsed dict
parsed_resume


{'name': 'Ashok Lamsal',
 'email': 'ashoklamsal007@gmail.com',
 'phone': '9865254615',
 'linkedin': 'https://www.linkedin.com/in/ashok-lamsal-8576311b9/',
 'github': None,
 'education': 'Bsc CSIT Birendra Multiple Campus SLC Daisy English Boarding Secondary School BIo-Science (11 and 12 ) GPA: 3.66 SEE Shree Khairahani Secondary School',
 'experience': 'School Teacher Shree basic School Basyouli Work as lower secondary level Science and Math teacher. Project lead Code For Change Khairahani Jul, 2023 - Jul, 2024 Chitwan Jan, 2025 - Present Lead the team of CFC Chitwan, a nonpro\x00t, nonpolitical student community actively organizing events and workshops in the Chitwan region. Microsoft learnt student ambassador Microsoft Beta MLSA participated in multiple events and recently organized an Azure fundamentals workshop.',
 'projects': 'Loading......',
 'skills': 'programming language : C, C++, Data science software libraries: Python (NumPy, Pandas,Maplotlib,Seaborn, Scikit learn), Soft ski

In [16]:
import pandas as pd

# Wrap the parsed dict in a list so it becomes a single-row DataFrame (one column per field)
df = pd.DataFrame([parsed_resume])
df.head()


Unnamed: 0,name,email,phone,linkedin,github,education,experience,projects,skills,certifications
0,Ashok Lamsal,ashoklamsal007@gmail.com,9865254615,https://www.linkedin.com/in/ashok-lamsal-85763...,,Bsc CSIT Birendra Multiple Campus SLC Daisy En...,School Teacher Shree basic School Basyouli Wor...,Loading......,"programming language : C, C++, Data science so...",7 days data science workshop Code for Change M...
