In [16]:
# ================================
# STEP 1: Install OCR Dependencies
# ================================
!apt-get update -y #Ensures you get the latest versions of system tools
!apt-get install -y tesseract-ocr poppler-utils #tesseract ocr is to extract from images, and poppler is used to read pdfs
!pip install pytesseract pillow opencv-python pdf2image

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entr

In [17]:
# ================================
# STEP 2: Import Required Libraries
# ================================
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from google.colab import files #enables file upload and download
import cv2
import numpy as np
import re
import os

In [18]:
# ================================
# STEP 3: Upload CV File (PDF or Image)
# ================================
print("Upload your CV (PDF / PNG / JPG):")
# Opens a file picker in Colab; the result is a dict keyed by the saved file name.
uploaded = files.upload()

# The picker can be dismissed without choosing a file, leaving the dict empty.
# Fail here with a clear message instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and choose a PDF, PNG, or JPG."
    )

# Only one CV is processed per run: take the first uploaded name and say so
# explicitly if more files were supplied, rather than ignoring them silently.
filename = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Note: {len(uploaded)} files uploaded; only the first one will be processed.")
print(f"Uploaded file: {filename}")

Upload your CV (PDF / PNG / JPG):


Saving PrajjwalNakarmiCV.pdf to PrajjwalNakarmiCV (2).pdf
Uploaded file: PrajjwalNakarmiCV (2).pdf


In [19]:
# ================================
# STEP 4: Image Preprocessing Function
# ================================
def preprocess_image(pil_image):
    """Turn a PIL image into a binarised grayscale array ready for OCR.

    Args:
        pil_image: A ``PIL.Image.Image`` in any mode (RGB, RGBA, L, P, ...).

    Returns:
        A 2-D ``uint8`` NumPy array containing only 0 and 255, produced by
        Otsu thresholding of the grayscale image.
    """
    # PIL delivers pixels in RGB order and may hand over grayscale, palette or
    # RGBA images. Normalising to 3-channel RGB first keeps cvtColor from
    # failing on 2-D arrays and makes the channel order unambiguous.
    rgb = np.array(pil_image.convert("RGB"))

    # RGB2GRAY (not BGR2GRAY): the array is in PIL's RGB order, so the BGR code
    # would apply the red and blue luminance weights to the wrong channels.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # With THRESH_OTSU the threshold argument (0) is ignored and the cut-off is
    # chosen automatically; the first return value is that computed threshold.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

In [20]:
# ================================
# STEP 5: OCR Function for Image
# ================================
def ocr_image(image):
    """Run OCR on a single image and return the recognised text.

    The image is passed through ``preprocess_image`` and the result is handed
    to ``pytesseract.image_to_string`` with its default settings.
    """
    return pytesseract.image_to_string(preprocess_image(image))

In [21]:
# ================================
# STEP 6: OCR Logic for PDF or Image
# ================================
# Reset on every run so re-executing this cell never appends to stale text.
extracted_text = ""

if filename.lower().endswith(".pdf"):
    print("Processing PDF file...")
    # pdf2image rasterises the PDF into one PIL image per page.
    pages = convert_from_path(filename)
    # Collect per-page text and join once, instead of growing a string with +=.
    page_texts = []
    for page_number, page in enumerate(pages, start=1):
        print(f"OCR on page {page_number}")
        # Trailing newline keeps the last word of one page from fusing with
        # the first word of the next.
        page_texts.append(ocr_image(page) + "\n")
    extracted_text = "".join(page_texts)
else:
    print("Processing Image file...")
    # Context manager closes the underlying file handle once OCR is done;
    # a bare Image.open() would leave it open for the life of the kernel.
    with Image.open(filename) as image:
        extracted_text = ocr_image(image)

Processing PDF file...
OCR on page 1
OCR on page 2


In [22]:
# ================================
# STEP 7: Clean Extracted Text
# ================================
def clean_text(text):
    """Normalise raw OCR output into a single tidy line of text.

    Pipe characters are removed, every run of whitespace (spaces, tabs, line
    breaks) is collapsed to one space, and leading/trailing whitespace is
    stripped.

    Args:
        text: Raw string returned by the OCR step.

    Returns:
        The cleaned string; empty if ``text`` held only whitespace and pipes.
    """
    # Drop pipes BEFORE collapsing whitespace: removing them afterwards turns
    # "a | b" into "a  b" and leaves double spaces in the result.
    # NOTE(review): a standalone '|' may be a misread capital "I"; deleting it
    # then loses that word -- confirm whether a smarter substitution is wanted.
    text = text.replace('|', '')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise the raw OCR output; `cleaned_text` is what the later cells print and save.
cleaned_text = clean_text(extracted_text)

In [23]:
# ================================
# STEP 8: Show Output
# ================================
# Banner and extracted text in one call; sep="\n" yields the same line breaks
# as printing the two values separately.
print("\n===== EXTRACTED CV TEXT =====\n", cleaned_text, sep="\n")



===== EXTRACTED CV TEXT =====

PRAJJWAL NAKARMI Tamshipakha, Kathmandu  9840196565  prajjwalnakarmi3@gmail.com  Github  Linkedin CAREER OBJECTIVE Aspiring to become a proficient and innovative Software Developer or Full Stack Developer by leveraging a solid foundation in Computing and Artificial Intelligence.  am passionate about designing and building intelligent, scalable, and user-centric applications that address real-world challenges. With a continuous drive for learning and growth,  aim to contribute to the development of cutting-edge technological solutions across diverse domains.  thrive in collaborative environments where  can apply my technical skills, problem- solving mindset, and adaptability to deliver impactful results and push the boundaries of what technology can achieve. EDUCATION BACKGROUND BSc (Hons) Computing with Artificial Intelligence 2023 - Present e Islington College, London Metropolitan University * Kamal Marg, Kamalpokhari, Kathmandu AISSCE 2021 - 2023 e Mod

In [24]:
# ================================
# STEP 9: Save OCR Output
# ================================
output_file = "extracted_cv_text.txt"

# Write the cleaned text to the notebook's working directory as UTF-8.
with open(output_file, mode="w", encoding="utf-8") as text_out:
    text_out.write(cleaned_text)

# Hand the saved file to the Colab download helper.
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>