# Resume Dataset

## Getting the data

### Selecting 20 random resume samples

In [1]:
import kagglehub
import os
import random

# Download latest version
path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")

print("Path to dataset files:", path)

# Get a list of all files in the dataset directory
all_files = []
for root, dirs, files in os.walk(path):
    for file in files:
        all_files.append(os.path.join(root, file))

# Get 20 random samples
random_samples = random.sample(all_files, 20)

print("\n20 Random samples from the dataset:")
for sample in random_samples:
    print(sample)

Using Colab cache for faster access to the 'resume-dataset' dataset.
Path to dataset files: /kaggle/input/resume-dataset

20 Random samples from the dataset:
/kaggle/input/resume-dataset/data/data/TEACHER/10504237.pdf
/kaggle/input/resume-dataset/data/data/ARTS/20488267.pdf
/kaggle/input/resume-dataset/data/data/ADVOCATE/17847636.pdf
/kaggle/input/resume-dataset/data/data/APPAREL/19714635.pdf
/kaggle/input/resume-dataset/data/data/ADVOCATE/28206098.pdf
/kaggle/input/resume-dataset/data/data/BUSINESS-DEVELOPMENT/29908929.pdf
/kaggle/input/resume-dataset/data/data/DIGITAL-MEDIA/27419236.pdf
/kaggle/input/resume-dataset/data/data/ACCOUNTANT/19446337.pdf
/kaggle/input/resume-dataset/data/data/APPAREL/12122372.pdf
/kaggle/input/resume-dataset/data/data/CHEF/18036030.pdf
/kaggle/input/resume-dataset/data/data/FINANCE/39675895.pdf
/kaggle/input/resume-dataset/data/data/DESIGNER/14014749.pdf
/kaggle/input/resume-dataset/data/data/BANKING/17818707.pdf
/kaggle/input/resume-dataset/data/data/FINA

### Converting pdf to image

In [2]:
!apt-get install poppler-utils
!pip install pdf2image

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.11 [186 kB]
Fetched 186 kB in 1s (269 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Suc

In [3]:
from pdf2image import convert_from_path

output_folder = "/content/resume_images"
os.makedirs(output_folder, exist_ok=True)

for pdf_path in random_samples:
    try:
        pages = convert_from_path(pdf_path)
        image_name = os.path.basename(pdf_path).replace('.pdf', '.jpg')
        image_path = os.path.join(output_folder, image_name)
        pages[0].save(image_path, 'JPEG')
        print(f"Converted {pdf_path} to {image_path}")
    except Exception as e:
        print(f"Error converting {pdf_path}: {e}")

Converted /kaggle/input/resume-dataset/data/data/TEACHER/10504237.pdf to /content/resume_images/10504237.jpg
Converted /kaggle/input/resume-dataset/data/data/ARTS/20488267.pdf to /content/resume_images/20488267.jpg
Converted /kaggle/input/resume-dataset/data/data/ADVOCATE/17847636.pdf to /content/resume_images/17847636.jpg
Converted /kaggle/input/resume-dataset/data/data/APPAREL/19714635.pdf to /content/resume_images/19714635.jpg
Converted /kaggle/input/resume-dataset/data/data/ADVOCATE/28206098.pdf to /content/resume_images/28206098.jpg
Converted /kaggle/input/resume-dataset/data/data/BUSINESS-DEVELOPMENT/29908929.pdf to /content/resume_images/29908929.jpg
Converted /kaggle/input/resume-dataset/data/data/DIGITAL-MEDIA/27419236.pdf to /content/resume_images/27419236.jpg
Converted /kaggle/input/resume-dataset/data/data/ACCOUNTANT/19446337.pdf to /content/resume_images/19446337.jpg
Converted /kaggle/input/resume-dataset/data/data/APPAREL/12122372.pdf to /content/resume_images/12122372.jp

## Image Preprocesing: OpenCV

In [4]:
! pip install opencv-python matplotlib numpy



In [5]:
import matplotlib.pyplot as plt
import cv2
import numpy as np

def display_image(image, title="Image"):
    plt.figure(figsize=(7, 7))
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(title)
    plt.axis('off')
    plt.show()

In [7]:
# Convert the image to grayscale
def convert_to_grayscale(image):
  return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [8]:
def reduce_noise(gray_image):
  return cv2.GaussianBlur(gray_image, (5, 5), 0)

In [9]:
def binarize_image(blur_reduced_image):
  return cv2.adaptiveThreshold(
    blur_reduced_image,
    255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY, # Invert the colors (text becomes white because of matplotlib)
    11, # Block size
    4  # Constant C
  )

In [10]:
def deskew_image(image):
    """
    Corrects the skew of an image by finding the minimum area rectangle
    of the text block and rotating accordingly.
    """
    # Find all non-zero (white) pixels
    coords = cv2.findNonZero(image)

    # Get the minimum area bounding rectangle
    # It returns (center(x,y), (width, height), angle of rotation)
    rect = cv2.minAreaRect(coords)
    angle = rect[-1] - 90

    # The `cv2.minAreaRect` angle has a specific range.
    # We need to adjust it for our rotation.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = angle

    # Get the rotation matrix and rotate the image
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h),
                             flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    print(f"Detected skew angle: {angle:.2f} degrees")

    # Now, rotate the original grayscale image by the same angle
    (h, w) = rotated.shape
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    deskewed_gray = cv2.warpAffine(rotated, M, (w, h),
                                  flags=cv2.INTER_CUBIC,
                                  borderMode=cv2.BORDER_REPLICATE)

    return deskewed_gray

Let's run the above code for all images

In [11]:
def process_one_image(image):
  image = convert_to_grayscale(image)
  print("Converted image to grayscale..")
  image = reduce_noise(image)
  print("Reduced noise in the image..")
  image = binarize_image(image)
  print("Binarized the image..")
  image = deskew_image(image)
  print("Corrected image orientation..")
  return image

In [12]:
import time
input_folder_path = "/content/resume_images"
output_folder_path = "/content/processed_images" # Changed output folder name

start_time = time.time()

if os.makedirs(output_folder_path, exist_ok=True):
  print(f"Created folder: {output_folder_path}")

# Get a list of image files in the input folder
image_files = [f for f in os.listdir(input_folder_path) if f.endswith('.jpg')] # Filter for jpg files

print(f"Processing {len(image_files)} images...") # Print the number of images being processed

for i, image_name in enumerate(image_files[:20], 1): # Iterate through the image files
  print(f"Processing image {i}/{len(image_files)}: {image_name}")
  image_path = os.path.join(input_folder_path, image_name)
  image = cv2.imread(image_path)

  if image is None: # Check if image loading was successful
      print(f"Warning: Could not load image {image_path}. Skipping.")
      print("-" * 50)
      continue

  processed_image = process_one_image(image)
  # save image
  output_path = os.path.join(output_folder_path, image_name)
  cv2.imwrite(output_path, processed_image)
  print(f"Saved processed image to: {output_path}")
  print("-"*50)

print("Processing images is completed.")
print(f"Total time taken: {time.time() - start_time} seconds")

Processing 20 images...
Processing image 1/20: 10504237.jpg
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected skew angle: 0.00 degrees
Corrected image orientation..
Saved processed image to: /content/processed_images/10504237.jpg
--------------------------------------------------
Processing image 2/20: 28206098.jpg
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected skew angle: 0.00 degrees
Corrected image orientation..
Saved processed image to: /content/processed_images/28206098.jpg
--------------------------------------------------
Processing image 3/20: 18036030.jpg
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected skew angle: 0.00 degrees
Corrected image orientation..
Saved processed image to: /content/processed_images/18036030.jpg
--------------------------------------------------
Processing image 4/20: 17252448.jpg
Converted image to grayscale..
Reduced noise i

### Installation
`pip install pytesseract pillow`


### Tesseract

For developers, integrating Tesseract into an application is straightforward using its API. Here is a simple example using the `pytesseract` wrapper in Python:

```python

from PIL import Image
import pytesseract


text = pytesseract.image_to_string(Image.open(filename))

print(text)
```



In [13]:
! pip install pytesseract pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [14]:
from PIL import Image
import pytesseract
import os

processed_image_folder = '/content/processed_images'

# Get a list of files in the processed images folder
processed_images = os.listdir(processed_image_folder)

# Check if there are any processed images
if processed_images:
    # Take the first image file found
    sample_image_name = processed_images[0]
    sample_image_path = os.path.join(processed_image_folder, sample_image_name)

    print(f"Using sample image: {sample_image_path}")

    # Perform OCR on the sample image
    text = pytesseract.image_to_string(Image.open(sample_image_path))

    print("\nExtracted Text:")
    print(text)
else:
    print(f"No processed images found in {processed_image_folder}")

Using sample image: /content/processed_images/10504237.jpg

Extracted Text:
BIOLOGY TEACHER

Experience

11/2016 to Current

Biology Teacher Company Name 1% City , State

* Execute, impkment, and modify kesson plans while incorporating differentiated instruction and multiple intelligences.

© Design and align lessons, labs, and assessments incorporating STEM, problem based karning, Common Core and NGSS.

© Volunteer and participate in schools extracurricular activities such as selling tickets for the school talent show and participating in the
Lindenwold HS 5K for the scholarship fimd.

© Teach and translate materials utilizing Sheltered Instruction techniques for English Language Learners.

© Co-teach with special education teachers while executing modifications in student IEP and 504 plans.

* Plan and present Google applications training for Lmdenwold HS professional development.

09/2011 to 11/2016
Biology Teacher Company Name 1% City, State

© Seck out of district professional dev

In [15]:
from PIL import Image
import pytesseract
import time

input_folder_path = "/content/processed_images"
output_folder_path = "/content/tesseract_output"
start_time = time.time()

if os.makedirs(output_folder_path, exist_ok=True):
  print(f"Created folder: {output_folder_path}")

total_images = sum(1 for entry in os.scandir(input_folder_path))
print(f"Total images in folder: {total_images}")

for i, image_name in enumerate(os.listdir(input_folder_path)[:20], 1):
  print(f"Processing image {i}/{total_images}: {image_name}")
  image_path = os.path.join(input_folder_path, image_name)
  print("Extracting text from image..")
  text = pytesseract.image_to_string(Image.open(image_path))
  output_path = os.path.join(output_folder_path, image_name.replace(".jpg", ".txt"))
  with open(output_path, "w") as f:
    f.write(text)

  print(f"Saved extracted text to {output_path}")
  print("-"*50)

print("Text Extraction Completed.")
print(f"Total time taken: {time.time() - start_time} seconds")

Total images in folder: 20
Processing image 1/20: 10504237.jpg
Extracting text from image..
Saved extracted text to /content/tesseract_output/10504237.txt
--------------------------------------------------
Processing image 2/20: 28206098.jpg
Extracting text from image..
Saved extracted text to /content/tesseract_output/28206098.txt
--------------------------------------------------
Processing image 3/20: 18036030.jpg
Extracting text from image..
Saved extracted text to /content/tesseract_output/18036030.txt
--------------------------------------------------
Processing image 4/20: 17252448.jpg
Extracting text from image..
Saved extracted text to /content/tesseract_output/17252448.txt
--------------------------------------------------
Processing image 5/20: 12547982.jpg
Extracting text from image..
Saved extracted text to /content/tesseract_output/12547982.txt
--------------------------------------------------
Processing image 6/20: 63137898.jpg
Extracting text from image..
Saved extract

## Information Extraction

In [16]:
prompt = """
Extract the following information from the given resume:
- Name
- Contact Information (phone, email, LinkedIn, etc.)
- Summary/Objective
- Work Experience (company, job title, dates, responsibilities)
- Education (degree, major, institution, graduation date)
- Skills

The resume has been processed (converted to grayscale, noise reduced, binarized, and deskewed) using OpenCV, and text has been extracted using Tesseract.
Use the extracted text as support for extracting information.
If you believe the text extraction is incorrect somewhere, you may correct it yourself and provide corrected information.
Always give your response in the following JSON format:
{
    "name": "NAME",
    "contact_information": {
        "phone": "PHONE_NUMBER",
        "email": "EMAIL_ADDRESS",
        "linkedin": "LINKEDIN_PROFILE_URL"
        // Add other relevant contact details
    },
    "summary": "SUMMARY_TEXT",
    "work_experience": [
        {
            "company": "COMPANY_NAME",
            "job_title": "JOB_TITLE",
            "dates": "START_DATE - END_DATE",
            "responsibilities": ["RESPONSIBILITY 1", "RESPONSIBILITY 2"]
        }
        // Add other work experiences
    ],
    "education": [
        {
            "degree": "DEGREE",
            "major": "MAJOR",
            "institution": "INSTITUTION_NAME",
            "graduation_date": "GRADUATION_DATE"
        }
        // Add other educational degrees
    ],
    "skills": ["SKILL 1", "SKILL 2"]
}
Respond with the extracted information only in the specified format.
Here is the extracted text:


"""

In [17]:
from google import genai
from google.colab import userdata # colab only code
from PIL import Image
import json
import time

In [19]:
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [20]:
image_folder_path = "/content/processed_images"
text_folder_path = "/content/tesseract_output"
output_folder_path = "/content/json_output"

start_time = time.time()

if os.makedirs(output_folder_path, exist_ok=True):
  print(f"Created folder: {output_folder_path}")

total_images = sum(1 for entry in os.scandir(image_folder_path))
print(f"Total images in folder: {total_images}")

# Initialize the Gemini model
model = genai.GenerativeModel('gemini-2.5-flash')

for i, image_name in enumerate(os.listdir(image_folder_path)[:20], 1):
  print(f"Processing image {i}/{total_images}: {image_name}")
  image_path = os.path.join(image_folder_path, image_name)
  print(f"Loading image: {image_path}")
  with open(image_path, "rb") as f:
    image = Image.open(image_path)


  text_path = os.path.join(text_folder_path, image_name.replace(".jpg", ".txt"))
  print(f"Loading extracted text: {text_path}")
  with open(text_path, "r") as f:
    text = f.read()

  print("Extracting information from image and text..")

  full_prompt = prompt + text

  contents = [
        image,
        {
            "text": full_prompt
        }
    ]
  response = model.generate_content(contents=contents)

  # Access the usage_metadata attribute
  usage_metadata = response.usage_metadata

  # Print the different token counts
  print(f"Input Token Count: {usage_metadata.prompt_token_count}")
  print(f"Output Token Count: {usage_metadata.candidates_token_count}")
  print(f"Total Token Count: {usage_metadata.total_token_count}")

  try:
      extracted_information = json.loads(response.text.replace('```json', '').replace('```', ''))
      output_path = os.path.join(output_folder_path, image_name.replace(".jpg", ".json"))
      with open(output_path, "w") as f:
        json.dump(extracted_information, f, indent=4)

      print(f"Saved extracted information to {output_path}")
  except json.JSONDecodeError as e:
      print(f"Error decoding JSON for {image_name}: {e}")
      print(f"Response text: {response.text}")

  print("-"*50)
  time.sleep(1) # Added a small delay to avoid hitting API limits


print("Information Extraction Completed.")
print(f"Total time taken: {time.time() - start_time} seconds")

Total images in folder: 20
Processing image 1/20: 10504237.jpg
Loading image: /content/processed_images/10504237.jpg
Loading extracted text: /content/tesseract_output/10504237.txt
Extracting information from image and text..
Input Token Count: 1596
Output Token Count: 1606
Total Token Count: 5283
Saved extracted information to /content/json_output/10504237.json
--------------------------------------------------
Processing image 2/20: 28206098.jpg
Loading image: /content/processed_images/28206098.jpg
Loading extracted text: /content/tesseract_output/28206098.txt
Extracting information from image and text..
Input Token Count: 1321
Output Token Count: 903
Total Token Count: 4460
Saved extracted information to /content/json_output/28206098.json
--------------------------------------------------
Processing image 3/20: 18036030.jpg
Loading image: /content/processed_images/18036030.jpg
Loading extracted text: /content/tesseract_output/18036030.txt
Extracting information from image and text..


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 5129.14ms


Input Token Count: 1432
Output Token Count: 1035
Total Token Count: 2858
Saved extracted information to /content/json_output/12547982.json
--------------------------------------------------
Processing image 6/20: 63137898.jpg
Loading image: /content/processed_images/63137898.jpg
Loading extracted text: /content/tesseract_output/63137898.txt
Extracting information from image and text..
Input Token Count: 1464
Output Token Count: 1127
Total Token Count: 4205
Saved extracted information to /content/json_output/63137898.json
--------------------------------------------------
Processing image 7/20: 24668861.jpg
Loading image: /content/processed_images/24668861.jpg
Loading extracted text: /content/tesseract_output/24668861.txt
Extracting information from image and text..
Input Token Count: 1262
Output Token Count: 666
Total Token Count: 2527
Saved extracted information to /content/json_output/24668861.json
--------------------------------------------------
Processing image 8/20: 29908929.jpg

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 5617.88ms


Input Token Count: 1828
Output Token Count: 1434
Total Token Count: 7259
Saved extracted information to /content/json_output/27419236.json
--------------------------------------------------
Processing image 14/20: 19714635.jpg
Loading image: /content/processed_images/19714635.jpg
Loading extracted text: /content/tesseract_output/19714635.txt
Extracting information from image and text..
Input Token Count: 1357
Output Token Count: 613
Total Token Count: 4484
Saved extracted information to /content/json_output/19714635.json
--------------------------------------------------
Processing image 15/20: 30397268.jpg
Loading image: /content/processed_images/30397268.jpg
Loading extracted text: /content/tesseract_output/30397268.txt
Extracting information from image and text..
Input Token Count: 1627
Output Token Count: 1139
Total Token Count: 4449
Saved extracted information to /content/json_output/30397268.json
--------------------------------------------------
Processing image 16/20: 20488267.