In [1]:
!apt install tesseract-ocr poppler-utils libtesseract-dev
!pip install pytesseract pdf2image spacy scikit-learn pandas PyMuPDF python-Levenshtein
!python -m spacy download en_core_web_lg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev poppler-utils
0 upgraded, 4 newly installed, 0 to remove and 34 not upgraded.
Need to get 3,929 kB of archives.
After this operation, 16.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.4 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.7 [186 kB]
Fetched 3,929 kB in 1s (4,628 kB/

In [2]:
import os
import zipfile
import fitz
from pdf2image import convert_from_path
import pytesseract
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from google.colab import files
from PIL import Image
import io
import re

In [3]:
nlp = spacy.load("en_core_web_lg")

In [4]:
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

In [5]:
uploaded = files.upload()
zip_filename = next(iter(uploaded))
extract_folder = 'Extracted_CVs'

with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

Saving CVs_Data Science.zip to CVs_Data Science.zip


In [6]:
def preprocess_image(image):
    image = image.convert('L')
    return image

In [7]:
def extract_text_with_ocr(pdf_path):
    pages = convert_from_path(pdf_path, dpi=300)
    full_text = ""
    confidences = []

    for page in pages:
        preprocessed_image = preprocess_image(page)
        ocr_result = pytesseract.image_to_data(preprocessed_image, output_type=pytesseract.Output.DICT)
        text = " ".join([word for word in ocr_result['text'] if word.strip() != ""])
        full_text += text + " "

        conf = [int(conf) for conf in ocr_result['conf'] if conf.isdigit()]
        if conf:
            confidences.append(sum(conf) / len(conf))

    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
    return full_text, avg_confidence


In [16]:
def extract_text_with_ocr(pdf_path):
    pages = convert_from_path(pdf_path, dpi=300)
    full_text = ""
    confidences = []

    for page in pages:
        preprocessed_image = preprocess_image(page)
        ocr_result = pytesseract.image_to_data(preprocessed_image, output_type=pytesseract.Output.DICT)
        text = " ".join([word for word in ocr_result['text'] if word.strip() != ""])
        full_text += text + " "

        # The elements in ocr_result['conf'] are likely already integers.
        # Filter out -1 values (represent no confidence) and calculate average.
        conf = [c for c in ocr_result['conf'] if c != -1]
        if conf:
            confidences.append(sum(conf) / len(conf))

    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
    return full_text, avg_confidence

In [17]:
# Helper function: NLP text preprocessing
def spacy_preprocess(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and len(token) > 2]
    return " ".join(tokens)

In [14]:
job_description = """
Requirements:
Bachelor’s or Master’s degree in Data Science.
2+ years of experience in a data science or similar analytical role.

Proficient in Python or R, and SQL.

Experience with machine learning libraries (e.g., scikit-learn, TensorFlow, XGBoost).

Strong understanding of statistical analysis and modeling techniques.

Experience with data visualization tools such as Tableau, Power BI, or Matplotlib/Seaborn.

Knowledge of big data tools like Spark, Hadoop, or cloud platforms (AWS, GCP, Azure) is a plus.

Excellent problem-solving skills and attention to detail.

Strong communication skills with the ability to explain complex technical concepts to non-technical stakeholders.

Preferred Qualifications:
Experience with time series forecasting, NLP, or recommendation systems.

Familiarity with version control systems like Git.

Experience working in agile environments.
"""

In [18]:
# Process all CVs
cv_data = []

for filename in os.listdir(extract_folder):
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(extract_folder, filename)
        raw_text, ocr_confidence = extract_text_from_pdf(pdf_path)
        processed_text = spacy_preprocess(raw_text)
        cv_data.append({
            'filename': filename,
            'raw_text': raw_text,
            'processed_text': processed_text,
            'ocr_confidence': ocr_confidence
        })

In [19]:
job_processed = spacy_preprocess(job_description)
job_doc = nlp(job_processed)

In [20]:
results = []

for cv in cv_data:
    cv_doc = nlp(cv['processed_text'])
    similarity = job_doc.similarity(cv_doc)
    results.append({
        'filename': cv['filename'],
        'similarity_score': similarity
    })

In [25]:
results = sorted(results, key=lambda x: x['similarity_score'], reverse=True)
top_5 = results[:5]

In [26]:
df = pd.DataFrame(top_5)
pd.set_option('display.max_colwidth', 300)
print("\nTop 5 CVs based on Semantic Similarity:")
display(df)


Top 5 CVs based on Semantic Similarity:


Unnamed: 0,filename,similarity_score
0,John Doe.pdf,0.968748
1,DanBrown.pdf,0.949401
2,Evelyn White_CV.pdf,0.942909
3,Isabella Clark.pdf,0.93636
4,Russel.pdf,0.931418


In [27]:
df.to_csv('CV_Ranking_Results.csv', index=False)
files.download('CV_Ranking_Results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>