In [None]:
!apt install tesseract-ocr poppler-utils libtesseract-dev
!pip install pytesseract pdf2image spacy scikit-learn pandas PyMuPDF python-Levenshtein
!python -m spacy download en_core_web_lg

In [None]:
import os
import zipfile
import fitz
from pdf2image import convert_from_path
import pytesseract
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from google.colab import files
from PIL import Image
import io
import re

In [None]:
# Load the spaCy pipeline once at module level; spacy_preprocess() reuses this shared `nlp` object.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Tell pytesseract which Tesseract executable to run.
# NOTE(review): hard-coded path; presumably where the apt package installs the binary on the Colab host — confirm on other platforms.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

In [None]:
# Ask the user for a zip archive of CVs and unpack it into `extract_folder`.
uploaded = files.upload()
# Fail with a clear message instead of a bare StopIteration when the upload was cancelled.
if not uploaded:
    raise ValueError("No file was uploaded; please upload a .zip archive containing the CVs.")
# Only the first uploaded file is used.
zip_filename = next(iter(uploaded))
# Reject non-zip uploads up front rather than failing inside ZipFile with an opaque error.
if not zipfile.is_zipfile(zip_filename):
    raise ValueError(f"{zip_filename!r} is not a valid zip archive.")
extract_folder = 'Extracted_CVs'

with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

Saving CVs_Data Science.zip to CVs_Data Science.zip


In [None]:
def preprocess_image(image):
    """Return ``image`` converted to mode ``'L'`` (single-channel grayscale).

    Args:
        image: Any object exposing ``convert(mode)``; presumably a PIL image
            produced by pdf2image — confirm against callers.

    Returns:
        Whatever ``image.convert('L')`` returns.
    """
    return image.convert('L')

In [None]:
def extract_text_with_ocr(pdf_path):
    """OCR every page of a PDF and report the mean word confidence.

    Each page is rendered at 300 dpi, passed through ``preprocess_image`` and
    handed to ``pytesseract.image_to_data``. Only entries whose text is
    non-blank contribute to the text and to the confidence average.

    Args:
        pdf_path: Path of the PDF file to rasterise and OCR.

    Returns:
        A ``(text, avg_confidence)`` tuple. ``text`` is all recognised words
        joined by single spaces, with a ``'\\n\\n'`` element appended after
        each page. ``avg_confidence`` is the mean of the per-word ``conf``
        values, or ``0`` when no word was recognised.
    """
    pieces = []
    confidence_sum = 0
    word_count = 0

    for page_image in convert_from_path(pdf_path, dpi=300):
        ocr = pytesseract.image_to_data(
            preprocess_image(page_image),
            output_type=pytesseract.Output.DICT,
        )

        # 'text' and 'conf' are parallel columns of the OCR result table.
        for word, conf in zip(ocr['text'], ocr['conf']):
            if not word.strip():
                continue  # blank entries carry no usable text
            pieces.append(word)
            confidence_sum += float(conf)
            word_count += 1

        # Page separator; it is joined with spaces like any other element.
        pieces.append('\n\n')

    if word_count > 0:
        avg_confidence = confidence_sum / word_count
    else:
        avg_confidence = 0
    return ' '.join(pieces), avg_confidence

In [None]:
def extract_text_from_pdf(pdf_path, min_text_chars=100):
    """Extract text from a PDF, falling back to OCR when the text layer is poor.

    The embedded text layer is read first. If reading fails, or the stripped
    text is shorter than ``min_text_chars``, the file is processed with
    ``extract_text_with_ocr`` instead.

    Args:
        pdf_path: Path of the PDF file.
        min_text_chars: Minimum number of non-whitespace-trimmed characters
            the text layer must contain to be trusted (default 100).

    Returns:
        A ``(text, confidence)`` tuple. For text-layer extraction the
        confidence is fixed at ``100.0``; for OCR it is whatever
        ``extract_text_with_ocr`` reports.

    Raises:
        Any exception raised by ``extract_text_with_ocr``. OCR is attempted
        at most once per call.
    """
    try:
        # Context manager guarantees the document handle is closed.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as exc:
        # Best-effort: an unreadable text layer is not fatal, OCR may still work.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print(f"Text-layer extraction failed for {pdf_path} ({exc}); falling back to OCR")
        text = ""

    # The OCR fallback sits outside the try block so an OCR failure is raised
    # once instead of being caught and triggering a second OCR run.
    if len(text.strip()) < min_text_chars:
        return extract_text_with_ocr(pdf_path)
    return text, 100.0

In [None]:
def spacy_preprocess(text):
    """Normalise free text into a space-separated string of lemmas.

    Steps: lowercase, replace every character that is not a letter, digit or
    whitespace with a space, run the shared ``nlp`` pipeline, then keep the
    lemma of each token that is alphabetic, not a stop word, not punctuation
    and longer than two characters.

    Args:
        text: Raw text. ``None`` or an empty string yields ``''``.

    Returns:
        The retained lemmas joined by single spaces.
    """
    if not text:
        return ''

    # Substitute a space (not the empty string) for special characters so that
    # words joined by punctuation stay separate: "Matplotlib/Seaborn" becomes
    # "matplotlib seaborn" rather than the unmatchable "matplotlibseaborn".
    # The text is already lowercased, so a-z covers every ASCII letter.
    normalized = re.sub(r'[^a-z0-9\s]', ' ', text.lower()).strip()

    lemmas = (
        token.lemma_.lower().strip()
        for token in nlp(normalized)
        if token.is_alpha and not token.is_stop and not token.is_punct
    )
    # Drop one- and two-character lemmas.
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 2)

In [None]:
# Job posting that every CV is scored against; it is passed through spacy_preprocess() before comparison.
job_description = """
Requirements:
Bachelor’s or Master’s degree in Data Science.
2+ years of experience in a data science or similar analytical role.

Proficient in Python or R, and SQL.

Experience with machine learning libraries (e.g., scikit-learn, TensorFlow, XGBoost).

Strong understanding of statistical analysis and modeling techniques.

Experience with data visualization tools such as Tableau, Power BI, or Matplotlib/Seaborn.

Knowledge of big data tools like Spark, Hadoop, or cloud platforms (AWS, GCP, Azure) is a plus.

Excellent problem-solving skills and attention to detail.

Strong communication skills with the ability to explain complex technical concepts to non-technical stakeholders.

Preferred Qualifications:
Experience with time series forecasting, NLP, or recommendation systems.

Familiarity with version control systems like Git.

Experience working in agile environments.
"""

In [None]:
# Build one record per PDF found directly inside `extract_folder`
# (sub-directories are not descended into).
cv_data = []
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the record order, and therefore ranking tie-breaks, reproducible.
for filename in sorted(os.listdir(extract_folder)):
    # Case-insensitive check so files such as "Resume.PDF" are not skipped.
    if not filename.lower().endswith('.pdf'):
        print(f"Skipping non-PDF file: {filename}")
        continue

    pdf_path = os.path.join(extract_folder, filename)
    text, ocr_confidence = extract_text_from_pdf(pdf_path)
    processed_text = spacy_preprocess(text)
    cv_data.append({
        'filename': filename,
        'raw_text': text,
        'processed_text': processed_text,
        'ocr_confidence': ocr_confidence
    })

Skipping non-PDF file: .ipynb_checkpoints


In [None]:
# Reduce the job description to its set of distinct preprocessed tokens.
job_processed = spacy_preprocess(job_description)
job_tokens = set(job_processed.split())

# Rebuild each CV record with an added 'processed_tokens' set; the existing
# keys are copied unchanged into a new dict.
cv_data = [
    dict(cv, processed_tokens=set(cv['processed_text'].split()))
    for cv in cv_data
]

In [None]:
# Grade each CV by the fraction of job-description tokens it contains, scaled to 0-10.
results = []
for cv in cv_data:
    matched_tokens = job_tokens.intersection(cv['processed_tokens'])
    # Guard against an empty job description to avoid division by zero.
    match_percentage = len(matched_tokens) / len(job_tokens) if job_tokens else 0
    final_score = match_percentage * 10
    results.append({
        'Filename': cv['filename'],
        'Grade/10': round(final_score, 2),
        # sorted(): set iteration order for strings can differ between kernel
        # sessions, which made the displayed table and exported CSV vary from
        # run to run. Alphabetical order is stable.
        'Matched Tokens': sorted(matched_tokens)
    })

# Highest grade first; sorted() is stable, so ties keep their order from `results`.
top_5 = sorted(results, key=lambda x: x['Grade/10'], reverse=True)[:5]

In [None]:
# Widen the column display limit (a global pandas option) so long token lists
# are cut off later, then render the five best-scoring CVs as a table.
pd.set_option('display.max_colwidth', 300)
df = pd.DataFrame(top_5)
print("\nTop 5 CVs:")
display(df)


Top 5 CVs:


Unnamed: 0,Filename,Grade/10,Matched Tokens
0,John Minati.pdf,7.76,"[data, xgboost, spark, nontechnical, complex, power, concept, agile, system, familiarity, qualification, analysis, version, modeling, statistical, azure, git, bachelor, recommendation, proficient, cloud, communication, series, computer, tool, control, sql, tensorflow, nlp, aws, strong, degree, b..."
1,DanBrown.pdf,4.34,"[data, complex, technique, power, statistic, system, analysis, modeling, work, mathematic, statistical, git, recommendation, proficient, series, tool, aws, big, experience, skill, master, datum, stakeholder, time, python, machine, visualization, science, platform, year, forecast, learning, ability]"
2,Steven_Brandon_CV.pdf,4.21,"[data, spark, complex, analytical, statistic, system, analysis, work, bachelor, cloud, communication, series, computer, sql, aws, big, experience, skill, master, datum, time, field, python, machine, science, platform, year, tableau, like, gcp, learning, ability]"
3,Raymond-CV.pdf,3.29,"[data, spark, nontechnical, technique, power, analytical, system, analysis, modeling, bachelor, cloud, sql, tensorflow, nlp, big, experience, skill, datum, python, machine, hadoop, visualization, science, year, tableau]"
4,Isabella Clark.pdf,3.16,"[data, complex, technique, power, analysis, modeling, work, statistical, recommendation, aws, skill, experience, master, datum, stakeholder, time, python, machine, visualization, science, platform, year, learning, ability]"


In [None]:
# Write the ranking table to CSV (without the index column) and hand the file
# to files.download().
output_csv = 'CV_Ranking_Results.csv'
df.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>