In [1]:
# ! pip install pdf2image
# ! pip3 install pdfminer.six

# ! pip install opencv-python-headless
# ! pip3 install torch torchvision torchaudio
# ! pip install easyocr
# ! pip install langdetect
# ! pip install googletrans

In [2]:
from langdetect import detect
from easyocr import Reader
from googletrans import Translator
import os
import csv
import re
from pdf2image import convert_from_path
from PIL import Image
import numpy as np
import tempfile
from pdfminer.high_level import extract_text

# Load our list of hard skills from the HardSkill.csv file (taken from the database)
import pandas as pd
# Read the hard-skill reference table and keep its 'label' column as a plain list.
hardskill = pd.read_csv('HardSkill.csv')
hardskill_list = hardskill.loc[:, 'label'].to_list()


# define the extract function
def read_text(pdf_path, reader, in_line=True, poppler_path="/opt/homebrew/bin/"):
    """Run OCR over every page of a PDF and return the recognized text.

    Each page is rasterized with ``convert_from_path`` into a temporary
    directory (removed when the function returns), converted to a NumPy
    array and handed to ``reader.readtext``.

    Args:
        pdf_path: Path of the PDF file to read.
        reader: OCR reader exposing ``readtext(image, detail=0, ...)``; the
            script passes ``easyocr.Reader`` instances.
        in_line: When true, ``readtext`` is called with ``paragraph=True``;
            otherwise it is called without the ``paragraph`` argument.
        poppler_path: Directory containing the poppler binaries, forwarded
            to ``convert_from_path``. Defaults to the Homebrew location that
            used to be hard-coded here, so existing callers are unaffected;
            override it on machines where poppler lives elsewhere.

    Returns:
        The recognized text fragments of all pages joined with newlines
        (an empty string when nothing was recognized).
    """
    ocr_kwargs = {"detail": 0}
    if in_line:
        ocr_kwargs["paragraph"] = True

    with tempfile.TemporaryDirectory() as workdir:
        # Convert the PDF to one image per page
        pages = convert_from_path(pdf_path, output_folder=workdir, poppler_path=poppler_path)
        fragments = []
        for page in pages:
            # detail=0 makes readtext return bare strings, so they can be
            # collected directly.
            fragments.extend(reader.readtext(np.array(page), **ocr_kwargs))
        return "\n".join(fragments)
    
 
# Folder that holds the CV PDFs to analyse
cv_folder = '80_CV'

# Collected results: the hard skills found for each CV
extracted_skills_list = []

# Names of the files whose direct text extraction gave nothing (None or empty)
files_to_process_later = []

# One OCR reader per non-English official UN language; every reader is also
# loaded with English. Keys are the language codes used as translation source.
reader_lang_map = {
    lang_code: Reader(['en', ocr_code])
    for lang_code, ocr_code in (
        ('fr', 'fr'),
        ('ar', 'ar'),
        ('zh-cn', 'ch_sim'),
        ('ru', 'ru'),
        ('es', 'es'),
    )
}

# Define translator function
def translate_text(text, source='fr', target='en'):
    """Translate *text* from *source* to *target* using googletrans.

    Args:
        text: Text to translate.
        source: Source language code. A falsy value (for example the ``None``
            that ``detect_language`` returns when detection fails) falls back
            to ``'auto'`` so the translator determines the language itself
            instead of failing on a missing code.
        target: Destination language code.

    Returns:
        The translated text.
    """
    translator = Translator()
    translation = translator.translate(text, src=source or 'auto', dest=target)
    return translation.text

def detect_language(text):
    """Return the language code that ``detect`` reports for *text*, or None.

    Detection is best-effort: any ordinary exception raised by ``detect``
    results in ``None`` rather than propagating to the caller.
    """
    try:
        return detect(text)
    except Exception:
        # Kept deliberately broad to preserve the best-effort contract, but
        # unlike a bare ``except`` this no longer swallows KeyboardInterrupt
        # or SystemExit.
        return None

# pipeline 1: CVs whose text can be extracted directly from the PDF

# Build one case-insensitive matcher per hard skill up front; the patterns do
# not depend on the CV, so there is no need to rebuild them for every file.
# NOTE(review): a skill only matches when it is delimited by whitespace or the
# start/end of the text, so "Python," (followed by punctuation) is not found.
skill_patterns = [
    (skill, re.compile(r'(?:^|\s){}(?:$|\s)'.format(re.escape(skill)), re.IGNORECASE))
    for skill in hardskill_list
]

# Iterate through the PDF files in the folder
for filename in os.listdir(cv_folder):
    # Case-insensitive so files such as "CV.PDF" are processed as well
    if not filename.lower().endswith('.pdf'):
        continue

    cv_path = os.path.join(cv_folder, filename)

    # Convert the PDF to text
    cv_text = extract_text(cv_path)

    # No embedded text (None or blank): defer the file to the OCR pipeline
    if cv_text is None or cv_text.strip() == '':
        files_to_process_later.append(filename)
        continue

    # Detect the language of the text; when detection fails (None) let the
    # translator auto-detect instead of handing it a missing language code
    source_lang = detect_language(cv_text) or 'auto'

    # Perform the translation
    cv_text_translated = translate_text(cv_text, source=source_lang, target='en')

    # Extract the hard skills from the CV (the list is named hardskill_list;
    # the previous reference to `hard_skills` raised NameError)
    extracted_skills = [
        skill for skill, pattern in skill_patterns
        if pattern.search(cv_text_translated)
    ]

    # Store the extracted hard skills for the current CV
    extracted_skills_list.append({'filename': filename, 'hardskill': ', '.join(extracted_skills)})

        
# pipeline 2: CVs whose text can only be obtained through OCR

# Build one case-insensitive matcher per hard skill up front; the patterns do
# not depend on the CV, so there is no need to rebuild them for every file.
skill_patterns = [
    (skill, re.compile(r'(?:^|\s){}(?:$|\s)'.format(re.escape(skill)), re.IGNORECASE))
    for skill in hardskill_list
]

# Process the files that had no text or empty string after initial extraction
for filename in files_to_process_later:
    cv_path = os.path.join(cv_folder, filename)

    # Try the OCR readers in order and keep the first one that yields any text.
    # Both variables are reset per file so a failed OCR can never reuse the
    # text or language of a previously processed CV.
    # NOTE(review): the first reader returning non-empty text wins, so
    # source_lang is that reader's key rather than a detected language.
    cv_text = ''
    source_lang = None
    for lang, reader in reader_lang_map.items():
        candidate = read_text(cv_path, reader)
        if candidate is not None and candidate.strip() != '':
            cv_text = candidate
            source_lang = lang
            break

    if source_lang is None:
        # No reader produced text: nothing to translate or match, but keep a
        # row for the file so it still appears in the output.
        extracted_skills_list.append({'filename': filename, 'hardskill': ''})
        continue

    # Perform the translation
    cv_text_translated = translate_text(cv_text, source=source_lang, target='en')

    # Extract the hard skills from the CV (the list is named hardskill_list;
    # the previous reference to `hard_skills` raised NameError)
    extracted_skills = [
        skill for skill, pattern in skill_patterns
        if pattern.search(cv_text_translated)
    ]

    # Store the extracted hard skills for the current CV
    extracted_skills_list.append({'filename': filename, 'hardskill': ', '.join(extracted_skills)})

# Define the path for the output CSV file
output_csv = 'extracted_skills_trans_to_English.csv'

# Write the extracted skills to the CSV file, one row per CV.
# The encoding is set explicitly so that non-ASCII file names or skill labels
# are written as UTF-8 on every platform instead of depending on the locale's
# default encoding. newline='' is what the csv module expects for its files.
with open(output_csv, 'w', newline='', encoding='utf-8') as file:
    fieldnames = ['filename', 'hardskill']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(extracted_skills_list)

print("Extracted hard skills have been saved in:", output_csv)


ModuleNotFoundError: No module named 'langdetect'