In [3]:
import pandas as pd

# Load the hard-skill reference table from the working directory.
# Its 'label' column is converted into the list of skills to search for in the next cell.
hardskill = pd.read_csv('HardSkill.csv')

In [4]:
# Build the list of skill labels to search for in each CV.
# - dropna / astype(str): a missing or numeric label would make re.escape() raise later on.
# - strip + drop empties: padded labels can never match whitespace-normalised CV text,
#   and an empty label would match any whitespace.
# - drop_duplicates: a label listed twice would be reported twice per CV.
hardskill_list = (
    hardskill['label']
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda labels: labels != '']
    .drop_duplicates()
    .tolist()
)

In [None]:
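# One-time setup: uncomment to install the dependencies into the kernel's environment.
# pdf2image also needs the poppler binaries to be installed on the system.
# %pip install pdf2image
# %pip install pdfminer.six

# %pip install opencv-python-headless
# %pip install torch torchvision torchaudio
# %pip install easyocr
# %pip install googletrans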

In [16]:
# to build some OCR extractors

# Standard library first, then third-party packages.
# Reader (easyocr) and convert_from_path (pdf2image) drive the OCR fallback.
import csv
import os
import re
import tempfile
from collections.abc import Mapping

import numpy as np
from easyocr import Reader
from googletrans import Translator
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
from PIL import Image

# define the extract function
def _resolve_poppler_path(poppler_path=None):
    """Pick the poppler binary directory: explicit argument, then env var, then Homebrew, then PATH."""
    if poppler_path is not None:
        return poppler_path
    env_path = os.environ.get("POPPLER_PATH")
    if env_path:
        return env_path
    # Historical default of this notebook (Homebrew on Apple Silicon); only use it if it exists.
    homebrew_path = "/opt/homebrew/bin/"
    if os.path.isdir(homebrew_path):
        return homebrew_path
    # None lets pdf2image look the poppler tools up on the system PATH.
    return None


def read_text(pdf_path, reader, in_line=True, poppler_path=None):
    """Run OCR on every page of a PDF and return the recognised text.

    Args:
        pdf_path: Path of the PDF file to rasterise.
        reader: Object exposing ``readtext(image, detail=..., paragraph=...)``
            (an ``easyocr.Reader`` in this notebook).
        in_line: Forwarded to ``readtext`` as ``paragraph``; when true the reader
            merges its detections into paragraph-level strings.
        poppler_path: Directory containing the poppler binaries. When ``None``,
            the ``POPPLER_PATH`` environment variable is used, then
            ``/opt/homebrew/bin/`` if that directory exists, otherwise the
            system PATH.

    Returns:
        All recognised text fragments, in page order, joined with newlines
        (an empty string when nothing was recognised).
    """
    # The page images live in a temporary folder, so all OCR must happen inside this block.
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(
            pdf_path,
            output_folder=path,
            poppler_path=_resolve_poppler_path(poppler_path),
        )
        text = []
        for image in images:
            image_np = np.array(image)
            # detail=0 makes the reader return plain strings instead of (box, text, confidence).
            text.extend(reader.readtext(image_np, detail=0, paragraph=bool(in_line)))
        return "\n".join(text)
    
class _LazyReaderMap(Mapping):
    """Read-only mapping of translation language code -> EasyOCR ``Reader``.

    Each ``Reader`` is built the first time its key is accessed and then cached,
    so running this cell no longer loads every OCR model up front. Iteration
    order, keys and dict-style access (``[...]``, ``.items()``, ``.values()``)
    match the plain dict this replaces.
    """

    def __init__(self, ocr_languages):
        # language code -> list of EasyOCR language identifiers for that Reader
        self._ocr_languages = dict(ocr_languages)
        self._readers = {}

    def __getitem__(self, lang):
        if lang not in self._readers:
            # Unknown codes raise KeyError here, exactly like a dict lookup.
            self._readers[lang] = Reader(self._ocr_languages[lang])
        return self._readers[lang]

    def __iter__(self):
        return iter(self._ocr_languages)

    def __len__(self):
        return len(self._ocr_languages)

    def __contains__(self, lang):
        # Overridden so a membership test does not build a Reader.
        return lang in self._ocr_languages


# OCR models for the non-English official UN languages, each paired with English.
reader_lang_map = _LazyReaderMap({
    'fr': ['en', 'fr'],
    'ar': ['en', 'ar'],
    'zh-cn': ['en', 'ch_sim'],
    'ru': ['en', 'ru'],
    'es': ['en', 'es'],
})

def _chunk_text(text, max_chars):
    """Split text on whitespace into pieces of at most ``max_chars`` characters."""
    chunks = []
    current = []
    current_len = 0
    for word in text.split():
        # A single token longer than the limit has to be cut mid-word.
        while len(word) > max_chars:
            if current:
                chunks.append(' '.join(current))
                current, current_len = [], 0
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        needed = len(word) + (1 if current else 0)
        if current and current_len + needed > max_chars:
            chunks.append(' '.join(current))
            current, current_len = [], 0
            needed = len(word)
        current.append(word)
        current_len += needed
    if current:
        chunks.append(' '.join(current))
    return chunks


def translate_text(text, source='fr', target='en', max_chars=4500):
    """Translate ``text`` from ``source`` to ``target`` with googletrans.

    Args:
        text: Text to translate.
        source: Source language code, passed to googletrans as ``src``.
        target: Destination language code, passed to googletrans as ``dest``.
        max_chars: Largest piece sent in one request. Longer text is split on
            whitespace, translated piece by piece and re-joined with single
            spaces. The default is a conservative guess; the service rejects
            overly long requests, so verify against the googletrans version in use.

    Returns:
        The translated text. ``text`` is returned unchanged, without any
        network call, when it is empty/blank or when ``source == target``.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    # Nothing to translate: avoid a pointless (and possibly failing) request.
    if not text or not text.strip() or source == target:
        return text

    translator = Translator()
    if len(text) <= max_chars:
        # Short input is sent untouched, exactly as before.
        return translator.translate(text, src=source, dest=target).text

    translated_chunks = [
        translator.translate(chunk, src=source, dest=target).text
        for chunk in _chunk_text(text, max_chars)
    ]
    return ' '.join(translated_chunks)

# Define the list of hard skills you want to extract
hard_skills = hardskill_list

# Define the path to the folder containing the CVs
cv_folder = '80_CV'

# Define the path for the output CSV file
output_csv = 'extracted_skills_trans_to_English.csv'


def _compile_skill_patterns(skills):
    """Return ``(skill, compiled_regex)`` pairs, one per usable skill label.

    Compiling once here avoids rebuilding one regex per skill for every CV.
    """
    patterns = []
    for skill in skills:
        # Labels read from a CSV can be NaN/numeric (re.escape would raise) or
        # blank (the pattern would match any whitespace); skip both.
        if not isinstance(skill, str) or not skill.strip():
            continue
        # Whitespace-delimited match, so 'xxx' is not extracted from 'xxx.com'.
        # NOTE(review): this also means a skill directly followed by punctuation
        # ('Python,') is not matched - confirm that this trade-off is intended.
        pattern = re.compile(r'(?:^|\s){}(?:$|\s)'.format(re.escape(skill)), re.IGNORECASE)
        patterns.append((skill, pattern))
    return patterns


def _extract_cv_text(cv_path):
    """Return the text of one CV, or '' when neither pdfminer nor OCR finds any."""
    # Prefer the PDF's embedded text layer.
    cv_text = extract_text(cv_path)
    if cv_text and cv_text.strip():
        return cv_text
    # No usable text layer: fall back to OCR and keep the first non-empty result.
    # NOTE(review): the first reader that recognises anything wins, so later
    # readers are only tried when earlier ones return no text at all.
    for reader in reader_lang_map.values():
        cv_text = read_text(cv_path, reader)
        if cv_text and cv_text.strip():
            return cv_text
    return ''


skill_patterns = _compile_skill_patterns(hard_skills)

# Create an empty list to store the extracted hard skills
extracted_skills_list = []

# Iterate through the PDF files in the folder (sorted so the output order is reproducible)
for filename in sorted(os.listdir(cv_folder)):
    # Case-insensitive so '.PDF' files are not silently skipped
    if not filename.lower().endswith('.pdf'):
        continue
    cv_path = os.path.join(cv_folder, filename)

    cv_text = _extract_cv_text(cv_path)
    # Nothing could be read from this file: skip it
    if not cv_text:
        continue

    # Preprocess the text (remove newlines, extra spaces, etc.)
    cv_text = ' '.join(cv_text.split())

    # Let the translator detect the source language. The language was previously
    # assumed to be English for every text-layer PDF (so those CVs were never
    # translated) and guessed from the first OCR reader that returned anything.
    cv_text_translated = translate_text(cv_text, source='auto', target='en')

    # Extract the hard skills from the CV
    extracted_skills = [skill for skill, pattern in skill_patterns if pattern.search(cv_text_translated)]

    # Store the extracted hard skills for the current CV
    extracted_skills_list.append({'filename': filename, 'hardskill': ', '.join(extracted_skills)})

# Write the extracted skills to the CSV file.
# Explicit UTF-8 so non-ASCII file names or skill labels do not depend on the platform default encoding.
with open(output_csv, 'w', newline='', encoding='utf-8') as file:
    fieldnames = ['filename', 'hardskill']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(extracted_skills_list)

print("Extracted hard skills have been saved in:", output_csv)


Extracted hard skills have been saved in: extracted_skills_trans_to_English.csv


About the regex — ChatGPT's explanation:

However, the issue in your use case (`xxx` should not be extracted from `xxx.com`) most likely arises because the dot (.) is a non-word character. In regular expressions the dot is not considered part of a word, so a word boundary (\b) occurs both before and after a dot; `\bxxx\b` therefore matches the `xxx` in `xxx.com`.

To fix this, we need to change the definition of what we consider a word boundary. Instead of using "\b", we can use whitespace to denote a boundary. 
In this regex pattern:

(?:^|\s) matches the start of the string or a whitespace.
(?:$|\s) matches the end of the string or a whitespace.
This means the skill is matched only when it is delimited by whitespace or by the start or end of the string. As a consequence, a skill directly followed by punctuation (e.g. `Python,`) is not matched either.