In [None]:
# Install packages
%pip install -U hrflow
%pip install python-dotenv
%pip install geopy

In [None]:
# Import packages (standard library first, then third-party)
import os

from dotenv import load_dotenv

# Hrflow
from hrflow import Hrflow

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

# Load the variables declared in a local .env file into the environment
load_dotenv()

# Retrieve Keys
API_KEY = os.getenv('API_KEY')
BOARD_KEY = os.getenv('BOARD_KEY')

# Never echo the secrets themselves: notebook outputs are saved with the file
# and are easily committed or shared. Only report whether they were found.
missing_keys = [
    name for name, value in (('API_KEY', API_KEY), ('BOARD_KEY', BOARD_KEY)) if not value
]
if missing_keys:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_keys)
        + ". Define them in the environment or in a .env file."
    )
print("API_KEY and BOARD_KEY loaded from the environment")

# Hrflow client shared by the rest of the notebook
client = Hrflow(api_secret=API_KEY)

# Sanity check: list the board content and fetch one job by reference
response = client.job.storing.list(board_keys=[BOARD_KEY])
print(response)
response = client.job.storing.get(board_key=BOARD_KEY, reference="175GKFH")
print(response)

In [13]:
class FranceTravail:
    """Scrape job offers from the France Travail website into an HrFlow board.

    The class relies on two notebook-level globals defined in the setup cell:
    ``client`` (the Hrflow client) and ``BOARD_KEY`` (the target board).
    Creating an instance starts a headless Chrome session and opens the
    offers landing page; ``scrape`` and ``clear_board`` both quit that
    session when they finish, so use a new instance for each operation.
    """

    START_URL = 'https://candidat.francetravail.fr/offres/emploi'

    def __init__(self):
        """Start a headless Chrome driver on the landing page and build the geocoder."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=options)
        self.driver.get(self.START_URL)
        self.geolocator = Nominatim(user_agent="hrflow")

    def clear_board(self):
        """Archive the jobs listed on the board, then quit the browser.

        The board is listed repeatedly and every listed job is archived by
        its key. The loop stops when the listing is empty, or when the
        reported total stops decreasing (so a failing archive call cannot
        cause an endless loop). The driver is quit even if an API call raises.
        """
        try:
            previous_total = None
            while True:
                response = client.job.storing.list(board_keys=[BOARD_KEY])
                print(response)
                jobs = response.get('data') or []
                total = (response.get('meta') or {}).get('total', 0)
                if not jobs:
                    break
                if previous_total is not None and total >= previous_total:
                    print("Stopping: the last archive pass did not reduce the number of listed jobs")
                    break
                previous_total = total
                for job in jobs:
                    print(client.job.storing.archive(board_key=BOARD_KEY, key=job['key']))
        finally:
            self.driver.quit()

    @staticmethod
    def _parse_offer_header(header_text):
        """Split an offer header into ``(reference, title)``.

        The reference is the last space-separated token of the first line;
        the title is the second line, or ``None`` when there is no second line.
        """
        lines = (header_text or "").split("\n")
        reference = lines[0].split(" ")[-1]
        title = lines[1] if len(lines) >= 2 else None
        return reference, title

    @staticmethod
    def _text_at(elements, index):
        """Return ``elements[index].text``, or ``None`` when the list is too short."""
        return elements[index].text if len(elements) > index else None

    @staticmethod
    def _hrefs(elements):
        """Return the non-empty ``href`` attributes of ``elements``, in order."""
        hrefs = (element.get_attribute('href') for element in elements)
        # Elements without an href yield None, which driver.get() cannot open.
        return [href for href in hrefs if href]

    def _geocode(self, location_text, cache):
        """Return ``(lat, lng)`` for ``location_text``; ``(None, None)`` if unknown.

        Results are memoised in ``cache`` so the same location string is only
        sent to the geocoder once per crawl. Geocoder errors are reported and
        not cached, so a later offer with the same location can retry.
        """
        if not location_text:
            return None, None
        if location_text in cache:
            return cache[location_text]
        try:
            position = self.geolocator.geocode(location_text)
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Geocoding error for location {location_text}: {e}")
            return None, None
        coordinates = (position.latitude, position.longitude) if position else (None, None)
        cache[location_text] = coordinates
        return coordinates

    @staticmethod
    def _build_requirements(offer, section_titles):
        """Build the free-text ``requirements`` field from the offer sections.

        ``section_titles`` is the list of 'skill-subtitle' texts found in the
        offer (e.g. 'Expérience', 'Formation', 'Permis').
        """
        # NOTE(review): an absolute XPath ('//...') searches the whole page, not
        # only `offer`; kept as in the original lookup - confirm if scoping is wanted.
        try:
            requirements_text = offer.find_element(
                By.XPATH, '//span[@itemprop="experienceRequirements"]'
            ).text
        except NoSuchElementException:
            requirements_text = None

        require = ""
        if requirements_text is not None:
            if 'Expérience' in section_titles:
                require += "Expérience : " + requirements_text + "\n"
            # NOTE(review): the training line reuses the experience text, as the
            # original did - confirm whether a dedicated element should be read.
            if 'Formations' in section_titles or 'Formation' in section_titles:
                require += "Formations : " + requirements_text + "\n"
        if 'Permis' in section_titles:
            try:
                require += "Permis : " + offer.find_element(By.CLASS_NAME, 'skill-permis').text + "\n"
            except NoSuchElementException as e:
                # The section title exists but its content does not: skip the line only.
                print("Error NoSuchElement :\n", str(e))
        return require

    @staticmethod
    def _build_skills(offer, section_titles):
        """Collect hard skills (itemprop="skills") and soft skills of the offer."""
        skills = []
        ## Compétences (itemprop)
        if 'Compétences' in section_titles:
            for skill in offer.find_elements(By.XPATH, '//span[@itemprop="skills"]'):
                skills.append({"name": skill.text, "value": None, "type": "hard"})
        ## Savoir-être professionnels (no itemprop)
        if 'Savoir-être professionnels' in section_titles:
            skill_lists = offer.find_elements(By.CLASS_NAME, 'skill-list')
            # The soft-skill list is looked up at the same index as its section title.
            position = section_titles.index('Savoir-être professionnels')
            if position < len(skill_lists):
                for name in skill_lists[position].text.split("\n"):
                    skills.append({"name": name, "value": None, "type": "soft"})
        return skills

    def _store_current_offer(self, key, geocode_cache):
        """Parse the offer page currently open and add it to the board.

        Returns ``True`` only when the job was created (response code 201).
        Offers that are incomplete, already stored, or whose lookup fails are
        skipped and ``False`` is returned.
        """
        driver = self.driver
        try:
            offer = driver.find_element(By.CLASS_NAME, 'modal-details-offre')
        except NoSuchElementException:
            return False
        offer_title = offer.find_elements(By.CLASS_NAME, 'title')
        # find_elements returns an empty list (never None) when nothing matches.
        if not offer_title:
            return False
        reference, title = self._parse_offer_header(offer_title[0].text)
        if not reference:
            return False

        # Check if the offer already exists in the board.
        # Only a 400 response is treated as "not stored yet", as before.
        response_get = client.job.storing.get(board_key=BOARD_KEY, reference=reference)
        code = response_get['code']
        if code == 200:
            print(f"Offer {reference} already exists")
            return False
        if code != 400:
            print(f"Offer {reference} skipped: unexpected lookup response code {code}")
            return False

        try:
            description = offer.find_element(By.CLASS_NAME, 'description').text
            location = offer.find_element(By.CLASS_NAME, 'title-complementary')
        except NoSuchElementException as e:
            print("Error NoSuchElement :\n", str(e))
            return False
        location_text = self._text_at(location.find_elements(By.TAG_NAME, 'span'), 0)
        url = driver.current_url
        # The <dd> entries are read by position: job type, weekly hours, salary.
        details = offer.find_elements(By.TAG_NAME, 'dd')
        job_type = self._text_at(details, 0)
        weekly_working_hours = self._text_at(details, 1)
        salary_expectation = self._text_at(details, 2)
        media_bodies = offer.find_elements(By.CLASS_NAME, 'media-body')
        company = media_bodies[0].text.split("\n")[0] if media_bodies else None

        lat, lng = self._geocode(location_text, geocode_cache)

        # e.g. ['Expérience', 'Formation', 'Compétences', 'Permis', 'Savoir-être professionnels']
        section_titles = [
            subtitle.text for subtitle in offer.find_elements(By.CLASS_NAME, 'skill-subtitle')
        ]
        try:
            industry = offer.find_element(By.XPATH, '//span[@itemprop="industry"]').text
        except NoSuchElementException:
            industry = None

        job_json = {
            "key": key,
            "reference": reference,
            "name": title,
            "location": {
                "text": location_text,
                "lat": lat,
                "lng": lng
            },
            "sections": [],
            # The first 'description' element feeds both fields, as before.
            "responsibilities": description,
            "requirements": self._build_requirements(offer, section_titles),
            "url": url,
            "summary": description,
            "skills": self._build_skills(offer, section_titles),
            "tags": [
                {"name": "salary_expectation", "value": salary_expectation},
                {"name": "company", "value": company},
                {"name": "industry", "value": industry},
                {"name": "weekly_working_hours", "value": weekly_working_hours},
                {"name": "job_type", "value": job_type}
            ]
        }
        response = client.job.storing.add_json(board_key=BOARD_KEY, job_json=job_json)
        print(response)
        return response['code'] == 201

    def scrape(self):
        """Crawl categories -> job names -> offers and store each new offer.

        Job keys are consecutive integers (as strings) starting at the number
        of jobs the board reports before the crawl. Any unexpected error is
        printed and ends the crawl; the browser is always quit afterwards.
        """
        driver = self.driver
        try:
            wait = WebDriverWait(driver, 10)
            response = client.job.storing.list(board_keys=[BOARD_KEY])
            next_key = response['meta']['total']
            geocode_cache = {}

            # Wait for the landing page to load
            wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "container-list")))
            categories_obj = driver.find_element(By.CLASS_NAME, 'container-list')
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, 'a')))
            category_urls = self._hrefs(categories_obj.find_elements(By.TAG_NAME, 'a'))

            # CATEGORY
            for category_url in category_urls:
                driver.get(category_url)
                containers = driver.find_elements(By.CLASS_NAME, 'container-list')
                # A category page without a list is skipped instead of aborting the crawl.
                if not containers:
                    continue
                job_urls = self._hrefs(containers[0].find_elements(By.TAG_NAME, 'a'))

                # JOB NAME
                for job_url in job_urls:
                    driver.get(job_url)
                    # Check if there is a job offer
                    result_lists = driver.find_elements(By.CLASS_NAME, 'result-list')
                    if not result_lists:
                        continue
                    offer_urls = self._hrefs(result_lists[0].find_elements(By.CLASS_NAME, 'media'))

                    # JOB OFFER
                    for offer_url in offer_urls:
                        driver.get(offer_url)
                        if self._store_current_offer(str(next_key), geocode_cache):
                            next_key += 1
        except Exception as e:
            print("Error :\n", str(e))
        finally:
            # Always release the Chrome process, even when the crawl fails.
            driver.quit()

In [None]:
# Creating the instance starts a headless Chrome session and opens the
# France Travail offers page (see FranceTravail.__init__).
FT = FranceTravail()
# Start scraping: walks categories -> job names -> offers and adds each offer
# that is not already on the board identified by BOARD_KEY.
FT.scrape()

In [None]:
# WARNING: destructive cell. clear_board() archives the jobs listed on the
# board identified by BOARD_KEY, including jobs stored by the scrape cell, so
# run it only when the board should be emptied.
# A new instance is created here, which starts its own headless Chrome session.
FT = FranceTravail()
# Clear the board (this also quits the browser session)
FT.clear_board()