In [2]:
import html
import json
import math
import os

import openai
import requests
import tqdm
from bs4 import BeautifulSoup

# The OpenAI API key is read from the OPENAI_API_KEY environment variable so that
# it is never stored in the notebook. The value is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")


## Find job ids

In [3]:
# Build the LinkedIn guest job-search url from a dictionary of query parameters
def makeUrl(params):
    """
    Build the job-search url for the given query parameters
    :param params: dictionary of query parameters (e.g. keywords, location, geoId)
    :return: url string ending with "&", so that one more "key=value" pair
             (e.g. "start=25") can be appended directly
    """
    from urllib.parse import quote

    url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?"
    for key, value in params.items():
        if isinstance(value, str):
            # Percent-encode the whole value (not only spaces) so that characters
            # such as "&", "#" or "+" cannot corrupt the query string
            value = quote(value, safe="")
        url += f"{key}={value}&"
    return url

In [4]:
# Loop over the result pages to collect the LinkedIn job ids
def get_job_ids(params, max_items, save_path):
    """
    Save job ids to a json file
    :param params: dictionary of parameters (keywords, location, geoId)
    :param max_items: maximum number of items to retrieve
    :param save_path: path to save the json file
    """

    job_id_list = []
    url = makeUrl(params)
    start = 0
    while start < max_items:
        try:
            # A timeout prevents a stalled connection from blocking the notebook forever
            res = requests.get(url + f"start={start}", timeout=30)
        except requests.RequestException as e:
            print(f"Request failed at start={start}: {e}")
            break

        soup = BeautifulSoup(res.text, 'html.parser')
        alljobs_on_this_page = soup.find_all("li")
        print(start, ':', len(alljobs_on_this_page))

        page_job_ids = []
        for item in alljobs_on_this_page:
            try:
                # The job id is the 4th ":"-separated field of the card's data-entity-urn attribute
                jobid = item.find(
                    "div", {"class": "base-card"}).get('data-entity-urn').split(":")[3]
            except (AttributeError, IndexError):
                # Not a job card (missing div/attribute or unexpected urn): ignore this item
                continue
            page_job_ids.append(jobid)

        # A page without any job id means there is nothing left to fetch:
        # stop here instead of requesting every remaining offset up to max_items
        if not page_job_ids:
            break
        job_id_list.extend(page_job_ids)
        start += 25

    # Remove duplicates while keeping the order in which the ids were found
    job_id_list = list(dict.fromkeys(job_id_list))
    print(f"Total job ids: {len(job_id_list)}")
    json_doc = {
        # .get() avoids losing the scraped ids on a missing optional search parameter
        "keywords": params.get("keywords"),
        "location": params.get("location"),
        "geoId": params.get("geoId"),
        "job_id_list": job_id_list
    }
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(json_doc, f, ensure_ascii=False)

### Save the job ids in a json file

In [5]:
# Search criteria passed as query parameters to the job-search endpoint
params = dict(
    keywords="developpeur mobile",
    location="France",
    geoId="105015875",
)

# Display the search url, then collect up to 1000 job ids and save them to disk
search_url = makeUrl(params)
print(search_url)
get_job_ids(params=params, max_items=1000, save_path="scrapped_data/job_ids_2.json")

https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=developpeur%20mobile&location=France&geoId=105015875&


## Retrieve jobs information

In [7]:
# Retrieve the information of one LinkedIn job posting as a dictionary
def get_job_information(job_id):
    """
    Get job information from job id
    :param job_id: job id
    :return: dictionary of job information: "company", "job-title", one entry per
             item of the job criteria list found on the page, and "description".
             "company", "job-title" and "description" are None when not found.
    :raises requests.RequestException: if the request fails or times out
    """

    url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    job_info = {}
    # A timeout prevents a stalled connection from blocking the whole scraping loop
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'html.parser')

    # find() returns None when an element is missing, so chained lookups raise
    # AttributeError; only that error is treated as "field not available"

    # company name
    try:
        job_info["company"] = soup.find(
            "div", {"class": "top-card-layout__card"}).find("a").find("img").get('alt')
    except AttributeError:
        job_info["company"] = None

    # job title
    try:
        job_info["job-title"] = soup.find(
            "div", {"class": "top-card-layout__entity-info"}).find("a").text.strip()
    except AttributeError:
        job_info["job-title"] = None

    # job criteria: each list item provides a heading (key) and a value
    criteria_list = soup.find("ul", {"class": "description__job-criteria-list"})
    if criteria_list is not None:
        for item in criteria_list.find_all("li"):
            try:
                key = item.find("h3").text.strip()
                value = item.find("span").text.strip()
            except AttributeError:
                # incomplete item: skip it and keep the other criteria
                continue
            job_info[key] = value

    # job description
    try:
        job_info["description"] = soup.find(
            "div", {"class": "show-more-less-html__markup"}).text.strip()
    except AttributeError:
        job_info["description"] = None

    return job_info

In [8]:
# Check whether a job posting is relevant for our search (coarse keyword filter)
def is_job_relevant(job_info):
    """
    Check if job is relevant
    :param job_info: dictionary of job information (must contain "job-title" and "description")
    :return: True if the title or the description contains one of the mobile
             development keywords (case-insensitive), False otherwise
    """

    if job_info["job-title"] is None or job_info["description"] is None:
        return False

    # A leading space in a keyword requires the word to start after whitespace,
    # which avoids matching e.g. "scenarios" for "ios" or "alternative" for "native"
    keywords = (' natif', ' native', ' hybrid', ' android', ' ios', 'kotlin', 'swift',
                'flutter', 'react-native', 'react native', 'xamarin', 'ionic', 'java',
                'objective-c', 'swiftui')

    # Lower-case once, collapse every whitespace run (newlines, tabs) to a single
    # space and add a leading space, so that space-prefixed keywords also match at
    # the very beginning of the text or right after a line break
    searchable_texts = [
        " " + " ".join(text.lower().split())
        for text in (job_info["description"], job_info["job-title"])
    ]
    return any(key in text for key in keywords for text in searchable_texts)

def checkpoint_save(job_info_dict, save_path):
    """
    Merge job information into the json file at save_path
    :param job_info_dict: dictionary of job information to add (overrides existing keys)
    :param save_path: path of the json file; created if it does not exist yet
    """
    # Start from what is already on disk, or from nothing on the first save
    merged = {}
    try:
        with open(save_path, 'r', encoding='utf-8') as existing_file:
            merged = json.load(existing_file)
    except FileNotFoundError:
        pass

    merged.update(job_info_dict)

    # Rewrite the whole file with the merged content
    with open(save_path, 'w', encoding='utf-8') as output_file:
        json.dump(merged, output_file, ensure_ascii=False)


def get_all_jobs_information(job_ids, save_path, current_data=None, save_every=10, except_if_exists=True):
    """
    Loop over job ids and get job information
    Jobs that do not pass is_job_relevant() are stored as None.
    :param job_ids: list of job ids
    :param save_path: path to save the json file
    :param current_data: dictionary of current job information (defaults to an empty dictionary)
    :param save_every: save once this many new items are pending
    :param except_if_exists: skip job ids that already have a non-null entry in current data
    """
    # None instead of a mutable {} default, which would be shared between calls
    if current_data is None:
        current_data = {}

    job_info_dict = {}
    for job_id in tqdm.tqdm(job_ids):
        if except_if_exists and current_data.get(job_id) is not None:
            continue
        try:
            job_info = get_job_information(job_id)
            job_info_dict[job_id] = job_info if is_job_relevant(job_info) else None
            # Checkpoint on the number of pending items rather than on the loop
            # index: an index-based test is missed whenever the matching item is
            # skipped or fails, which can leave a lot of work unsaved
            if job_info_dict and len(job_info_dict) >= save_every:
                checkpoint_save(job_info_dict, save_path)
                job_info_dict.clear()
        except Exception as e:
            print(f"Error processing job ID {job_id}: {e}")
    # Save whatever is left after the last checkpoint
    if job_info_dict:
        checkpoint_save(job_info_dict, save_path)


### Retrieve all job information and save it in a json file

In [9]:
# Resume from the previous run when the output file already exists.
# Only a missing file means "start from scratch": a corrupt file must be fixed
# first, otherwise every later checkpoint save would fail on it.
try:
    with open("scrapped_data/jobs_data_1.json", 'r', encoding='utf-8') as f:
        current_data = json.load(f)
except FileNotFoundError:
    current_data = {}

# list of all retrieved job ids
job_ids = []
for job_ids_path in ("scrapped_data/job_ids_1.json", "scrapped_data/job_ids_2.json", "scrapped_data/job_ids_3.json"):
    with open(job_ids_path, 'r', encoding='utf-8') as f:
        job_ids += json.load(f).get("job_id_list")
# The same job can be returned by several searches: fetch each id only once
job_ids = list(dict.fromkeys(job_ids))

get_all_jobs_information(job_ids, 'scrapped_data/jobs_data_1.json', current_data, save_every=10, except_if_exists=True)

## Filter jobs and extract technologies

In [13]:
import tqdm
import openai


def generate_chatgpt_response(job_item):
    """
    Ask the chat model to classify a job posting
    :param job_item: job information; it is converted with str() and sent as the last user message
    :return: text content of the first choice returned by the model
    """
    # Instructions (in French) describing the classification task and the answer format
    instructions = """Bonjour. je vais te donner des offres d'emploi linkedin de développement mobile. 
            Je veux que tu me dises s'il s'agit d'une offre pour du développement mobile natif (android ou ios), 
            sur framework hybride (ex flutter, react-native), si c'est du no-code (flutterflow, adalo, etc...), 
            si ce n'est pas précisé, 
            si plusieurs technologies sont possibles (natif ou hybride ou no code)
            ou si c'est une offre qui ne concerne pas le développement mobile.
            Ta réponse doit être au format suivant: {type_developpement}: {langages_et_frameworks_de_l'offre}
            ou {type_developpement} est parmis ces mots: hybrid, native, nocode, unspecified, multiple, irrelevant.
            et {langages_et_frameworks_de_l'offre} est une liste de langage ou frameworks de développement mobile UNIQUEMENT
            à utiliser dans l'offre. (ex java, kotlin, swift, swiftui, flutter, react-native, xamarin, ionic, cordova, flutterflow...)
            mais pas de PHP, python, C#, C++, etc...
            Fait attention à bien classer ce qui n'est pas du développement mobile (développement web et logiciel) comme irrelevant.
            Voici trois exemples de réponses:
            \n hybrid: flutter, react-native
            \n native: kotlin, swift
            \n irrelevant
            """
    # Conversation: instructions, a canned acknowledgement, then the job posting itself
    conversation = [
        {"role": "user", "content": instructions},
        {"role": "assistant",
         "content": "D'accord, je suis prêt. Vas-y, envoie-moi les offres d'emploi."},
        {"role": "user", "content": str(job_item)},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    return completion.choices[0].message.content


def get_all_job_type(jobs_data, save_path, current_data=None, save_every=10, except_if_exists=True):
    """
    Loop over job ids, extract job type and save to a json file
    The model's answer is stored under the 'type' key of each job dictionary
    (the dictionaries of jobs_data are modified in place).
    :param jobs_data: dictionary of job information
    :param save_path: path to save the json file
    :param current_data: dictionary of current job information (defaults to an empty dictionary)
    :param save_every: save once this many new items are pending
    :param except_if_exists: skip job ids that already have a non-null entry in current data
    """
    # None instead of a mutable {} default, which would be shared between calls
    if current_data is None:
        current_data = {}

    job_type_dict = {}
    for job_id, job_data in tqdm.tqdm(jobs_data.items()):
        if except_if_exists and current_data.get(job_id) is not None:
            continue
        try:
            # The (job_id, job_data) pair is sent to the model, as a string
            job_data['type'] = generate_chatgpt_response((job_id, job_data))
            job_type_dict[job_id] = job_data
        except Exception as e:
            print(f"Error processing job ID {job_id}: {e}")
        # Checkpoint on the number of pending items rather than on the loop index:
        # an index-based test is missed whenever the matching item is skipped, and
        # it rewrites the file for nothing when no new item is pending
        if job_type_dict and len(job_type_dict) >= save_every:
            checkpoint_save(job_type_dict, save_path)
            job_type_dict.clear()
    # Save whatever is left after the last checkpoint
    if job_type_dict:
        checkpoint_save(job_type_dict, save_path)

### Remove null from data 
In the previous step, we extracted the data from the job postings. The null values correspond to the jobs whose title or description was not available, or that did not pass the coarse keyword filter.

We kept the null values because we wanted to remember the job ids that we could not retrieve the job description from, in case we had to loop over the data again. The algorithm has a parameter `except_if_exists` that, when set to True, checks whether the job id is already in the data. If it is, with a non-null value, we skip the job id.


In [None]:
import json

# Load every scraped entry, including the null placeholders
with open('scrapped_data/jobs_data_1.json', encoding='utf-8') as raw_file:
    data = json.load(raw_file)

# Keep only the entries that hold actual job information
filtered_data = {job_id: job for job_id, job in data.items() if job is not None}

# Report how many entries were dropped, then save the remaining ones
print(f'Original data: {len(data)}')
print(f'Filtered data: {len(filtered_data)}')
with open('scrapped_data/jobs_data_1_filtered.json', 'w', encoding='utf-8') as filtered_file:
    json.dump(filtered_data, filtered_file, ensure_ascii=False, indent=4)

### Filter the data and Extract information with OpenAI API

In [14]:
import json

with open("scrapped_data/jobs_data_1_filtered.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

save_path = 'scrapped_data/data_gpt_2.json'

# Resume from the previous run when the output file already exists.
# Only a missing file means "start from scratch": a corrupt file must be fixed
# first, otherwise the first checkpoint save would fail after API calls were spent.
try:
    with open(save_path, 'r', encoding='utf-8') as f:
        current_data = json.load(f)
except FileNotFoundError:
    current_data = {}

get_all_job_type(data, save_path, current_data, save_every=10, except_if_exists=True)

  0%|          | 0/800 [00:00<?, ?it/s]

100%|██████████| 800/800 [10:55<00:00,  1.22it/s] 


## Format the data to the desired format 

In [None]:
import json
with open('scrapped_data/data_gpt_2.json', encoding='utf-8') as f:
    datagpt = json.load(f)

# Keywords used to detect each framework/language in the model's answer
keywords_dict = {
    'flutter': ['flutter', 'dart'],
    'react-native': ['react native', 'react-native', 'reactnative'],
    'xamarin': ['xamarin'],
    'ionic': ['ionic'],
    'cordova': ['cordova'],
    'kotlin': ['kotlin'],
    'swift': ['swift'],
    'objective-c': ['objective-c', 'objective c', 'objective_c', 'objectivec'],
    # "java" must be followed by a separator so that "javascript" is not counted
    'java': ['java.', 'java ', 'java,', 'java;', 'java:', 'java(', 'java)', 'java?', 'java!', 'java-', 'java_', 'java*', 'java/', 'java\\'],
}

# Jobs that are not classified as irrelevant
filtered_datagpt = {}
# Per development type: number of jobs ('count') and number of jobs mentioning each framework
types = {}
for key, value in datagpt.items():
    # Expected answer format: "{type}: {frameworks}" (the frameworks part is optional)
    answer_parts = value['type'].split(':', 1)
    # Normalize the label so that e.g. "Hybrid" and " hybrid" are counted together
    item_type = answer_parts[0].strip().lower()
    item_frameworks = answer_parts[1] if len(answer_parts) > 1 else ""

    type_stats = types.setdefault(item_type, {'count': 0})
    type_stats['count'] += 1

    # The same normalized label drives both the filter and the counters
    if item_type == 'irrelevant':
        continue
    filtered_datagpt[key] = value

    # Collapse whitespace and add a trailing space so that a keyword expecting a
    # separator (e.g. 'java ') also matches when it is the last word of the answer
    frameworks_text = " ".join(item_frameworks.lower().split()) + " "
    for framework, keywords in keywords_dict.items():
        # Count each framework at most once per job
        if any(keyword in frameworks_text for keyword in keywords):
            type_stats[framework] = type_stats.get(framework, 0) + 1

print(f'Original data: {len(datagpt)}')
print(f'Filtered data: {len(filtered_datagpt)}')
print(types)

with open('scrapped_data/types_2.json', 'w', encoding='utf-8') as f:
    json.dump(types, f, ensure_ascii=False, indent=4)