In [17]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [19]:
# 1. Import des librairies
import re
import spacy
import pandas as pd

# Charger le modèle de langue français ou anglais selon ton corpus
# ("fr_core_news_md" ou "en_core_web_md")
nlp = spacy.load("en_core_web_md")

In [25]:
# 2. Import de dataset

from google.colab import drive
drive.mount('/content/drive')
job_description = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/job_des/job_title_des.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# 3. Prétraitement du texte and 4. Fonctions d'extraction

def extract_title(text):
    match = re.search(r"(?i)(Data Scientist|Engineer|Developer|Manager|Analyst)", text)
    return match.group(0) if match else None


def extract_company(text):
    match = re.search(r"at ([A-Z][A-Za-z0-9& ]+)", text)
    return match.group(1).strip() if match else None


def extract_location(text):
    match = re.search(r"in ([A-Z][a-zA-Z ]+)", text)
    return match.group(1).strip() if match else None


def extract_contract(text):
    match = re.search(r"(CDI|CDD|Internship|Stage|Freelance)", text, re.IGNORECASE)
    return match.group(1).upper() if match else None


def extract_skills(text):
    skill_list = ["Python", "SQL", "Machine Learning", "Deep Learning", "NLP", "Java", "C++"]
    found = [skill for skill in skill_list if re.search(skill, text, re.IGNORECASE)]
    return found


def extract_experience(text):
    match = re.search(r"(\d+\s+years?|\d+ ans)", text)
    return match.group(1) if match else None


def extract_education(text):
    match = re.search(r"(Bachelor|Master|PhD|Bac\+\d)", text, re.IGNORECASE)
    return match.group(1) if match else None


def extract_responsibilities(text):
    lines = text.split("\n")
    responsibilities = []
    capture = False
    for line in lines:
        if "Responsibilities" in line:
            capture = True
            continue
        if capture:
            if line.strip() == "" or re.search(r"Requirements", line):
                break
            responsibilities.append(line.strip("- "))
    return responsibilities


In [22]:
# Traitement de chaque job description
parsed_jobs_list = []
for _, row in job_description.iterrows():
    description = str(row['Job Description'])
    parsed_job = {
        "Titre": extract_title(description),
        "Entreprise": extract_company(description),
        "Lieu": extract_location(description),
        "Contrat": extract_contract(description),
        "Compétences": extract_skills(description),
        "Expérience": extract_experience(description),
        "Niveau": extract_education(description),
        "Missions": extract_responsibilities(description)
    }
    parsed_jobs_list.append(parsed_job)

# Conversion en DataFrame final
parsed_jobs_df = pd.DataFrame(parsed_jobs_list)

# Affichage des 5 premières lignes
print(parsed_jobs_df.head())

# Optionnel : sauvegarde
parsed_jobs_df.to_csv("job_descriptions_parsed.csv", index=False)
print("✅ Extraction terminée et sauvegardée dans job_descriptions_parsed.csv")

            Titre Entreprise             Lieu Contrat  \
0       developer       None             None    None   
1       Developer       None  API development    None   
2  Data Scientist       None  India Bangalore    None   
3        engineer       None             None    None   
4        engineer       None             None    None   

                                         Compétences Expérience Niveau  \
0                                              [C++]     1 year   None   
1                                 [Python, SQL, C++]       None   None   
2  [Python, Machine Learning, Deep Learning, Java...    3 years   None   
3                                              [C++]       None   None   
4                                        [Java, C++]       None   None   

  Missions  
0       []  
1       []  
2       []  
3       []  
4       []  
✅ Extraction terminée et sauvegardée dans job_descriptions_parsed.csv


In [23]:
# 7. Sauvegarde en fichier JSON
import json
with open("parsed_job.json", "w", encoding="utf-8") as f:
    json.dump(parsed_job, f, ensure_ascii=False, indent=4)


print("✅ Résultats enregistrés dans parsed_job.json")

✅ Résultats enregistrés dans parsed_job.json


In [24]:
# 8. Télécharger le fichier JSON
from google.colab import files
files.download("parsed_job.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>