# 2. Data preprocessing & Modeling

### Imports

In [1]:
import re
import glob
import json
import pandas as pd
import numpy as np
# import tensorflow as tf
import torch
import evaluate
from datetime import datetime
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TFAutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from datasets import Dataset
from sklearn.model_selection import train_test_split

tqdm.pandas()
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### Load data

In [2]:
SEED = 42
MAX_LEN = None

paths = glob.glob('./pole_emploi/*.json')
results = []

for path in paths:
    with open(path, 'r', encoding='utf-8') as f:
        results += json.load(f)

df = pd.json_normalize(results)

if MAX_LEN:
    df = df.sample(MAX_LEN, random_state=SEED)

print(df.shape)
df.head(3)

(82734, 60)


Unnamed: 0,id,intitule,description,dateCreation,dateActualisation,romeCode,romeLibelle,appellationlibelle,typeContrat,typeContratLibelle,natureContrat,experienceExige,experienceLibelle,competences,dureeTravailLibelle,dureeTravailLibelleConverti,alternance,nombrePostes,accessibleTH,qualificationCode,qualificationLibelle,secteurActivite,secteurActiviteLibelle,qualitesProfessionnelles,offresManqueCandidats,lieuTravail.libelle,lieuTravail.latitude,lieuTravail.longitude,lieuTravail.codePostal,lieuTravail.commune,entreprise.nom,entreprise.description,entreprise.url,entreprise.entrepriseAdaptee,salaire.libelle,contact.nom,contact.coordonnees1,contact.courriel,origineOffre.origine,origineOffre.urlOrigine,entreprise.logo,salaire.commentaire,salaire.complement1,contact.coordonnees2,contact.coordonnees3,agence.courriel,langues,permis,deplacementCode,deplacementLibelle,formations,complementExercice,salaire.complement2,experienceCommentaire,contact.urlPostulation,agence.telephone,contact.telephone,contact.commentaire,conditionExercice,origineOffre.partenaires
0,142ZHCP,Technicien / Technicienne QSE (H/F),"PAPIERS A PAVIOT, société familiale fabriquant...",2022-10-24T15:22:24.000Z,2022-10-24T15:22:25.000Z,H1502,Management et ingénierie qualité industrielle,Adjoint(e) au responsable QSE en industrie,CDI,Contrat à durée indéterminée,Contrat travail,D,Débutant accepté,"[{'code': '114647', 'libelle': 'Analyser les n...",37H Horaires normaux,Temps plein,False,1,False,8,Agent de maîtrise,17,Fabrication d'autres articles en papier ou en ...,"[{'libelle': 'Sens de la communication', 'desc...",False,69 - CORBAS,45.666017,4.909309,69960,69273,PAPIERS A PAVIOT,PME familiale bien implantée sur le marché fab...,http://www.papiers-paviot.fr/entreprise-ressou...,False,"Mensuel de 2100,00 Euros à 2500,00 Euros sur 1...",PAPIERS A PAVIOT - Mme Maud DIEMER,"Pour postuler, utiliser le lien suivant : http...","Pour postuler, utiliser le lien suivant : http...",1,https://candidat.pole-emploi.fr/offres/recherc...,,,,,,,,,,,,,,,,,,,,
1,142ZHCK,Assistant pre-paye (H/F),Vous souhaitez rejoindre une entreprise famili...,2022-10-24T15:22:19.000Z,2022-10-25T08:28:25.000Z,M1501,Assistanat en ressources humaines,Assistant / Assistante de gestion en ressource...,CDI,Contrat à durée indéterminée,Contrat travail,E,6 mois,"[{'code': '100343', 'libelle': 'Législation so...",39H Horaires normaux,Temps plein,False,1,False,6,Employé qualifié,49,Transports routiers de fret interurbains,"[{'libelle': 'Réactivité', 'description': 'Cap...",False,42 - LA FOUILLOUSE,45.501106,4.31567,42480,42097,LTR,Vous souhaitez rejoindre une entreprise famili...,http://www.transport-ltr.com/carrieres/opportu...,False,"Mensuel de 1950,00 Euros à 2600,00 Euros sur 1...",LTR - Mme RECRUTEMENT SERVICE,"Pour postuler, utiliser le lien suivant : http...","Pour postuler, utiliser le lien suivant : http...",1,https://candidat.pole-emploi.fr/offres/recherc...,https://entreprise.pole-emploi.fr/static/img/l...,,,,,,,,,,,,,,,,,,,
2,142ZHCJ,Cuisinier / Cuisinière,Notre établissement recherche 2 Cuisinier(e)s ...,2022-10-24T15:22:18.000Z,2022-10-25T11:37:17.000Z,G1602,Personnel de cuisine,Cuisinier / Cuisinière,CDI,Contrat à durée indéterminée,Contrat travail,E,1 an,"[{'code': '104207', 'libelle': 'Préparer les v...",39H Horaires normaux,Temps plein,False,1,False,6,Employé qualifié,56,Restauration traditionnelle,,False,64 - BIARRITZ,43.47144,-1.555081,64200,64122,OCTOPUS,,,False,,Pôle Emploi BIARRITZ,8 ter RUE BORDE D ANDRE,,1,https://candidat.pole-emploi.fr/offres/recherc...,,1900 à 2000 euros net,Autre,64200 BIARRITZ,"Pour postuler, utiliser le lien suivant : http...","Pour postuler, utiliser le lien suivant : http...",,,,,,,,,,,,,,


### Data preprocessing
Clean nulls, format date attributes, drop duplicated jobs.

In [3]:
dfp = df.copy()

dfp.dropna(subset=['id', 'dateCreation',
                   'dateActualisation', 'description', 'intitule'],
           inplace=True)

for col in ['dateCreation', 'dateActualisation']:
    dfp[col] = pd.to_datetime(dfp[col], format='%Y-%m-%dT%H:%M:%S.%fZ')
    
dfp.sort_values(['dateActualisation'], ascending=False, inplace=True)

dfp.drop_duplicates(subset=['id'], keep='first', inplace=True)
dfp.drop_duplicates(subset=['description'], keep='first', inplace=True)
dfp.drop_duplicates(subset=['intitule'], keep='first', inplace=True)

dfp = dfp[['id', 'dateActualisation', 'romeCode', 'intitule', 'description']] \
         .reset_index(drop=True) \
         .copy()

print(dfp.shape)
dfp.head(3)

(33778, 5)


Unnamed: 0,id,dateActualisation,romeCode,intitule,description
0,142NPSB,2022-10-27 08:06:02,G1603,Employé de laboratoire de préparation de pizza...,Nous recherchons un employé polyvalent (H/F) p...
1,142NTLG,2022-10-27 08:00:35,G1605,Plongeur / Plongeuse en restauration,Vous avez une capacité d'adaptation et un rela...
2,142NPQB,2022-10-27 08:00:11,H2913,soudeur métal sur matériel agricole (H/F),Soudure sur des ensembles en acier non alliés ...


### Data augmentation (+50% incorrect pairs)
Mix correct data with incorrect job-description pairs for future classifier training.

In [4]:
dfa = dfp.copy()

dfa_left = dfa.copy()
dfa_right = dfa.copy()

agg = dfa.groupby(['romeCode', 'intitule']).last().reset_index()[['romeCode', 'intitule']]
rome_job_mapper = agg.set_index('romeCode').stack().groupby('romeCode').apply(list).to_dict()
romes = list(rome_job_mapper.keys())

def find_fake_job(row):
    choices = [x for x in romes if x != row['romeCode']]
    choice = np.random.choice(choices)
    fake_jobs = rome_job_mapper[choice]
    return np.random.choice(fake_jobs)

dfa_right['intitule'] = dfa_right.progress_apply(find_fake_job, axis=1)

dfa_left['label'] = 0
dfa_right['label'] = 2

dfa = pd.concat([dfa_left, dfa_right], ignore_index=True)
print(dfa.shape)
dfa.groupby(['label']).size()

  0%|          | 0/33778 [00:00<?, ?it/s]

(67556, 6)


label
0    33778
2    33778
dtype: int64

### Dataset generation

In [5]:
model_name = "valhalla/distilbart-mnli-12-1"
model_name = "BaptisteDoyen/camembert-base-xnli"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to('cuda')

In [6]:
SEED = 42

dff = dfa.copy()

dff['intitule'] = dff['intitule'].map(lambda x: f'Ce texte est à propos de {x}')
dff['description'] = dff['description'].map(lambda x: x[:925])

ds = Dataset.from_dict(dff[['description', 'intitule', 'label']].to_dict('list'))
ds = ds.shuffle(seed=SEED)
ds = ds.train_test_split(test_size=0.33)

def tokenize_sequence_pairs(row):
    return tokenizer(row['description'],
                     row['intitule'],
                     padding='max_length',
                     truncation=True)

ds = ds.map(tokenize_sequence_pairs)
ds = ds.remove_columns(["description", 'intitule'])

ds

  0%|          | 0/45262 [00:00<?, ?ex/s]

  0%|          | 0/22294 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 45262
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 22294
    })
})

### Model training

In [7]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    "Camembert-Jobs",
    report_to="none",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # gradient_accumulation_steps=4,
    # gradient_checkpointing=True
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    compute_metrics=compute_metrics
)

In [10]:
trainer.train("./Camembert-Jobs/checkpoint-2500/")

Loading model from ./Camembert-Jobs/checkpoint-2500/.
***** Running training *****
  Num examples = 45262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16974
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 2500
  Will skip the first 0 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/2500 [00:00<?, ?it/s]

Step,Training Loss
2600,0.167
2700,0.2206
2800,0.2368
2900,0.2117
3000,0.2114
3100,0.1774
3200,0.2118
3300,0.1556
3400,0.1889
3500,0.1897


Saving model checkpoint to Camembert-Jobs\checkpoint-3000
Configuration saved in Camembert-Jobs\checkpoint-3000\config.json
Model weights saved in Camembert-Jobs\checkpoint-3000\pytorch_model.bin
Saving model checkpoint to Camembert-Jobs\checkpoint-3500
Configuration saved in Camembert-Jobs\checkpoint-3500\config.json
Model weights saved in Camembert-Jobs\checkpoint-3500\pytorch_model.bin
Saving model checkpoint to Camembert-Jobs\checkpoint-4000
Configuration saved in Camembert-Jobs\checkpoint-4000\config.json
Model weights saved in Camembert-Jobs\checkpoint-4000\pytorch_model.bin
Saving model checkpoint to Camembert-Jobs\checkpoint-4500
Configuration saved in Camembert-Jobs\checkpoint-4500\config.json
Model weights saved in Camembert-Jobs\checkpoint-4500\pytorch_model.bin
Saving model checkpoint to Camembert-Jobs\checkpoint-5000
Configuration saved in Camembert-Jobs\checkpoint-5000\config.json
Model weights saved in Camembert-Jobs\checkpoint-5000\pytorch_model.bin
Saving model checkpo

TrainOutput(global_step=16974, training_loss=0.09446865017512135, metrics={'train_runtime': 26708.8874, 'train_samples_per_second': 5.084, 'train_steps_per_second': 0.636, 'total_flos': 3.600286140139315e+16, 'train_loss': 0.09446865017512135, 'epoch': 3.0})

### Model evaluation

In [11]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 22294
  Batch size = 8


{'eval_loss': 0.18959441781044006,
 'eval_accuracy': 0.966179241051404,
 'eval_runtime': 748.0063,
 'eval_samples_per_second': 29.805,
 'eval_steps_per_second': 3.726,
 'epoch': 3.0}

In [12]:
trainer.save_model('./Camembert-Jobs/final')

Saving model checkpoint to ./Camembert-Jobs/final
Configuration saved in ./Camembert-Jobs/final\config.json
Model weights saved in ./Camembert-Jobs/final\pytorch_model.bin


In [13]:
best_model = AutoModelForSequenceClassification.from_pretrained('./Camembert-Jobs/final/')

loading configuration file ./Camembert-Jobs/final/config.json
Model config CamembertConfig {
  "_name_or_path": "./Camembert-Jobs/final/",
  "architectures": [
    "CamembertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "camembert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1"

### Using the model through a zero shot classification pipeline

In [52]:
# job = "Vous aurez pour: mission d'intervenir au sein du domicile des bénéficiaires pour les accompagner dans les actes essentiels de leur vie quotidienne et leur offrir un soutien social leur permettant un maintien à domicile. \n\nMissions :\n-\tEntretenir le cadre de vie du bénéficiaire \n-\tFaire les courses, préparer et aider à la prise de repas\n-\tAccompagner dans les activités de loisirs et de la vie sociale.\n\nL'humain étant au cœur de nos préoccupations, nous vous garantissons les avantages suivants :  \n-\tUn planning co-construit et adapté \n-\tUne sectorisation près de chez vous \n-\tDes compléments de rémunération : prise en charge +50% titre de transport, +25% majoration dimanches et jours fériés.\n-\tUne mutuelle d'entreprise \n-\tUn dispositif de pourcentage patronal à l'action logement  \n-\tUn parcours d'intégration personnalisé"
zsc = pipeline('zero-shot-classification', model=best_model, tokenizer=tokenizer)
weighted_scores = zsc(
    job,
    candidate_labels=['ingénieur cybersécurité', 'data scientist', 'développeur web', 'femme de ménage', 'big data', 'cybersécurité', 'intelligence des données'],
    hypothesis_template='Ce texte est à propose de {}')
scores = zsc(
    job,
    candidate_labels=['ingénieur cybersécurité', 'data scientist', 'développeur web', 'femme de ménage', 'big data', 'cybersécurité', 'intelligence des données'],
    hypothesis_template='Ce texte est à propose de {}',
    multi_label=True)

del weighted_scores['sequence']
del scores['sequence']

print('Weighted scores')
for k, v in zip(weighted_scores['labels'], weighted_scores['scores']):
    print(f'   - {k}: {v:.2f}')

print('\nIndividual scores')
for k, v in zip(scores['labels'], scores['scores']):
    print(f'   - {k}: {v:.2f}')

Weighted scores
   - cybersécurité: 0.31
   - ingénieur cybersécurité: 0.30
   - big data: 0.23
   - data scientist: 0.16
   - intelligence des données: 0.00
   - développeur web: 0.00
   - femme de ménage: 0.00

Individual scores
   - ingénieur cybersécurité: 1.00
   - cybersécurité: 1.00
   - big data: 1.00
   - data scientist: 0.99
   - intelligence des données: 0.01
   - développeur web: 0.00
   - femme de ménage: 0.00
