In [1]:
import pandas as pd
import numpy as np
import json
from datasketch import MinHash, MinHashLSH
from nltk import ngrams

In [2]:
# Load the most recent Datafiller export, expose the unnamed first column as `idx`,
# then trim and lower-case the two text columns so they can be compared verbatim.
df_content = pd.read_csv("../log-analysis/df_suggestion_qualification-1712.csv").rename(
    columns={'Unnamed: 0': 'idx'}
)
for text_column in ("Candidat", "Entrée"):
    # str() runs first, so non-string cells are normalised too instead of raising.
    df_content[text_column] = df_content[text_column].apply(
        lambda raw: str(raw).strip().lower()
    )

In [3]:
# Load the candidate file and expose the unnamed first column as `idx`.
candidates_content = pd.read_csv("jrv.csv").rename(columns={'Unnamed: 0': 'idx'})

# Rows without an idx are discarded before the column is cast to int.
candidates_content.dropna(subset=['idx'], inplace=True)
candidates_content['idx'] = candidates_content['idx'].astype(int)

# Same text normalisation as for the latest content: stringify, trim, lower-case.
for text_column in ("Candidat", "Entrée"):
    candidates_content[text_column] = candidates_content[text_column].apply(
        lambda raw: str(raw).strip().lower()
    )

In [4]:
# Keep only the candidate rows whose "Conserver" value is strictly positive.
candidates_to_keep = candidates_content.loc[candidates_content["Conserver"].gt(0)]

In [5]:
def exists_in_latest(entry):
    """Tell whether a candidate's text is present in the latest Datafiller content.

    Args:
        entry: A row-like mapping (e.g. a DataFrame row) exposing a 'Candidat' key.

    Returns:
        bool: True when at least one row of the module-level ``df_content`` has a
        'Candidat' value equal to ``entry['Candidat']``, False otherwise.
    """
    candidat = entry['Candidat']

    # A boolean reduction answers the membership question without materialising
    # the filtered DataFrame; bool() keeps the return a plain Python bool.
    return bool((df_content['Candidat'] == candidat).any())

In [6]:
# Row by row, flag whether each kept candidate still appears unchanged in the latest
# Datafiller content (a spelling or typo fix there would make the lookup fail).
candidates_found = candidates_to_keep.apply(exists_in_latest, axis=1)

In [7]:
# Split the kept candidates on the lookup result: rows whose text was not found
# verbatim need correcting, the others can be used as they are.
to_correct = candidates_to_keep[~candidates_found]
no_problem = candidates_to_keep[candidates_found]

In [8]:
# Build an LSH index (threshold 0.8, 128 permutations) over every 'Candidat' of the
# latest content. Keys are the positional row numbers, so a hit can be resolved
# back to its row with `df_content.iloc`.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
minhashes = {}

for position, candidate_text in enumerate(df_content['Candidat']):
    signature = MinHash(num_perm=128)
    # The signature is fed with the character trigrams of the candidate text.
    for trigram in ngrams(candidate_text, 3):
        signature.update("".join(trigram).encode('utf-8'))
    lsh.insert(position, signature)
    minhashes[position] = signature

In [9]:
def search_candidate(doc):
    """Look up the single latest-content candidate that closely matches ``doc``.

    A MinHash signature is built from the character trigrams of ``doc`` and used to
    query the module-level ``lsh`` index, whose keys are positional row numbers of
    ``df_content``.

    Args:
        doc (str): Candidate text to search for.

    Returns:
        The 'Candidat' value of the matching ``df_content`` row when the query
        returns exactly one key; ``np.nan`` when it returns none or several (a
        message naming ``doc`` is printed in both of those cases).
    """
    minhash = MinHash(num_perm=128)
    for trigram in ngrams(doc, 3):
        minhash.update("".join(trigram).encode('utf-8'))

    matches = lsh.query(minhash)

    if len(matches) == 1:
        return df_content.iloc[matches[0]]['Candidat']

    if matches:
        print('Too many results : ' + doc)
    else:
        print('Not found : ' + doc)

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0 and
    # raises AttributeError there.
    return np.nan

In [10]:
# Try to resolve every candidate needing correction to its closest latest-content
# spelling; entries that cannot be resolved unambiguously come back as NaN.
found = to_correct['Candidat'].apply(search_candidate)

Too many results : à combien de congés ai-je droit
Too many results : à combien de congés payés ai-je droit
Too many results : accord rupture conventionnelle collective
Not found : acquisition congés payés pendant arrêt longue maladie
Not found : acquisition congés payés pendant arrêt maladie
Too many results : aménagement temps de travail
Not found : arret maladie puis congés payés
Not found : assistance entretien sanction disciplinaire
Not found : attestation pole emploi délai
Not found : calcul ancienneté et période de suspension
Not found : calcul ancienneté indemnité de licenciement
Not found : calcul ancienneté mois incomplet
Not found : calcul ancienneté passage cdd à cdi
Not found : calcul ancienneté professionnelle
Not found : calcul ancienneté rupture conventionnelle
Not found : calcul ancienneté succession cdd
Not found : calcul congé payé année incomplète
Not found : calcul congé payé et accident du travail
Not found : calcul congé payé et activité partielle
Not found : cal

In [11]:
# Candidates for which the fuzzy search returned a replacement.
corrected_auto = found[found.notna()]

In [12]:
# Rows still needing correction because the fuzzy search returned no replacement.
not_corrected = to_correct.loc[found.isna()]

In [13]:
# Merge the three groups into one Series of candidate texts: unchanged candidates,
# automatically corrected ones, and those left uncorrected, which are kept as-is for now.
# NOTE(review): the original comment mentioned "54" — presumably the number of
# uncorrected entries at the time; confirm.
df_corrected_entities = pd.concat(
    [no_problem["Candidat"], corrected_auto, not_corrected["Candidat"]]
)

In [14]:
# Write the merged candidate Series to "datafiller_suggestions.csv" in the working directory.
# NOTE(review): the stray text captured below this cell looks like the tail of a warning;
# older pandas releases warned that the default `header` of Series.to_csv would change.
# Confirm whether consumers of this file expect a header row and pass `header=` explicitly.
df_corrected_entities.to_csv("datafiller_suggestions.csv")

  """Entry point for launching an IPython kernel.
