In [1]:
from platform import python_version
print(python_version())

3.6.8


# Imports

In [261]:
import inspect
import logging
import os
import sys
import io
import re
import pandas as pd
import numpy as np
import nltk
import yaml
import dateparser

from matplotlib import pyplot as plt
import matplotlib.cm as cm

# PDFMiner
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

# Collections
import collections
from collections import Counter

# NLTK
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.cluster import KMeansClusterer, euclidean_distance

# Gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models import word2vec
import gensim

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Spacy
import spacy
nlp = spacy.load('fr_core_news_sm')
# TODO: look into the presence of English words in the documents

# Import for progress bar
from tqdm import tqdm


# Put the parent of the current directory at the front of sys.path.
# NOTE(review): inside a notebook cell inspect.getfile() presumably returns a
# pseudo file name rather than a real path, in which case `currentdir` resolves
# to the kernel's working directory — confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

# Vars & Paths

In [262]:
# Input folders: tenders (AO) and résumés (DC), both expected to contain PDFs.
AO_directory = "data/input/AO"
DC_directory = "data/input/DC"

# Output folders / file names used by save().
AO_output_directory = "data/output/AO"
AO_save_file = "AO_save_file.csv"

DC_output_directory = "data/output/DC"
DC_save_file = "DC_save_file.csv"

# Skill-extractor configuration. safe_load avoids the "no Loader" warning/error
# of yaml.load and cannot build arbitrary Python objects; the context manager
# makes sure the file handle is closed.
confs_path = "confs/config.yaml"
with open(confs_path, encoding='utf8') as confs_file:
    confs = yaml.safe_load(confs_file)

  # This is added back by InteractiveShellApp.init_path()


In [263]:
# --- Token-normalisation switches (read by the tokenising helpers) ---
isLower = True                      # lowercase every token
isLemmatize = False                 # replace every token by its lemma
isDeleteFrenchStopwords = False     # drop French stopwords
isDeleteEnglishStopwords = False    # drop English stopwords

# --- t-SNE perplexity used by tsne_plot ---
perplexity = 30

# --- Value grids iterated by the "All cases" cell ---
possibilities = [False, True]
perplexityPossibilities = [30, 50]

In [264]:
# Get a dict of all skills (and alias)

def skills_alias_dict():
    """Build a mapping from each skill name to its list of aliases.

    Walks every extractor listed under ``confs['extractors']`` (module-level
    configuration). Each configured item is either a string (the skill is its
    own single alias) or a non-empty list whose first element is the skill
    name and whose full content is the alias list. Anything else is reported
    with a warning and skipped.

    Returns:
        dict: ``{skill_name: [alias, ...]}``; when the same name appears under
        several extractors the last one wins.
    """
    alias_by_skill = dict()

    for extractor, items_of_interest in confs['extractors'].items():
        for skill_input in items_of_interest:

            # List input: first element is the canonical name
            if isinstance(skill_input, list) and len(skill_input) >= 1:
                alias_by_skill[skill_input[0]] = skill_input

            # String input: the skill is its own alias
            elif isinstance(skill_input, str):
                alias_by_skill[skill_input] = [skill_input]
            else:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(
                    'Unknown skill listing type: {}. Please format as either a single string or a list of strings'
                    .format(skill_input))

    return alias_by_skill


# NOTE: this rebinds the name to the resulting dict, so the function can no
# longer be called until the cell is run again.
skills_alias_dict = skills_alias_dict()

# Extract & Convert from PDF

In [265]:
def convert_pdf(path):
    """Extract the text of every page of a PDF file with PDFMiner.

    Args:
        path: path of the PDF file to read.

    Returns:
        str: the text produced by the TextConverter for all pages.

    The converter device and the in-memory buffer are released in a
    ``finally`` clause so they are also closed when a page fails to parse;
    the file itself is closed by the ``with`` statement.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())

    try:
        with open(path, "rb") as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Empty page set + maxpages=0 are passed so that no page filter is applied
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)

        # Read the buffer before it gets closed below
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()



def extract(directory):
    """List the PDF files under ``directory`` and extract their text.

    Args:
        directory: root folder, walked recursively.

    Returns:
        pandas.DataFrame: one row per PDF with the columns ``file_path``,
        ``extension`` and ``text`` (output of ``convert_pdf``).

    Files whose extension is not ``.pdf`` (case-insensitive) are skipped with
    a warning instead of being handed to the PDF converter.
    """
    pdf_paths = list()
    skipped_paths = list()

    # Create list of candidate files
    for root, subdirs, files in os.walk(directory):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if os.path.splitext(file_path)[1].lower() == '.pdf':
                pdf_paths.append(file_path)
            else:
                skipped_paths.append(file_path)

    if skipped_paths:
        logging.warning('Skipping non-PDF files: %s', skipped_paths)

    # Convert list to a pandas DataFrame
    observations = pd.DataFrame(data=pdf_paths, columns=['file_path'])
    observations['extension'] = observations['file_path'].apply(lambda x: os.path.splitext(x)[1])
    observations['text'] = observations['file_path'].apply(convert_pdf)

    return observations


In [294]:
# Read every file of both input folders into a DataFrame
# (columns: file_path, extension, text).
AO_observations = extract(AO_directory)
DC_observations = extract(DC_directory)

# Save Extracted & Converted files into CSV

In [206]:
def save(observations, output_directory, save_file):
    """Write ``observations`` as CSV to ``output_directory/save_file``.

    Args:
        observations: pandas.DataFrame to export.
        output_directory: destination folder; created when it does not exist.
        save_file: name of the CSV file.

    Returns:
        None. The DataFrame index is written under the column label ``index``.
    """
    # to_csv does not create missing folders; an empty directory means "cwd"
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    output_path = os.path.join(output_directory, save_file)
    observations.to_csv(path_or_buf=output_path, index_label='index')

    return

In [207]:
# Only the DC extraction is exported; the AO export is currently disabled.
#save(AO_observations, AO_output_directory, AO_save_file)
save(DC_observations, DC_output_directory, DC_save_file)

# Start preprocessing DC

In [293]:
def reduceDimensionList(my_list):
    """Flatten ``my_list`` by exactly one nesting level.

    Every element of ``my_list`` is iterated and its items are appended, in
    order, to a new list. Strings are iterables too, so a list of strings
    becomes a list of characters.
    """
    flattened = []
    for sublist in my_list:
        flattened.extend(sublist)
    return flattened

def getColumn(start_index, end_index, sentences):
    """Return the slice of ``sentences`` belonging to one section.

    ``start_index`` is the position of the section heading: it is skipped
    (slice starts one past it) unless it is 0, in which case the slice starts
    at 0. ``end_index`` is exclusive; the value 0 means "up to the end".
    """
    first = start_index if start_index == 0 else start_index + 1

    if end_index == 0:
        return sentences[first:]
    return sentences[first:end_index]

In [283]:
def preprocessingDC(observations):
    """Split every résumé text into sentences and into its three sections.

    For each row the ``text`` column is cut on line breaks and on the
    header/footer patterns below, then sentence-tokenised. The positions of
    the skills, experience and training headings are looked up (exact match
    on a whole sentence; the last occurrence wins) and the sentences between
    consecutive headings are stored per section.

    Args:
        observations: DataFrame with a ``text`` column. It is modified in
            place: columns ``sentences``, ``sentences2``, ``skills``, ``exp``
            and ``formation`` are (re)created.

    Returns:
        The same DataFrame.

    Raises:
        Exception: when one of the three headings is not found in a document.
    """
    observations['sentences'] = ""
    observations['sentences2'] = ""
    observations['skills'] = ""
    observations['exp'] = ""
    observations['formation'] = ""

    # Raw strings keep regex escapes such as \d out of Python's own
    # string-escape handling (no invalid-escape warnings).
    del_1 = r"JEMS - \d.*.com"
    del_2 = r"Votre contact JEMS .* \n"
    del_3 = r"\n"
    delimiters = del_1 + '|' + del_2 + '|' + del_3

    # Accepted spellings of the three section headings
    list_index_synthese = ['SYNTHESE DE COMPETENCES', 'PRINCIPALES COMPETENCES TECHNIQUES' ]
    list_index_experience = ['EXPERIENCES PROFESSIONNELLES', 'EXPERIENCE PROFESSIONNELLE']
    list_index_formation = ['FORMATIONS', 'FORMATION', "DIPLOMES & FORMATIONS", "FORMATION / CERTIFICATIONS", "FORMATION/ CERTIFICATIONS", "FORMATION/ CERTIFICATIONSNELLES"]

    for index, row in tqdm(observations.iterrows(), total=len(observations)):

        # Tokenization
        sentences = []
        paragraphs = [p for p in re.split(delimiters, row["text"]) if p]
        for paragraph in paragraphs:
            sentences += sent_tokenize(paragraph)
        # .at stores the list as ONE cell value; .loc may try to align it
        observations.at[index, 'sentences'] = sentences

        # Locate the headings (NaN = not found)
        index_synthese = np.nan
        index_experience = np.nan
        index_formation = np.nan
        for i, e in enumerate(sentences):
            if e in list_index_synthese:
                index_synthese = i
            elif e in list_index_experience:
                index_experience = i
            elif e in list_index_formation:
                index_formation = i

        if np.isnan(index_synthese) or np.isnan(index_experience) or np.isnan(index_formation):
            raise Exception ("Une des parties principale du document n'a pas été reconnue")

        # Get section by section, in document order
        dictionnaryIndex = {
            "skills" : index_synthese,
            "exp" : index_experience,
            "formation" : index_formation}
        listofTuples = sorted(dictionnaryIndex.items(), key=lambda x: x[1])
        for i in range(len(listofTuples)):
            k = listofTuples[i][0]
            v = int(listofTuples[i][1])

            # 0 tells getColumn "until the end" (used for the last section)
            v2 = 0
            if i < len(listofTuples) - 1:
                v2 = int(listofTuples[i+1][1])

            observations.at[index, k] = getColumn(v, v2, sentences)

        observations.at[index, "sentences2"] = getColumn(0, 0, sentences)

    return observations

In [285]:
def getPreciseDate(text) :
    """Find the start/end dates written in ``text`` and parse them.

    The text is lower-cased and cut into alphanumeric tokens, then four
    layouts are tried in order; the first one that yields something wins:

    1. two consecutive year tokens            ("2018-2019")
    2. month, month, year                     ("01 - 08/2018")
    3. month followed by a year               ("Jan 2018 – Fév 2019")
    4. a year as the very first token         ("2018 – ...")

    Tokens are tested with ``re.match`` (prefix match). A single hit is
    completed with "Aujourd'hui" as the end date.

    Args:
        text: sentence or paragraph to scan.

    Returns:
        list: the ``dateparser.parse`` result for every date string built
        (empty when no layout matched).
    """
    text = text.lower()
    # 'û' is kept next to 'é' so that "août" is not cut into "ao" / "t"
    text = re.split(r"[^A-Za-z0-9éû]+", text)

    year_pattern1 = r'((19|20)\d{2})'

    month_pattern1 = "(0[1-9]|10|11|12)"
    month_pattern2 = "(jan|fev|fév|mar|avr|mai|juin|juil|aout|août|sep|oct|nov|dec|déc)"
    month_pattern3 = "(janvier|février|fevrier|mars|avril|mai|juin|juillet|aout|août|septembre|octobre|novembre|decembre|décembre)"
    month_pattern = "(" + month_pattern1 + "|" + month_pattern2 + "|" + month_pattern3 + ")"

    results = []

    # YYYY -> YYYY
    for i in range(1, len(text)):
        if re.match(year_pattern1, text[i]) and re.match(year_pattern1, text[i - 1]):
            results.append("01/" + "01/" + text[i - 1])
            results.append("01/" + "01/" + text[i])

    # 01 - 08/2018 or Jan - Sep/2018
    if len(results) == 0:
        for i in range(2, len(text)):
            if (re.match(year_pattern1, text[i])
                    and re.match(month_pattern, text[i - 1])
                    and re.match(month_pattern, text[i - 2])):
                results.append("01/" + text[i - 2] + "/" + text[i])
                results.append("01/" + text[i - 1] + "/" + text[i])

    # 01/2018 -> 02/2018 or Jan/2018 -> Fév/2018
    if len(results) == 0:
        for i in range(1, len(text)):
            if re.match(year_pattern1, text[i]) and re.match(month_pattern, text[i - 1]):
                results.append("01/" + text[i - 1] + "/" + text[i])

    # 2018 (only as the very first token)
    if len(results) == 0:
        if len(text) > 0 and re.match(year_pattern1, text[0]):
            results.append("01/" + "01/" + text[0])

    # A lone start date means the period is still running
    if len(results) == 1 :
        results.append("Aujourd'hui")

    for i in range(len(results)) :
        results[i] = dateparser.parse(results[i])

    return results


def getKeyDate(sentences):
    """Return, in order, the sentences in which ``getPreciseDate`` finds a date.

    Raises:
        Exception: when no sentence contains a date.
    """
    dated_sentences = [candidate for candidate in sentences
                       if len(getPreciseDate(candidate)) > 0]

    if not dated_sentences:
        raise Exception("No professionnal experience find")

    return dated_sentences

In [292]:
def getAllInformations(keydate, df_, experiences=None):
    """Fill ``df_`` with one mission per key-date sentence.

    Args:
        keydate: ordered list of the sentences that open a mission.
        df_: working DataFrame with the columns ``Mission``,
            ``MissionString``, ``DD``, ``DF``, ``iDD`` and ``iDF``; rows are
            written (and created if missing) through their label ``i``.
        experiences: list of sentences of the experience section. When
            omitted, the historical behaviour is kept and the list is read
            from the globals: ``DC_observations.loc[DC, "exp"]``.

    Returns:
        ``df_`` with, per mission: its sentences (``Mission``), the joined
        text (``MissionString``), start/end positions (``iDD``/``iDF``),
        parsed start/end dates (``DD``/``DF``) and their difference
        (``result``).

    NOTE(review): the dates are parsed from the whole mission text, not only
    from the key-date sentence, so a pair of years inside the description can
    take precedence — confirm this is intended.
    """
    if experiences is None:
        experiences = DC_observations.loc[DC, "exp"]

    # Locate every key date, searching forward from the previous hit so that
    # a sentence repeated in the section is not always mapped to its first
    # occurrence.
    positions = []
    search_from = 0
    for sentence in keydate:
        position = experiences.index(sentence, search_from)
        positions.append(position)
        search_from = position + 1

    for i in range(len(keydate)) :

        dateDebut = positions[i]
        if i != len(keydate) -1 :
            dateFin = positions[i + 1]
        else :
            # The last mission runs to the end of the section
            dateFin = len(experiences)

        df_.loc[i, "DD"] = keydate[i]
        df_.loc[i, "iDD"] = dateDebut

        if i != len(keydate) - 1 :
            df_.loc[i, "DF"] = keydate[i + 1]
        else :
            df_.loc[i, "DF"] = "Aujourd'hui"
        df_.loc[i, "iDF"] = dateFin

        # The columns must be of object dtype to hold a list / a string per cell
        df_['Mission'] = df_['Mission'].astype(object)
        df_['MissionString'] = df_['MissionString'].astype(object)

        df_.at[i, "Mission"] = experiences[dateDebut:dateFin]
        df_.at[i, "MissionString"] = " ".join(df_.at[i, "Mission"])

    for index, row in df_.iterrows():

        text = str(df_.loc[index, "MissionString"])

        # Get start date & end date
        mydates = getPreciseDate(text)

        # Replace the raw sentences stored above by the parsed dates
        df_.loc[index, "DD"] = mydates[0]
        df_.loc[index, "DF"] = mydates[1]

    df_['DD'] = pd.to_datetime(df_['DD'])
    df_['DF'] = pd.to_datetime(df_['DF'])
    df_['result'] = df_["DF"] - df_['DD']

    return df_

In [287]:
def term_count(string_to_search, term):
    """Count the case-insensitive whole-term occurrences of ``term``.

    The term is escaped and must not be glued to a word character on either
    side. Look-arounds are used rather than ``\\b`` because ``\\b`` needs a
    word character next to it, so terms that start or end with punctuation
    ("C++", "C#") could never match in isolation.

    Args:
        string_to_search: text to scan.
        term: literal term to look for.

    Returns:
        int: number of matches; 0 when the search raises (the error is logged).
    """
    try:
        pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
        regular_expression = re.compile(pattern, re.IGNORECASE)
        return len(regular_expression.findall(string_to_search))
    except Exception:
        logging.exception('Error occurred during regex search')
        return 0
    
def extract_skills(resume_text, extractor, items_of_interest):
    """Return the configured skills that are mentioned in ``resume_text``.

    Args:
        resume_text: text to scan.
        extractor: name of the extractor (not used by the matching itself).
        items_of_interest: configured skills; each item is a string or a
            non-empty list ``[name, alias, ...]``. Other items are reported
            with a warning and ignored.

    Returns:
        list: names of the skills for which at least one alias occurs
        (case-insensitive, see ``term_count``).
    """
    potential_skills_dict = dict()
    matched_skills = []

    # Translate the configuration into {skill name: [aliases]}
    for skill_input in items_of_interest:

        # Format list inputs
        if isinstance(skill_input, list) and len(skill_input) >= 1:
            potential_skills_dict[skill_input[0]] = skill_input

        # Format string inputs
        elif isinstance(skill_input, str):
            potential_skills_dict[skill_input] = [skill_input]
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(
                'Unknown skill listing type: {}. Please format as either a single string or a list of strings'
                .format(skill_input))

    if not potential_skills_dict:
        return matched_skills

    # Lower-case the text once instead of once per alias
    lowered_text = resume_text.lower()

    for (skill_name, skill_alias_list) in potential_skills_dict.items():

        # Total number of matches over all aliases of the skill
        skill_matches = sum(term_count(lowered_text, skill_alias.lower())
                            for skill_alias in skill_alias_list)

        # If at least one alias is found, keep the skill name
        if skill_matches > 0:
            matched_skills.append(skill_name)

    return matched_skills

def extract_fields(df):
    """Add one column per configured extractor to ``df``.

    For every entry of ``confs['extractors']`` the ``MissionString`` column is
    scanned with ``extract_skills`` and the list of matched skills is stored
    in a column named after the extractor.

    Args:
        df: DataFrame with a ``MissionString`` column; modified in place.

    Returns:
        The same DataFrame.
    """
    for extractor, items_of_interest in confs['extractors'].items():
        # Work on the argument, not on a global that happens to be the same frame
        df[extractor] = df['MissionString'].apply(lambda x: extract_skills(x, extractor, items_of_interest))
    return df

In [288]:
def extract_skills_with_timestamp(df_):
    """Turn the per-mission skills into a seniority label per skill.

    Every column from ``"Experience"`` to the last one is read as a list of
    skill names per row; the row's ``result`` duration is added to each skill
    listed. The total number of days is then mapped to a label:
    under 2 years "Junior", under 5 "Expérimenté", under 10 "Senior",
    otherwise "Expert" ("erreur ?" when the day count is not comparable).

    Returns:
        dict: ``{skill name: label}``.
    """
    durations = {}
    skill_columns = df_.loc[:, "Experience":]

    # Accumulate the time spent on each skill
    for row_label in df_.index:
        for column in skill_columns:
            found = df_.at[row_label, column]
            if len(found) == 0:
                continue

            for skill in found:
                if skill in durations:
                    durations[skill] += df_.at[row_label, "result"]
                else:
                    durations[skill] = df_.at[row_label, "result"]

    def seniority(duration):
        # Thresholds are expressed in days
        days = duration.days
        if days < (365 * 2):
            return "Junior"
        if days < (365 * 5):
            return "Expérimenté"
        if days < (365 * 10):
            return "Senior"
        if days >= (365 * 10):
            return "Expert"
        return "erreur ?"

    return {skill: seniority(total) for skill, total in durations.items()}

In [289]:
# Split every résumé into sentences and sections (creates the "exp" column used below)
DC_observations = preprocessingDC(DC_observations)

# One dict {skill: seniority label} per résumé will be stored here
DC_observations["Skills"] = ""

# NOTE: the names `DC` and `df_` matter: getAllInformations reads the global `DC`
# and extract_fields reads the global `df_`.
for DC in range(len(DC_observations)) :
#for DC in range(17,18) :

    # Get all key date from the experiences
    keydate = getKeyDate(DC_observations.loc[DC, "exp"])

    # Create a tmp dataframe. It starts with one row fewer than there are key
    # dates; getAllInformations creates the last row through its .loc assignments.
    df_ = pd.DataFrame(index=np.arange(len(keydate) -1), columns=["Mission", "MissionString",  "DD", "DF", "iDD", "iDF", "Skills"])
    df_ = df_.fillna(0) # with 0s rather than NaNs
    
    # Get informations about each missions
    df_ = getAllInformations(keydate, df_)

    # Extract skills for each missions (adds one column per configured extractor)
    df_ = extract_fields(df_)
    
    # Get all skills with xp 
    Skills = extract_skills_with_timestamp(df_)
    DC_observations.at[DC, "Skills"]  = Skills
    

In [290]:
DC_observations

Unnamed: 0,file_path,extension,text,sentences,sentences2,skills,exp,formation,Skills
0,data/input/DC\Dossier de Compétences JEMS Gr...,.pdf,1 \n\n \n\n \n\n \n\nSenior PMO \n\nS...,"[1, Senior PMO, SYNTHESE DE COMPETENCES, - Co...","[1, Senior PMO, SYNTHESE DE COMPETENCES, - Co...","[- Compétences fonctionnelles :, - Conduite ...",[06/2018 – 12/18 – Consultante – JEMS Group – ...,[❖ 2018 – Design thinking project management ...,"{'Chef de projet': 'Expérimenté', 'Swift': 'Ex..."
1,data/input/DC\Dossier de compétences de JEMS ...,.pdf,RHA \n\nConsultant BIG DATA \n\nSYNTHESE DE CO...,"[RHA, Consultant BIG DATA, SYNTHESE DE COMPETE...","[RHA, Consultant BIG DATA, SYNTHESE DE COMPETE...","[ Savoir-faire :, - Connaissance de l’écosys...","[Depuis Avril 2018 JEMS Datafactory, ---...",[Diplôme d’ingénieur en génie mathématique opt...,"{'Windows': 'Junior', 'Tableau': 'Junior', 'Py..."
2,data/input/DC\Dossier de Compétences JEMS - ...,.pdf,1 \n\n \n\n \n\nBSE \n\nConsultant Dat...,"[1, BSE, Consultant Data Sénior, Développement...","[1, BSE, Consultant Data Sénior, Développement...","[Scikit-learn, pandas , pyTorch/ fastai, Keras...",[ ...,"[2006, 1987, Master2 Mathématiques Fondamental...","{'DataEngineer': 'Expérimenté', 'Unix': 'Senio..."
3,data/input/DC\Dossier de compétences JEMS - ...,.pdf,\n\n \n \n\n \n\n \n\n \n\n \n\n \n \n\n \n\n...,"[Ingénieur d’études confirmé ASP.Net / C#, 7 a...","[Ingénieur d’études confirmé ASP.Net / C#, 7 a...","[Langages :, Outils / Framework :, C#, J2EE, C...",[Mai 2014 à Décembre 2018 – Préfecture de RABA...,[2014/2016 : MASTER Scientifique (Msc) Optio...,"{'Windows': 'Senior', 'SQL': 'Senior', 'C': 'S..."
4,data/input/DC\Dossier de compétences JEMS - ...,.pdf,1 \n\n \n\n \n\n \n\n. \n\nConsultant ...,"[1, ., Consultant Technico-Fonctionnel, SYNTHE...","[1, ., Consultant Technico-Fonctionnel, SYNTHE...","[❖ Finance, Finance de marché, Gestion d’acti...",[ AXA Investment Partners – Front office Analy...,[ IAE Gustave Eiffel - Gestion de portefeuille...,"{'Developer': 'Expérimenté', 'SQL': 'Senior', ..."
5,data/input/DC\Dossier de Compétences JEMS - A...,.pdf,1 \n\n \n\n \n\n \n\n \n \n\n \n\n \n\...,"[1, Développeur Fullstack, 6 années d’expérien...","[1, Développeur Fullstack, 6 années d’expérien...","[Métiers, Fonctionnelles, Etudes transverses f...","[Novembre 2017 – Mars 2019, Développeur Full S...",[2],"{'IOS': 'Expérimenté', 'Android': 'Expérimenté..."
6,data/input/DC\Dossier de Compétences JEMS - A...,.pdf,1 \n\n \n\n \n\nASA \n\nDATA SCIENTIST...,"[1, ASA, DATA SCIENTIST, SYNTHESE DE COMPETENC...","[1, ASA, DATA SCIENTIST, SYNTHESE DE COMPETENC...","[Domaines de compétences, Intelligence Artific...","[<IN-TEAM/ startup>, Domaine de compétences : ...",[ 2011-2013 : Diplôme national d’ingénieur...,"{'R': 'Expérimenté', 'Python': 'Expérimenté', ..."
7,data/input/DC\Dossier de Compétences JEMS - D...,.pdf,1 \n\n \n\n \n\nDK \n\nConsultant Big Data \n\...,"[1, DK, Consultant Big Data, SYNTHESE DE COMPE...","[1, DK, Consultant Big Data, SYNTHESE DE COMPE...","[❖ Compétences techniques :, - Programmation ...",[Février 2018 – Février 2019 : Data Scientist ...,"[CES, ❖, Janvier 2019:, Formation SSI / Cybers...","{'DataScientist': 'Expérimenté', 'SQL': 'Junio..."
8,data/input/DC\Dossier de Compétences JEMS - E...,.pdf,1 \n\n \n\n \n \n\n \n\n \n\n \n \n\n ...,"[1, E, Consultant Big Data, Ph.D, SYNTHESE DE ...","[1, E, Consultant Big Data, Ph.D, SYNTHESE DE ...","[ Compétences techniques, • Langages : Pytho...",[2019 – Consultant Big Data (Consultant Jems) ...,"[OBJECTIFS :, ➢ Définition du périmètre du Big...","{'Apache Spark': 'Junior', 'Scala': 'Junior', ..."
9,data/input/DC\Dossier de Compétences JEMS - E...,.pdf,1 \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n...,"[1, EZA, Consultant DevOps, SYNTHESE DE COMPET...","[1, EZA, Consultant DevOps, SYNTHESE DE COMPET...","[❖ Compétences techniques :, Système OS : ...",[Octobre 2018 – Janvier 2019 : Consultant DEV...,"[❖ 2018 – Formation DevOps, Institut-F2I, ❖ ...","{'Jenkins': 'Junior', 'Unix': 'Senior', 'C': '..."


# Start preprocessing AO

In [295]:
def preprocessingAO(observations):
    """Split every tender ("AO") text into sentences.

    The ``text`` column is cut on line breaks, empty pieces are dropped and
    each remaining paragraph is sentence-tokenised.

    Args:
        observations: DataFrame with a ``text`` column. It is modified in
            place: columns ``sentences`` and ``sentences2`` are (re)created.

    Returns:
        The same DataFrame.
    """
    observations['sentences'] = ""
    observations['sentences2'] = ""

    delimiters = r"\n"

    for index, row in tqdm(observations.iterrows(), total=len(observations)):

        # Tokenization
        sentences = []
        paragraphs = [p for p in re.split(delimiters, row["text"]) if p]
        for paragraph in paragraphs:
            sentences += sent_tokenize(paragraph)

        # .at stores the list as ONE cell value; .loc may try to align it
        observations.at[index, "sentences"] = sentences
        observations.at[index, "sentences2"] = getColumn(0, 0, sentences)

    return observations

In [296]:
# Adds the "sentences" / "sentences2" columns to the frame (modified in place and returned)
AO_observations = preprocessingAO(AO_observations)

18it [00:00, 189.57it/s]


In [325]:
def getColumn2(sentences):
    """Tokenise sentences into words and apply the configured normalisation.

    Args:
        sentences: pandas Series whose cells are lists of sentences.

    Returns:
        list: one list of words per sentence; sentences left without any
        word are dropped.

    The module-level switches ``isLower``, ``isLemmatize``,
    ``isDeleteFrenchStopwords`` and ``isDeleteEnglishStopwords`` select the
    steps, which are applied in that order to every word.
    """
    sentences = reduceDimensionList(sentences.tolist())

    # Load each stopword list once (and only when needed) as a set, instead of
    # reloading and scanning the list for every single word.
    french_stopwords = set(stopwords.words("french")) if isDeleteFrenchStopwords else None
    english_stopwords = set(stopwords.words("english")) if isDeleteEnglishStopwords else None

    my_text = []
    for sentence in sentences:
        my_sent = []
        for word in word_tokenize(sentence):

            # lower the word
            if isLower:
                word = word.lower()

            # lemmatize the word
            if isLemmatize:
                word = nlp(word)[0].lemma_

            # Delete french stopwords
            if french_stopwords is not None and word in french_stopwords:
                continue

            # Delete english stopwords
            if english_stopwords is not None and word in english_stopwords:
                continue

            # append (or not) the word to the "sentence"
            if len(word) > 0:
                my_sent.append(word)

        # append (or not) the sentence to the "text"
        if len(my_sent) > 0:
            my_text.append(my_sent)

    return my_text

In [326]:
# Word-level version of the AO sentences: a list of token lists
result = getColumn2(AO_observations['sentences2'])

In [327]:
print(result)

[['contexte', 'détaillé', 'de', 'la', 'mission'], ['domaine'], ['environnement', 'technique', 'général'], ['dealing', 'service'], ['lz', 'minerva', '(', 'oms', ')', '/', 'sql/', 'oracle/', 'unix', 'shell/', 'fix/', 'cft/', 'mq', 'series/', 'vba'], ['nom', 'du', 'projet'], ['global', 'dealing', 'service', 'desk'], ['i/', 'activites', 'de', 'dealing', 'services'], ['le', 'dealing', 'services', 'est', 'une', 'offre', 'd', '’', 'outsourcing', 'de', 'tables', 'de', 'négociations', 'à', 'destination', 'de', 'la', 'clientèle'], ['«', 'buy-side', '»', ':', 'gestionnaires', 'd', '’', 'actifs', ',', '«', 'asset', 'owners', '»', ',', 'assureurs', 'et', 'banques', 'privées', '.'], ['basées', 'à', 'paris', ','], ['londres', ',', 'hong-kong', 'et', 'new-york', ',', 'les', 'équipes', 'des', 'tables', 'de', 'négociations', 'interviennent', 'sur', 'les'], ['marchés', 'régulés', ',', 'les', 'plateformes', 'alternatives', '(', 'mtfs', 'ou', '«', 'multi-trading', 'facilities', '»', ')', ',', '«', 'dark', 




In [297]:
AO_observations

Unnamed: 0,file_path,extension,text,sentences,sentences2
0,data/input/AO\AO JEMS group-2.pdf,.pdf,\n \nContexte détaillé de la Mission \n \nDom...,"[Contexte détaillé de la Mission, Domaine, Env...","[Contexte détaillé de la Mission, Domaine, Env..."
1,data/input/AO\Ao--05-07-2016-812632-001.pdf,.pdf,Document de consultation Appel d’offres\n\n \n...,"[Document de consultation Appel d’offres, Info...","[Document de consultation Appel d’offres, Info..."
2,data/input/AO\Ao--10-11-2015-496231-001.pdf,.pdf,Document de consultation Appel d’offres\n\nInf...,"[Document de consultation Appel d’offres, Info...","[Document de consultation Appel d’offres, Info..."
3,data/input/AO\Ao--20-06-2016-201696-001.pdf,.pdf,Document de consultation Appel d’offres\n\n \n...,"[Document de consultation Appel d’offres, Info...","[Document de consultation Appel d’offres, Info..."
4,data/input/AO\Ao--20-10-2011-361638-001.pdf,.pdf,Document de consultation Appel d’offres\n\nInf...,"[Document de consultation Appel d’offres, Info...","[Document de consultation Appel d’offres, Info..."
5,data/input/AO\Ao--25-02-2014-536631-001.pdf,.pdf,Document de consultation Appel d’offres\n\nInf...,"[Document de consultation Appel d’offres, Info...","[Document de consultation Appel d’offres, Info..."
6,data/input/AO\Ao--26-07-2016-352422-001.pdf,.pdf,Document de consultation Appel d’offres\n\n \n...,"[Document de consultation Appel d’offres, Info...","[Document de consultation Appel d’offres, Info..."
7,data/input/AO\AO-Br--25-03-2010-182482-001.pdf,.pdf,Document de consultation Appel d’offre\n\nInfo...,"[Document de consultation Appel d’offre, Infor...","[Document de consultation Appel d’offre, Infor..."
8,data/input/AO\BNPPAM_AO_2018-45992 - PMO for s...,.pdf,REQUEST FOR PROPOSAL/ APPEL D’OFFRE 2018-45992...,[REQUEST FOR PROPOSAL/ APPEL D’OFFRE 2018-4599...,[REQUEST FOR PROPOSAL/ APPEL D’OFFRE 2018-4599...
9,data/input/AO\BNPPAM_AO_2018-46103 - Senior AP...,.pdf,REQUEST FOR PROPOSAL/ APPEL D’OFFRE 2018-46103...,[REQUEST FOR PROPOSAL/ APPEL D’OFFRE 2018-4610...,[REQUEST FOR PROPOSAL/ APPEL D’OFFRE 2018-4610...


# Visualize text

In [11]:
def Display(observations, column, threshold) :
    """Print and plot simple statistics about the items of a text column.

    The column is flattened twice, then three things are shown: a histogram
    of item lengths, the items longer than 15 characters, and a bar chart of
    the items occurring at least ``threshold`` times.

    Args:
        observations: DataFrame holding the column.
        column: name of the column to inspect.
        threshold: minimal number of occurrences for the frequency chart.

    NOTE(review): the double flattening assumes each cell is a list of lists
    of words; with cells that are lists of sentence strings the items become
    single characters — confirm the expected cell layout.
    """
    allTexts = reduceDimensionList(observations[column].tolist())
    allTexts = reduceDimensionList(allTexts)

    # Display len of words
    lengths = [len(w) for w in allTexts]
    print("Display len of words : ")
    print(lengths)
    plt.hist(lengths)
    plt.show()

    # Write word with len > 15
    long_words = [w for w in allTexts if len(w)>15]
    if (len(long_words) > 0):
        print("Display words where len>15 : ")
        print(long_words)

    # Write & Display word frequency >= threshold
    counts = Counter(allTexts)
    frequent = {x : counts[x] for x in counts if counts[x] >= threshold}

    print("Display frequency of words : ")
    print(frequent)

    # Nothing reached the threshold: zip(*...) would have nothing to unpack
    if not frequent:
        return

    labels, values = zip(*frequent.items())
    indexes = np.arange(len(labels))
    width = 1
    plt.bar(indexes, values, width)
    plt.xticks(indexes + width * 0.5, labels)
    plt.show()

In [None]:
#Display(AO_observations, 'sentences2', 5)
#Display(DC_observations, 'skills', 4)
#Display(DC_observations, 'exp',20)
#Display(DC_observations, 'formation', 6)

# TSNE

In [None]:
#listToReduce = AO_observations['sentences2'].tolist() + DC_observations['sentences2'].tolist()

#corpus = reduceDimensionList(listToReduce)
#model = word2vec.Word2Vec(corpus, min_count=15, seed = 0)

In [12]:
def tsne_plot(model):
    """Project the vocabulary of a Word2Vec model to 2-D, cluster it and plot it.

    Every word vector is reduced with t-SNE (perplexity taken from the
    module-level ``perplexity``), the 2-D points are grouped into 5 clusters
    with NLTK's k-means (cosine distance, 25 repeats) and drawn as an
    annotated scatter plot. The figure is saved as a PNG whose name is built
    from the module-level settings, then shown.

    Args:
        model: trained gensim Word2Vec model.

    NOTE(review): the file name always starts with "Mincount_10", whatever
    ``min_count`` the model was trained with — confirm.
    """
    labels = []
    tokens = []

    # Get words. Vectors are read through model.wv: indexing the model
    # itself is deprecated.
    for word in model.wv.vocab:
        tokens.append(model.wv[word])
        labels.append(word)

    # Get X & Y values
    tsne_model = TSNE(perplexity=perplexity, init='pca', n_iter=5000, random_state=0)
    new_values = tsne_model.fit_transform(tokens)
    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]

    # Cluster the projected points
    matrix = np.column_stack((x, y))
    clusters_number = 5
    kclusterer = KMeansClusterer(clusters_number,  distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(matrix, assign_clusters=True)

    colors = cm.rainbow(np.linspace(0, 1, clusters_number))
    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        # color= takes one RGBA value unambiguously; c= may read it as 4 data values
        plt.scatter(x[i], y[i], color=colors[assigned_clusters[i]])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    title = "Mincount_10" \
    + " Files_" + str(len(DC_observations) + len(AO_observations)) \
    + " Perplexity" + str(perplexity) \
    + " FrenchStopword_" + str(isDeleteFrenchStopwords) \
    + " EnglishStopword_" + str(isDeleteEnglishStopwords) \
    + " Lower_" + str(isLower) \
    + " Lemmatization_" + str(isLemmatize) \
    + ".png"

    plt.savefig(title, bbox_inches='tight')

    plt.show()

In [None]:
#tsne_plot(model)


# All cases

In [None]:
# Grid run: one Word2Vec model and one t-SNE figure per combination of settings.
# The loop variables deliberately reuse the names of the module-level settings,
# because tsne_plot reads `perplexity` and the four flags as globals.
# NOTE(review): preprocessingAO / preprocessingDC do not read the four flags in
# the code visible here, so the corpus looks identical for every combination —
# confirm whether getColumn2 was meant to be applied.
AO_extract = extract(AO_directory)
DC_extract = extract(DC_directory)

for perplexity in perplexityPossibilities :
    for isDeleteFrenchStopwords in possibilities :
        for isDeleteEnglishStopwords in possibilities :
            for isLower in possibilities :
                for isLemmatize in possibilities :

                    print("preprocessing")

                    # Preprocessing datas (both calls modify and return the extracted frames)
                    AO_observations = preprocessingAO(AO_extract)
                    DC_observations = preprocessingDC(DC_extract)

                    # Same name pattern as the file saved by tsne_plot.
                    # NOTE(review): it says "Mincount_10" while the model below uses min_count=20.
                    title = "Mincount_10" \
                    + " Files_" + str(len(DC_observations) + len(AO_observations)) \
                    + " Perplexity" + str(perplexity) \
                    + " FrenchStopword_" + str(isDeleteFrenchStopwords) \
                    + " EnglishStopword_" + str(isDeleteEnglishStopwords) \
                    + " Lower_" + str(isLower) \
                    + " Lemmatization_" + str(isLemmatize) \
                    + ".png"

                    print (title)



                    # Concat AO & DC
                    listToReduce = AO_observations['sentences2'].tolist() + DC_observations['sentences2'].tolist()

                    print("corpus")
                    # Plot model and save fig
                    corpus = reduceDimensionList(listToReduce)
                    model = word2vec.Word2Vec(corpus, min_count=20, seed = 0)

                    print("plot")
                    tsne_plot(model)

# Other preprocessing steps

In [None]:
def preprocessingGensim(observations):
    """Compute the TF-IDF weights of every document with gensim.

    Args:
        observations: DataFrame with a ``lemmatized`` column whose cells are
            token lists. It is modified in place: the ``tf-idf`` column
            receives, per row, the ``(token id, weight)`` pairs sorted by
            decreasing weight.

    Returns:
        The same DataFrame.
    """
    logging.info('Begin preprocessingGensim')

    observations['tf-idf'] = ""

    # Create a Corpus
    documents = observations["lemmatized"].tolist()
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]

    # Create a new TfidfModel using the corpus
    tfidf = TfidfModel(corpus)

    # `corpus` is a plain list: address it by position, which only coincides
    # with the index label when the frame has a default 0..n-1 index.
    for position, index in enumerate(observations.index):

        tfidf_weights = tfidf[corpus[position]]
        sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
        # .at stores the list as ONE cell value
        observations.at[index, 'tf-idf'] = sorted_tfidf_weights

    logging.info('End preprocessingGensim')
    return observations

In [None]:
def preprocessingSpacy(observations):
    """Run the spaCy ``en_core_web_sm`` pipeline on every document.

    Args:
        observations: DataFrame with a ``text`` column. It is modified in
            place: the ``nlp`` column receives the object returned by the
            pipeline for each row.

    Returns:
        The same DataFrame.
    """
    logging.info('Begin preprocessingSpaCy')

    observations['nlp'] = ""

    # Instantiate the pipeline once
    pipeline = spacy.load('en_core_web_sm')

    for index, row in observations.iterrows():

        # Keep the pipeline and its result under different names: rebinding
        # the pipeline to its own output made the second row fail.
        document = pipeline(observations.loc[index, 'text'])
        observations.at[index, 'nlp'] = document

    logging.info('End preprocessingSpaCy')
    return observations

In [None]:
def getColumn(start_index, end_index, sentences):
    """Return the slice of ``sentences`` belonging to one section.

    ``start_index`` is the position of the section heading: it is skipped
    (slice starts one past it) unless it is 0. ``end_index`` is exclusive;
    the value 0 means "up to the end".

    The word-level tokenising that used to follow the ``return`` could never
    run and has been removed; the same processing is available in
    ``getColumn2``.
    """
    if start_index != 0:
        start_index = start_index + 1

    if end_index == 0 :
        text = sentences[start_index:]
    else :
        text = sentences[start_index:end_index]

    return text

# Unit tests (TU)

In [815]:
# Unit test (TU) for "Get date": extract start/end dates from an experience line.

# Raw strings so that "\d" reaches the regex engine verbatim instead of relying
# on Python tolerating an unknown string escape (deprecated since Python 3.6).
year_pattern1 = r'((19|20)\d{2})'   # four-digit year, 19xx or 20xx
year_pattern2 = r'(\d{2})'          # two-digit year; not used by the loop below

month_pattern1 = "(0[1-9]|10|11|12)"
month_pattern2 = "(jan|fev|fév|mar|avr|mai|juin|juil|aout|sep|oct|nov|dec|déc)"
# No trailing space after the group: tokens produced by re.split() below never
# contain whitespace, so a pattern ending in " " could not match any of them.
month_pattern3 = "(janvier|février|fevrier|mars|avril|mai|juin|juillet|aout|septembre|octobre|novembre|decembre|décembre)"
month_pattern = "(" + month_pattern1 + "|" + month_pattern2 + "|" + month_pattern3 + ")"


# Sample lines covering the formats seen in the documents.
p1 = '06/2018 – 12/18 – Consultante – JEMS Group – Banque – Paris, France'
p2 = '06/2018-12/2018 – Mission : Chef de Projet Data Gouvernance – BNPParibas CIB'
p3 = 'Envt technique :  C# (version 4 et 5) & ASP.NET, SQL server 2008, Visual Studio 2015'
p4 = 'Windows Server 2012, 2008 et 2003'
p5 = 'Reprise des études pour intégrer les versions 2003 de Microsoft versus les versions 2000'
p6 = 'Janvier 2018 – Déc 18 – Consultante – JEMS Group – Banque – Paris, France'
p7 = 'Janvier 2018 – Féc 18 – Consultante – JEMS Group – Banque – Paris, France'
p8 = '2018 – Consultante – JEMS Group – Banque – Paris, France'
p9 = '2018-2019 – Consultante – JEMS Group – Banque – Paris, France'
p10 = "Mai 2014 à Décembre 2018 – Préfecture de RABAT, Ministère de l’Intérieur - MAROC"

# Lower-case the sample under test, then cut it into tokens.
# NOTE(review): the separator class keeps only ASCII letters, digits and "é";
# every other accented letter acts as a separator (e.g. "ministère" is cut
# into "minist" / "re"). Harmless for dates, but confirm it is intended.
text = p10.lower()
text = re.split("[^A-Za-z0-9é]+", text)
print(text)

results = []

for i, token in enumerate(text):

    # Every rule below is triggered by a four-digit year token.
    if not re.match(year_pattern1, token):
        continue

    # 2018 : the line starts with a year
    if i == 0:
        results.append("01/01/" + token)

    # 01/2018 -> 02/2018 or Jan/2018 -> Fév/2018 : month token followed by a year
    elif re.match(month_pattern, text[i - 1]):
        results.append("01/" + text[i - 1] + "/" + token)

    # YYYY -> YYYY : two consecutive year tokens
    # NOTE(review): when the line starts with such a range (p9), the first year
    # has already been added by the i == 0 rule and is appended a second time
    # here -- confirm whether that duplicate is wanted.
    elif re.match(year_pattern1, text[i - 1]):
        results.append("01/01/" + text[i - 1])
        results.append("01/01/" + token)


# A single date means there is no end date: use "Aujourd'hui" (today) as the end.
if len(results) == 1:
    results.append("Aujourd'hui")
            

['mai', '2014', 'décembre', '2018', 'préfecture', 'de', 'rabat', 'minist', 're', 'de', 'l', 'intérieur', 'maroc']


In [816]:
# First date string collected by the extraction loop.
results[0]

'01/mai/2014'

In [817]:
# Check that dateparser turns the "01/<month>/<year>" string into a datetime.
dateparser.parse(results[0])

datetime.datetime(2014, 5, 1, 0, 0)

In [818]:
# Second date string collected by the extraction loop.
results[1]

'01/décembre/2018'

In [819]:
# Parse the second date string with dateparser.
dateparser.parse(results[1])

datetime.datetime(2018, 12, 1, 0, 0)

In [749]:
# NOTE(review): the output recorded under this cell (6) comes from an earlier
# execution (In [749]); the neighbouring cells show results[1] parsed as
# December, so the month would be 12. Re-run the cell to refresh the output.
dateparser.parse(results[1]).month

6

In [75]:
# Display DC_observations (presumably a DataFrame with one row per input document);
# consider DC_observations.head() to keep the rendered output short.
DC_observations

Unnamed: 0,file_path,extension,text,sentences,sentences2,skills,exp,formation,Chef de projet,Unix,...,SonarQube,Maven,Nexus,Postgress,PHP,Perl,Lean,PowerBI,MapReduce,Kanban
0,data/input/DC\Dossier de Compétences JEMS Gr...,.pdf,1 \n\n \n\n \n\n \n\nSenior PMO \n\nS...,"[1, Senior PMO, SYNTHESE DE COMPETENCES, - Co...","[1, Senior PMO, SYNTHESE DE COMPETENCES, - Co...","[- Compétences fonctionnelles :, - Conduite ...",[06/2018 – 12/18 – Consultante – JEMS Group – ...,[❖ 2018 – Design thinking project management ...,Senior,Senior,...,,,,,,,,,,
1,data/input/DC\Dossier de compétences de JEMS ...,.pdf,RHA \n\nConsultant BIG DATA \n\nSYNTHESE DE CO...,"[RHA, Consultant BIG DATA, SYNTHESE DE COMPETE...","[RHA, Consultant BIG DATA, SYNTHESE DE COMPETE...","[ Savoir-faire :, - Connaissance de l’écosys...","[Depuis Avril 2018 JEMS Datafactory, ---...",[Diplôme d’ingénieur en génie mathématique opt...,,,...,,,,,,,,,,
2,data/input/DC\Dossier de Compétences JEMS - ...,.pdf,1 \n\n \n\n \n\nBSE \n\nConsultant Dat...,"[1, BSE, Consultant Data Sénior, Développement...","[1, BSE, Consultant Data Sénior, Développement...","[Scikit-learn, pandas , pyTorch/ fastai, Keras...",[ ...,"[2006, 1987, Master2 Mathématiques Fondamental...",Senior,Senior,...,,,,,,,,,,
3,data/input/DC\Dossier de compétences JEMS - ...,.pdf,\n\n \n \n\n \n\n \n\n \n\n \n\n \n \n\n \n\n...,"[Ingénieur d’études confirmé ASP.Net / C#, 7 a...","[Ingénieur d’études confirmé ASP.Net / C#, 7 a...","[SonarQube, Serveur d’application : IIS, Apa...",[Mai 2014 à Décembre 2018 – Préfecture de RABA...,[2014/2016 : MASTER Scientifique (Msc) Optio...,,,...,,,,,,,,,,
4,data/input/DC\Dossier de compétences JEMS - ...,.pdf,1 \n\n \n\n \n\n \n\n. \n\nConsultant ...,"[1, ., Consultant Technico-Fonctionnel, SYNTHE...","[1, ., Consultant Technico-Fonctionnel, SYNTHE...","[❖ Finance, Finance de marché, Gestion d’acti...",[ AXA Investment Partners – Front office Analy...,[],,,...,,,,,,,,,,
5,data/input/DC\Dossier de Compétences JEMS - A...,.pdf,1 \n\n \n\n \n\n \n\n \n \n\n \n\n \n\...,"[1, Développeur Fullstack, 6 années d’expérien...","[1, Développeur Fullstack, 6 années d’expérien...","[Métiers, Fonctionnelles, Etudes transverses f...","[Novembre 2017 – Mars 2019, Développeur Full S...",[2],,,...,,,,,,,,,,
6,data/input/DC\Dossier de Compétences JEMS - A...,.pdf,1 \n\n \n\n \n\nASA \n\nDATA SCIENTIST...,"[1, ASA, DATA SCIENTIST, SYNTHESE DE COMPETENC...","[1, ASA, DATA SCIENTIST, SYNTHESE DE COMPETENC...","[Domaines de compétences, Intelligence Artific...","[<IN-TEAM/ startup>, Domaine de compétences : ...",[ 2011-2013 : Diplôme national d’ingénieur...,,,...,,,,,,,,,,
7,data/input/DC\Dossier de Compétences JEMS - D...,.pdf,1 \n\n \n\n \n\nDK \n\nConsultant Big Data \n\...,"[1, DK, Consultant Big Data, SYNTHESE DE COMPE...","[1, DK, Consultant Big Data, SYNTHESE DE COMPE...","[❖ Compétences techniques :, - Programmation ...",[Février 2018 – Février 2019 : Data Scientist ...,"[❖ Juin 2001:, Baccalauréat grec ''Apolytirio...",,,...,,,,,,,,,,
8,data/input/DC\Dossier de Compétences JEMS - E...,.pdf,1 \n\n \n\n \n \n\n \n\n \n\n \n \n\n ...,"[1, E, Consultant Big Data, Ph.D, SYNTHESE DE ...","[1, E, Consultant Big Data, Ph.D, SYNTHESE DE ...","[ Compétences techniques, • Langages : Pytho...",[2019 – Consultant Big Data (Consultant Jems) ...,"[OBJECTIFS :, ➢ Définition du périmètre du Big...",,,...,,,,,,,,,,
9,data/input/DC\Dossier de Compétences JEMS - E...,.pdf,1 \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n...,"[1, EZA, Consultant DevOps, SYNTHESE DE COMPET...","[1, EZA, Consultant DevOps, SYNTHESE DE COMPET...","[❖ Compétences techniques :, Système OS : ...",[Octobre 2018 – Janvier 2019 : Consultant DEV...,[❖ Participation au développement de nouvelle...,,Senior,...,,,,,,,,,,


In [225]:
# List the skill names used as keys of potential_skills_dict.
potential_skills_dict.keys()

dict_keys(['Developer', 'DataEngineer', 'DataScientist', 'Chef de projet', 'Unix', 'Windows', 'Mac', 'IOS', 'Android', 'SQL', 'SQLite', 'MySQL', 'Oracle', 'NoSQL', 'Postgress', 'Apache Hadoop', 'PIG', 'HIVE', 'MapReduce', 'Apache Spark', 'Kafka', 'Tableau', 'PowerBI', 'Qlik', 'SAS', 'IntelliJ', 'SonarQube', 'Maven', 'Nexus', 'Jenkins', 'AWS', 'Microsoft Azure', 'Google Cloud Platform', 'java', 'C', 'C++', 'C#', '.NET', 'Matlab', 'R', 'Python', 'VHDL', 'PHP', 'Swift', 'TypeScript', 'GO', 'Ruby', 'Perl', 'Kotlin', 'Rust', 'Scala', 'JavaScript', 'HTML', 'Cycle en V', 'Agile', 'Kanban', 'Lean', 'Design Thinking', 'sklearn', 'tensorflow', 'keras', 'h20', 'Windows Phone', 'French', 'Dutch', 'English', 'German', 'Spanish', 'Chinese', 'github', 'bitbucket', 'gitlab', 'sourceforge', 'gitkraken'])