# Suite de Traitements

## Configuration

In [9]:
# Supported vectorisation methods: display name -> numeric identifier.
METHODES_DE_VECTORISATION = {
    'BoW': 1,
    'TF-IDF': 2,
}
# Supported classification methods: display name -> numeric identifier.
METHODES_DE_CLASSIFICATION = {
    'KNN': 1,
}

# Methods selected for this run (compared against the tables above further down).
methode_de_vectorisation = METHODES_DE_VECTORISATION['BoW']
methode_de_classification = METHODES_DE_CLASSIFICATION['KNN']

# Number of samples handed to predict() per batch during evaluation.
CHUNK = 70
# Evaluation prints a progress line roughly every PRINTEVERYNUMBER samples.
PRINTEVERYNUMBER = 1000

## Initialisation

In [10]:
import pandas as pd
import numpy as np
import pickle
import re
import matplotlib.pyplot as plt

# Méthodes de Vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Pour l'étape de classification
from sklearn.model_selection import train_test_split
from sklearn import pipeline, metrics
from sklearn.neighbors import KNeighborsClassifier

# Données
#df = pd.read_json("data.json")
#label = pd.read_csv("label.csv")
#category = pd.read_csv("categories_string.csv")

## Fonctions utiles

In [11]:
def accuracyByChunks(trained_model, X_test, Y_test, Chunk, PrintEveryNumber):
    """Compute the accuracy of a fitted model by predicting in fixed-size batches.

    Predicting batch by batch bounds the amount of data handed to
    ``trained_model.predict`` at once, and allows progress to be reported.

    Parameters
    ----------
    trained_model : object exposing ``predict(batch)``.
    X_test : sliceable sequence of samples supporting ``len()``.
    Y_test : expected labels, sliced with the same positional ranges as ``X_test``.
    Chunk : int, number of samples per batch; must be > 0.
    PrintEveryNumber : a progress line is printed each time the number of
        processed samples reaches the next multiple of this value (capped at
        ``len(X_test)``).

    Returns
    -------
    The fraction of correctly predicted samples over the whole of ``X_test``,
    ``0`` when ``X_test`` is empty, or ``-1`` (after printing a message) when
    ``Chunk`` is not strictly positive.
    """
    if Chunk <= 0:
        print("Chunk must be > 0!\nChunk given : " + str(Chunk))
        return -1

    total = len(X_test)
    if total == 0:
        return 0

    # Count exact hits and divide once at the end: this avoids the rounding
    # drift of repeatedly re-weighting a running floating-point mean.
    correct = 0
    next_step = PrintEveryNumber
    for start in range(0, total, Chunk):
        # The last batch may be shorter than Chunk.
        stop = min(start + Chunk, total)
        predictions = trained_model.predict(X_test[start:stop])
        correct += metrics.accuracy_score(Y_test[start:stop], predictions, normalize=False)

        # `stop` is also the number of samples processed so far.
        if stop >= next_step:
            print("Avancement : " + str(stop) + "/" + str(total))
            next_step = min(next_step + PrintEveryNumber, total)

    return correct / total

## Prétraitement

In [12]:
# Patterns are compiled once and shared by the bulk and single-description paths.
_NON_LETTERS_RE = re.compile('[^A-Za-z]')
_SHORT_WORD_RE = re.compile(r'\b\w{1,3}\b')


def preTraitement(description):
    """Normalise one raw description before vectorisation.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter with a space, then replace every word of 1 to 3 characters
    with a space (so only words longer than 3 letters remain).

    Parameters
    ----------
    description : str, the raw text.

    Returns
    -------
    str, the cleaned text. Removed characters and words are replaced by
    spaces, not deleted, so runs of spaces may remain.
    """
    cleaned_description = description.lower()
    cleaned_description = _NON_LETTERS_RE.sub(' ', cleaned_description)
    return _SHORT_WORD_RE.sub(' ', cleaned_description)


# The whole corpus goes through exactly the same cleaning as a single
# description, in one pass, so training data and later individual
# predictions cannot drift apart.
# NOTE(review): this file is read from "../", while the CSV files used further
# down are read from the current directory — confirm the intended layout.
descriptions = [preTraitement(texte) for texte in pd.read_json("../data.json").description]

## Vectorisation

In [13]:
# Build the vectoriser selected in the configuration cell.
if methode_de_vectorisation == METHODES_DE_VECTORISATION['BoW']:
    # uint8 keeps the count matrix compact. Original author's rationale: each CV
    # contains fewer than 250 words, so no single word count can exceed 255.
    # NOTE(review): this bound is not checked here; a count above 255 would not
    # fit in uint8.
    vectorizer = CountVectorizer(dtype=np.uint8)
elif methode_de_vectorisation == METHODES_DE_VECTORISATION['TF-IDF']:
    vectorizer = TfidfVectorizer()
else:
    # Fail with the real cause instead of printing and then hitting a NameError
    # on `vectorizer` (or silently reusing one left over from an earlier run).
    raise ValueError("Numéro de méthode de vectorisation non pris en charge : " + str(methode_de_vectorisation))

# Only the fitted state is needed here; the transformed matrix was discarded.
# NOTE(review): the vectoriser is fitted on the full corpus, i.e. before the
# train/test split done in the classification cell.
vectorizer.fit(descriptions)
dic_vocabulary = vectorizer.vocabulary_
print("Taille du vocabulaire : " + str(len(dic_vocabulary)))

Taille du vocabulaire : 199464


## Classification

In [14]:
# Hold out part of the corpus for evaluation; random_state pins the split so
# that reruns give the same partition.
descriptions_train, descriptions_test, labels_train, labels_test = train_test_split(
    descriptions, pd.read_csv("label.csv").Category, random_state=0
)

if methode_de_classification == METHODES_DE_CLASSIFICATION['KNN']:
    KNNClassifier = KNeighborsClassifier(
        n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski'
    )
    # The pipeline lets `model.predict` accept cleaned text directly.
    model = pipeline.Pipeline([("vectorizer", vectorizer), ("classifier", KNNClassifier)])
else:
    # Fail with the real cause instead of printing and then hitting a NameError
    # on `model` (or silently reusing one left over from an earlier run).
    raise ValueError("Numéro de méthode de classification non pris en charge : " + str(methode_de_classification))

# Only the classifier step is fitted: the vectoriser was already fitted in the
# vectorisation cell, and calling model.fit(...) would refit it on the training
# descriptions only.
model.named_steps["classifier"].fit(vectorizer.transform(descriptions_train), labels_train)

# Accuracy is evaluated batch by batch on both partitions.
accuracy_train = accuracyByChunks(model, descriptions_train, labels_train, CHUNK, PRINTEVERYNUMBER)
accuracy_test = accuracyByChunks(model, descriptions_test, labels_test, CHUNK, PRINTEVERYNUMBER)
print("Précision Train = " + str(accuracy_train) + "\nPrécision Test = " + str(accuracy_test))

Avancement : 1050/162897
Avancement : 2030/162897
Avancement : 3010/162897
Avancement : 4060/162897
Avancement : 5040/162897
Avancement : 6020/162897
Avancement : 7000/162897
Avancement : 8050/162897
Avancement : 9030/162897
Avancement : 10010/162897
Avancement : 11060/162897
Avancement : 12040/162897
Avancement : 13020/162897
Avancement : 14000/162897
Avancement : 15050/162897
Avancement : 16030/162897
Avancement : 17010/162897
Avancement : 18060/162897
Avancement : 19040/162897
Avancement : 20020/162897
Avancement : 21000/162897
Avancement : 22050/162897
Avancement : 23030/162897
Avancement : 24010/162897
Avancement : 25060/162897
Avancement : 26040/162897
Avancement : 27020/162897
Avancement : 28000/162897
Avancement : 29050/162897
Avancement : 30030/162897
Avancement : 31010/162897
Avancement : 32060/162897
Avancement : 33040/162897
Avancement : 34020/162897
Avancement : 35000/162897
Avancement : 36050/162897
Avancement : 37030/162897
Avancement : 38010/162897
Avancement : 39060/16

## Test avec mon CV

In [15]:
# Clean the free-text profile with the same preprocessing as the training corpus.
ma_description = preTraitement(
    "He is a last-year student at Telecom Saint-Etienne and majoring in computer science and image processing. He created a lot of software for various companies. Web development, mobile development, native android and so on."
)

# The pipeline expects a collection of documents, hence the one-element list.
categorie_correspondante = model.predict([ma_description])

# Look up the predicted label in column '0' of the categories file.
print(
    "Catégorie correspondante : "
    + pd.read_csv("categories_string.csv")['0'][categorie_correspondante[0]]
)

Catégorie correspondante : software_engineer
