# Catégorisation automatique de questions

## Initialisation

In [1]:
# Standard libraries
import itertools
import os
import re
import sys

# External libraries
import bs4
import matplotlib 
import matplotlib.pyplot as plt
import nltk
import nltk.stem.porter
import numpy as np
import pandas as pd
import pickle
import pyLDAvis
import pyLDAvis.sklearn
import scipy
import seaborn as sns
import spacy
import sklearn as sk
import sklearn.decomposition
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.feature_selection
import sklearn.model_selection
import sklearn.multiclass
import sklearn.multioutput
import sklearn.naive_bayes
import sklearn.svm
import string

  from .optimizers import Adam, SGD, linear_decay
  from collections import defaultdict, Sequence, Sized, Iterable, Callable


In [2]:
# Fichiers de données sauvegardés par le calepin d'exploration
DATA_FILE = "data/P6 all labels.dat"
DATA_FILE_SINGLE_LABEL = "data/P6 single label.dat"

# Ne pas exécuter les sections à temps de calcul élever
CALC_SLOW_SECTIONS = False

SEED = 1911

In [3]:
data = pickle.load(open(DATA_FILE, 'rb'))
df, tags, id_to_tag, features = data
df1 = pickle.load(open(DATA_FILE_SINGLE_LABEL, 'rb'))

In [4]:
np.unique(tags.toarray())

array([0, 1], dtype=int64)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42096 entries, 1 to 49457
Data columns (total 3 columns):
Id      42096 non-null int64
Tags    42096 non-null object
Text    42096 non-null object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38721 entries, 1 to 49457
Data columns (total 4 columns):
Id         38721 non-null int64
Tag        38721 non-null object
TagCode    38721 non-null int64
Text       38721 non-null object
dtypes: int64(2), object(2)
memory usage: 1.5+ MB


## Classement mono-label

### Représentation des messages en matrice TFIDF

In [7]:
# On ne conserve que les labels fréquents
#df1 = df[~del_mask].copy()
# Utilisation de `min_df` nécessaire pour réduire la quantité de mots
fv = TfidfVectorizer(analyzer='word', ngram_range=(1,2), 
                     min_df=50, stop_words='english')
text_tfidf = fv.fit_transform(df1['Text'])
print(f"Taille de la matrice Tfidf : {text_tfidf.shape}")

Taille de la matrice Tfidf : (38721, 7578)


In [8]:
print(f"{len(fv.stop_words_)} termes exclus")
#print(" ".join(fv.stop_words_))

1909117 termes exclus


In [9]:
X = text_tfidf
y = df1['TagCode']
X_tr, X_t, y_tr, y_t = sk.model_selection.train_test_split(X, y, 
                                                           test_size=0.3, 
                                                           random_state=SEED)
X_tr.shape, y_tr.shape, X_t.shape, y_t.shape

((27104, 7578), (27104,), (11617, 7578), (11617,))

### Mots et bi-grammes les plus corrélés avec chaque catégorie

In [10]:
n_terms = 3
for label, id in [(id_to_tag[id], id) for id in df1['TagCode'].unique()]:
    chi2 = sk.feature_selection.chi2(text_tfidf.toarray(), 
                                     df1['TagCode'] == id)
    indices = np.argsort(chi2[0])
    feature_names = np.array(fv.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]        
    print(label, ":", unigrams[-n_terms:], bigrams[-n_terms:])

python : ['def', 'self', 'python'] ['nan nan', 'lib python', 'site package']
java : ['jar', 'println', 'java'] ['override public', 'import java', 'public void']
node.js : ['node', 'express', 'nodejs'] ['req body', 'res send', 'req res']
android : ['fragment', 'layout', 'android'] ['android app', 'android studio', 'android layout']
vue : ['vuex', 'vuetify', 'vue'] ['vue vue', 'vue component', 'vue app']
flutter : ['widget', 'dart', 'flutter'] ['widget build', 'children widget', 'buildcontext context']
c++ : ['cpp', 'cout', 'std'] ['include iostream', 'std cout', 'std string']
javascript : ['var', 'div', 'javascript'] ['document queryselector', 'document getelementbyid', 'console log']
ruby-on-rails : ['gem', 'activerecord', 'rail'] ['end def', 'end end', 'ruby rail']
css : ['border', 'background', 'css'] ['text decoration', 'background color', 'css file']
django : ['charfield', 'model', 'django'] ['charfield max', 'model charfield', 'rest framework']
c# : ['writeline', 'public', 'csharp

### Modèle bayésien naïf multinomial 

In [11]:
model = sk.naive_bayes.MultinomialNB()
model.fit(X_tr, y_tr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
def print_scores(y, y_pred):
    score = sk.metrics.recall_score(y, y_pred, average='micro')    
    print(f"Rappel = {score:0.2f}")
    score = sk.metrics.precision_score(y, y_pred, average='micro')
    print(f"Précision = {score:0.2f}")
    score = sk.metrics.f1_score(y, y_pred, average='micro')
    print(f"F1 = {score:0.2f}")    
    score = sk.metrics.accuracy_score(y, y_pred)    
    print(f"Justesse = {score:0.2f}")    
    print()

In [13]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(y_t, y_pred)

*** Scores sur données d'entraînement
Rappel = 0.52
Précision = 0.52
F1 = 0.52
Justesse = 0.52

*** Scores sur données de test :
Rappel = 0.49
Précision = 0.49
F1 = 0.49
Justesse = 0.49



In [14]:
names = [id_to_tag[i] for i in y_t.sort_values().unique()]
print(sk.metrics.classification_report(y_t, y_pred, 
                                       target_names=names))

  'precision', 'predicted', average, warn_for)


               precision    recall  f1-score   support

       python       0.43      0.95      0.59      2022
         java       0.58      0.66      0.62       881
      node.js       0.58      0.12      0.20       216
      android       0.65      0.59      0.62       507
          vue       0.00      0.00      0.00        67
      flutter       0.95      0.44      0.60       122
          c++       0.80      0.53      0.64       279
   javascript       0.39      0.87      0.54      1476
ruby-on-rails       1.00      0.01      0.02        93
          css       0.00      0.00      0.00       109
       django       0.50      0.01      0.02       120
           c#       0.57      0.68      0.62       743
   typescript       0.00      0.00      0.00        70
      laravel       0.00      0.00      0.00       103
          php       0.64      0.56      0.60       587
     firebase       0.00      0.00      0.00        43
       jquery       0.00      0.00      0.00        85
         

In [15]:
# Example
row = 0
v = text_tfidf[row, :]
predicted = model.predict(v)[0]
expected = df1.iloc[row]['TagCode']
print(f"Labels prédit et attendu pour ligne {row} : {predicted, expected}")
df.iloc[row]

Labels prédit et attendu pour ligne 0 : (0, 0)


Id                                               57560002
Tags    <python><amazon-web-services><amazon-s3><boto3...
Text    python script size sthree bucket clanguagesv s...
Name: 1, dtype: object

### Régression logistique

In [16]:
model = sk.linear_model.LogisticRegression(multi_class='multinomial', 
                                           solver='sag')
model.fit(X_tr, y_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(y_t, y_pred)

*** Scores sur données d'entraînement
Rappel = 0.74
Précision = 0.74
F1 = 0.74
Justesse = 0.74

*** Scores sur données de test :
Rappel = 0.63
Précision = 0.63
F1 = 0.63
Justesse = 0.63



In [18]:
names = [id_to_tag[i] for i in y_t.sort_values().unique()]
print(sk.metrics.classification_report(y_t, y_pred, 
                                       target_names=names))

               precision    recall  f1-score   support

       python       0.65      0.91      0.76      2022
         java       0.63      0.68      0.65       881
      node.js       0.52      0.38      0.44       216
      android       0.67      0.70      0.68       507
          vue       0.42      0.12      0.19        67
      flutter       0.81      0.63      0.71       122
          c++       0.81      0.72      0.76       279
   javascript       0.52      0.76      0.62      1476
ruby-on-rails       0.83      0.57      0.68        93
          css       0.50      0.22      0.31       109
       django       0.52      0.36      0.42       120
           c#       0.65      0.75      0.69       743
   typescript       0.52      0.23      0.32        70
      laravel       0.64      0.17      0.27       103
          php       0.66      0.74      0.70       587
     firebase       0.48      0.33      0.39        43
       jquery       0.40      0.05      0.08        85
         

### SVM linéaire

#### Optimisation des hyperparamètres

In [19]:
base = sk.svm.LinearSVC(dual=False, random_state=SEED)
grid = {'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'max_iter': [1000, 2000, 5000]
       }
model = sk.model_selection.GridSearchCV(base,
                                        param_grid=grid,
                                        cv=5,
                                        n_jobs=-1)
if CALC_SLOW_SECTIONS:
    model.fit(X_tr, y_tr)
    model.best_params_

Résultats : {'C': 1, 'penalty': 'l1', 'max_iter': 1000}

#### Entraînement du modèle

In [20]:
model = sk.svm.LinearSVC(C=1.0, penalty='l1', max_iter=1000, dual=False)
model.fit(X_tr, y_tr)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
          verbose=0)

In [21]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(y_t, y_pred)

*** Scores sur données d'entraînement
Rappel = 0.84
Précision = 0.84
F1 = 0.84
Justesse = 0.84

*** Scores sur données de test :
Rappel = 0.66
Précision = 0.66
F1 = 0.66
Justesse = 0.66




## Classement multilabel

### Réduction de dimension de la matrice documents-termes
Pas nécessaire pour l'instant, mais à faire quand même

In [22]:
fv = TfidfVectorizer(analyzer='word', min_df=20, max_df=0.3, 
                     ngram_range=(1, 1))
text_tfidf = fv.fit_transform(df['Text'])
print(f"Taille de la matrice TFIDF : {text_tfidf.shape}")
X = text_tfidf
y = tags
X_tr, X_t, y_tr, y_t = sk.model_selection.train_test_split(X, y, 
                                                           test_size=0.3, 
                                                           random_state=SEED)
X_tr.shape, y_tr.shape, X_t.shape, y_t.shape

Taille de la matrice TFIDF : (42096, 7035)


((29467, 7035), (29467, 150), (12629, 7035), (12629, 150))

In [23]:
print(f"{len(fv.stop_words_)} termes exclus")
#print(" ".join(fv.stop_words_))

191709 termes exclus


### Classement indépendant par catégorie

#### Modèle MultinomialNB

In [24]:
y_pred_t = scipy.sparse.lil_matrix(y_t.shape)
y_pred_tr = scipy.sparse.lil_matrix(y_tr.shape)
model = sk.naive_bayes.MultinomialNB(fit_prior=True, 
                                     class_prior=None)
for i, feature in enumerate(features):
    y_label_tr = y_tr[:, i].toarray().reshape(-1)
    y_label_t = y_t[:, i].toarray().reshape(-1)
    model.fit(X_tr.toarray(), y_label_tr)    
    y_pred1_tr = model.predict(X_tr)
    y_pred1_t = model.predict(X_t)
    score_tr = sk.metrics.recall_score(y_label_tr, y_pred1_tr)
    score_t = sk.metrics.recall_score(y_label_t, y_pred1_t)
    y_pred_t[:, i] = y_pred1_t.reshape(-1, 1)
    y_pred_tr[:, i] = y_pred1_tr.reshape(-1, 1)
    print(f"{feature} ({score_tr*100:0.0f}%, {score_t*100:0.0f}%)", end=' ')

.net (0%, 0%) ajax (0%, 0%) algorithm (0%, 0%) amazon (11%, 6%) android (48%, 47%) angular (15%, 14%) ansible (0%, 0%) apache (7%, 5%) api (0%, 0%) arrays (0%, 0%) asp.net (6%, 5%) asynchronous (0%, 0%) authentication (0%, 0%) aws (0%, 0%) azure (1%, 4%) bash (1%, 0%) beautifulsoup (0%, 0%) bootstrap (1%, 0%) c (1%, 1%) c# (19%, 14%) c++ (18%, 16%) class (0%, 0%) css (35%, 36%) csv (0%, 0%) dart (3%, 2%) database (0%, 0%) dataframe (0%, 0%) date (0%, 0%) datetime (0%, 0%) deep-learning (0%, 0%) dictionary (0%, 0%) django (10%, 11%) docker (10%, 9%) dplyr (0%, 0%) ecma (0%, 0%) elasticsearch (0%, 2%) eloquent (0%, 0%) entity-framework (0%, 0%) excel (27%, 21%) express (0%, 0%) facebook (0%, 0%) file (0%, 0%) firebase (4%, 5%) flask (0%, 0%) flutter (28%, 24%) for-loop (0%, 0%) forms (0%, 0%) function (0%, 0%) ggplot2 (0%, 0%) git (1%, 0%) github (0%, 0%) go (0%, 0%) google-app (3%, 0%) google-chrome (0%, 0%) google-cloud (0%, 0%) google-maps (0%, 0%) google-sheets (0%, 0%) gradle (0%, 0

In [25]:
print_scores(y_t, y_pred_t)

Rappel = 0.16
Précision = 0.80
F1 = 0.26
Justesse = 0.09



Rappel = 0.05
Précision = 0.78
Justesse = 0.09
F1 = 0.02

#### Régression logistique

In [26]:
filename = "data/P6-MultiLabelLogisticRegressionOutput.sav"

if CALC_SLOW_SECTIONS:
    y_pred_t = scipy.sparse.lil_matrix(y_t.shape)
    y_pred_tr = scipy.sparse.lil_matrix(y_tr.shape)
    model = sk.linear_model.LogisticRegression(multi_class='multinomial', 
                                               solver='sag', n_jobs=4)

    for i, f in enumerate(features):
        y_label_tr = y_tr[:, i].toarray().reshape(-1)
        y_label_t = y_t[:, i].toarray().reshape(-1)
        model.fit(X_tr.toarray(), y_label_tr)    
        y_pred1_tr = model.predict(X_tr)
        y_pred1_t = model.predict(X_t)
        score_tr = sk.metrics.recall_score(y_label_tr, y_pred1_tr)
        score_t = sk.metrics.recall_score(y_label_t, y_pred1_t)
        y_pred_t[:, i] = y_pred1_t.reshape(-1, 1)
        y_pred_tr[:, i] = y_pred1_tr.reshape(-1, 1)
        print(f"{f} ({score_tr*100:0.0f}%, {score_t*100:0.0f}%)", end=' ')
        data = (y_pred_tr, y_pred_t)
        pickle.dump(data, open(filename, 'wb'))
else:
    y_pred_tr, y_pred_t = pickle.load(open(filename, 'rb'))
    print(y_pred_tr.shape, y_pred_t.shape)

.net (9%, 6%) ajax (40%, 31%) algorithm (4%, 2%) amazon (53%, 48%) android (72%, 67%) angular (65%, 60%) ansible (52%, 44%) apache (42%, 33%) api (5%, 4%) arrays (25%, 21%) asp.net (46%, 39%) asynchronous (3%, 0%) authentication (3%, 0%) aws (31%, 25%) azure (64%, 54%) bash (30%, 25%) beautifulsoup (43%, 21%) bootstrap (23%, 12%) c (42%, 29%) c# (53%, 42%) c++ (60%, 49%) class (4%, 0%) css (58%, 50%) csv (31%, 17%) dart (27%, 25%) database (1%, 0%) dataframe (16%, 19%) date (3%, 3%) datetime (16%, 11%) deep-learning (3%, 3%) dictionary (17%, 12%) django (69%, 64%) docker (71%, 60%) dplyr (4%, 7%) ecma (0%, 0%) elasticsearch (51%, 33%) eloquent (10%, 0%) entity-framework (3%, 3%) excel (61%, 57%) express (34%, 23%) facebook (55%, 39%) file (0%, 2%) firebase (56%, 54%) flask (50%, 38%) flutter (74%, 60%) for-loop (4%, 2%) forms (4%, 0%) function (2%, 0%) ggplot2 (61%, 53%) git (62%, 45%) github (22%, 19%) go (27%, 16%) google-app (42%, 27%) google-chrome (28%, 16%) google-cloud (45%, 38%

In [27]:
print("*** Scores sur données d'entraînement")
print_scores(y_tr, y_pred_tr)
print("*** Scores sur données de test :")
print_scores(y_t, y_pred_t)

*** Scores sur données d'entraînement
Rappel = 0.49
Précision = 0.89
F1 = 0.63
Justesse = 0.34

*** Scores sur données de test :
Rappel = 0.41
Précision = 0.83
F1 = 0.55
Justesse = 0.25



#### SVM linéaire

In [28]:
i = 0
f = features[i]
base = sk.svm.LinearSVC(dual=False, random_state=SEED)
grid = {'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'max_iter': [1000, 2000, 5000]
       }
model = sk.model_selection.GridSearchCV(base,
                                        param_grid=grid,
                                        cv=5,
                                        n_jobs=1)
y_label_tr = y_tr[:, i].toarray().reshape(-1)
model.fit(X_tr.toarray(), y_label_tr)    
model.best_params_



{'C': 1, 'max_iter': 1000, 'penalty': 'l2'}

In [29]:
y_pred_t = scipy.sparse.lil_matrix(y_t.shape)
y_pred_tr = scipy.sparse.lil_matrix(y_tr.shape)
model = sk.svm.LinearSVC(C=1.0, penalty='l1', max_iter=1000, 
                         multi_class='ovr', dual=False)

for i, f in enumerate(features):
    y_label_tr = y_tr[:, i].toarray().reshape(-1)
    y_label_t = y_t[:, i].toarray().reshape(-1)
    model.fit(X_tr.toarray(), y_label_tr)    
    y_pred1_tr = model.predict(X_tr)
    y_pred1_t = model.predict(X_t)
    score_tr = sk.metrics.recall_score(y_label_tr, y_pred1_tr)
    score_t = sk.metrics.recall_score(y_label_t, y_pred1_t)
    y_pred_t[:, i] = y_pred1_t.reshape(-1, 1)
    y_pred_tr[:, i] = y_pred1_tr.reshape(-1, 1)
    print(f"{f} ({score_tr*100:0.0f}%, {score_t*100:0.0f}%)", end=' ')
    data = (y_pred_tr, y_pred_t)
    pickle.dump(data, open(filename, 'wb'))

.net (19%, 15%) ajax (56%, 45%) algorithm (14%, 6%) amazon (73%, 65%) android (81%, 71%) angular (81%, 78%) ansible (90%, 93%) apache (64%, 49%) api (0%, 0%) arrays (29%, 18%) asp.net (61%, 54%) asynchronous (4%, 0%) authentication (1%, 0%) aws (49%, 43%) azure (79%, 72%) bash (48%, 41%) beautifulsoup (63%, 48%) bootstrap (38%, 26%) c (62%, 49%) c# (69%, 57%) c++ (75%, 61%) class (1%, 0%) css (68%, 52%) csv (44%, 21%) dart (48%, 31%) database (2%, 0%) dataframe (23%, 11%) date (1%, 0%) datetime (22%, 15%) deep-learning (10%, 0%) dictionary (22%, 17%) django (88%, 81%) docker (79%, 71%) dplyr (40%, 33%) ecma (0%, 0%) elasticsearch (80%, 61%) eloquent (36%, 23%) entity-framework (17%, 9%) excel (76%, 68%) express (44%, 29%) facebook (75%, 59%) file (0%, 0%) firebase (73%, 62%) flask (71%, 66%) flutter (88%, 80%) for-loop (3%, 2%) forms (2%, 0%) function (1%, 0%) ggplot2 (83%, 76%) git (77%, 70%) github (39%, 42%) go (57%, 48%) google-app (56%, 40%) google-chrome (43%, 30%) google-cloud (

In [30]:
print("*** Scores sur données d'entraînement")
print_scores(y_tr, y_pred_tr)
print("*** Scores sur données de test :")
print_scores(y_t, y_pred_t)

*** Scores sur données d'entraînement
Rappel = 0.62
Précision = 0.91
F1 = 0.74
Justesse = 0.45

*** Scores sur données de test :
Rappel = 0.51
Précision = 0.81
F1 = 0.63
Justesse = 0.32



### Utilisation de OneVsRestClassifier

In [None]:
base = sk.linear_model.LogisticRegression(multi_class='multinomial',
                                          solver='sag',
                                          n_jobs=-1)
model = sk.multiclass.OneVsRestClassifier(base)
model.fit(X_tr.toarray(), y_tr)

In [None]:
print("*** Scores sur données d'entraînement")
y_pred_tr = model.predict(X_tr)
print_scores(y_tr, y_pred_tr)
print("*** Scores sur données de test :")
y_pred_t = model.predict(X_t)
print_scores(y_t, y_pred_t)

Mêmes résultats que précédemment.

In [None]:
y_correct = y_t.toarray()
for row in range(30):
    y_pred_tr_row = y_pred_t[row, :].toarray()[0]
    pred = [id_to_tag[i] for i in np.where(y_pred_tr_row == 1)[0] ]
    labels = [id_to_tag[i] for i in np.where(y_correct[row, :] == 1)[0] ]
    print(row, pred, labels)

### Classement en chaîne (ClassifierChain)

#### Régression logistique

In [None]:
filename = "data/P6-ClassifierChain.sav"

if CALC_SLOW_SECTIONS:
    base = sk.linear_model.LogisticRegression(multi_class='multinomial', 
                                          solver='sag')
    model = sklearn.multioutput.ClassifierChain(base)
    model.fit(X_tr.toarray(), y_tr)
    pickle.dump(model, open(filename, "wb"))
else:
    model = pickle.load(open(filename, "rb" ))
    
model

In [None]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(y_t, y_pred)

#### SVM linéaire

In [None]:
#filename = "data/P6-MultiLabelSVMOutput.sav"

base = sk.svm.LinearSVC(C=1.0, penalty='l1', max_iter=1000, 
                         multi_class='ovr', dual=False)
model = sklearn.multioutput.ClassifierChain(base)
model.fit(X_tr.toarray(), y_tr.toarray())

In [None]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(y_t, y_pred)

Performances légèrement supérieures à celles d'une SVM simple.

## Analyse non supervisée : LDA

### Préparation des données

In [28]:
cv = CountVectorizer(min_df=20, max_df=0.3)
text_words = cv.fit_transform(df['Text'])
print(f"Taille de la matrice de vocabulaire : {text_words.shape}")

Taille de la matrice de vocabulaire : (42096, 7035)


In [29]:
words_tr, words_t = sk.model_selection.train_test_split(text_words, 
                                                        test_size=0.3, 
                                                        random_state=SEED)
words_tr.shape, words_t.shape

((29467, 7035), (12629, 7035))

### Optimisation des hyperparamètres

#### Recherche sur grille avec le score par défaut

In [None]:
grid = {'n_components': [10, 20, 25, 30, 40, 50], 
        'learning_decay': [.5, .7, .9]
       }
base = sk.decomposition.LatentDirichletAllocation(random_state=SEED)
model = sk.model_selection.GridSearchCV(base, param_grid=grid, cv=5, n_jobs=6)
if CALC_SLOW_SECTIONS:
    model.fit(text_words)
    model.best_params_

#### Recherche sur grille avec score de perplexité

In [None]:
# LDA retournant score = -perplexité
class LDAWithPerplexityScorer(sk.decomposition.LatentDirichletAllocation):
    def score(self, X, y=None):
        score = super(LDAWithPerplexityScorer, self).perplexity(X)
        return -1 * score

In [None]:
grid = {'n_components': [10, 20, 25, 30, 40, 50], 
        'learning_decay': [.5, .7, .9]
       }
base = LDAWithPerplexityScorer(random_state=SEED)
model = sk.model_selection.GridSearchCV(base, param_grid=grid, cv=5, n_jobs=-1)
if CALC_SLOW_SECTIONS:
    model.fit(text_words)
    model.best_params_

Résultats sauvegardés (identiques avec les 2 méthodes d'évaluation) : 

{'learning_decay': 0.5, 'n_components': 10}

#### Recherche manuelle

In [None]:
n_components = [10, 25, 50, 75, 100, 125, 150]
scores = []
if CALC_SLOW_SECTIONS:
    for n in n_components:
        lda = sk.decomposition.LatentDirichletAllocation(n_components=n,
                                                         learning_decay=0.5,
                                                         random_state=SEED,
                                                         n_jobs=2)
        lda.fit(words_tr)
        score = lda.perplexity(words_t)
        #print(n, score)
        scores.append(score)
else:
    scores = [1062.17, 943.89, 888.53, 878.74, 884.63, 889.28, 905.38]
axes = sns.lineplot(n_components, scores);
axes.set_title("Score de perplexité en fonction du nombre de sujets");

Nous retenons 75 comme valeur optimale de `n_components`.

### Entraînement du modèle

In [30]:
lda = sk.decomposition.LatentDirichletAllocation(n_components=75,
                                                 learning_decay=0.5,
                                                 learning_method='online',
                                                 random_state=SEED,
                                                 n_jobs=1)
output = lda.fit_transform(text_words)
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.5,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=75, n_jobs=1,
                          perp_tol=0.1, random_state=1911,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [34]:
n_keywords = 5
words = cv.get_feature_names()
for i, topic in enumerate(lda.components_):
    keywords = [words[i] + f"({topic[i]:.1f})" for i in topic.argsort()[:-n_keywords-1:-1]]
    print(i, " ".join(keywords))
# afficher des examples

0 user(22123.6) password(5016.0) login(3995.5) auth(2518.9) job(2494.3)
1 page(7760.0) element(5449.1) elements(2465.3) apply(2168.6) part(1656.1)
2 delete(4542.8) tag(4281.5) home(2500.6) route(2335.3) env(1979.1)
3 message(9971.5) service(7550.0) microsoft(2080.0) azure(1841.7) warn(1635.5)
4 php(7969.3) search(4861.9) category(3370.9) result(3310.9) echo(2851.5)
5 feature(2201.7) arr(1616.6) age(1461.8) learn(1424.2) basic(1338.7)
6 event(6713.7) angular(3757.0) template(3589.2) root(2500.7) constructor(1503.5)
7 instance(3520.4) address(3479.8) cache(3121.5) book(2358.9) builder(2162.6)
8 point(4417.4) obj(2188.3) stack(1879.3) limit(1671.6) memory(1465.5)
9 total(2725.2) account(2297.8) share(2203.4) comment(2153.0) employee(1551.5)
10 self(17936.8) def(3934.7) let(3215.6) return(2717.9) func(2596.4)
11 cluster(1472.5) ignore(899.7) star(830.7) demo(728.1) slide(722.1)
12 access(5747.9) application(5647.5) config(3865.6) open(3423.9) issue(1970.7)
13 table(15204.7) create(4679.9) 

In [7]:
n_keywords = 10
words = cv.get_feature_names()
for i, topic in enumerate(lda.components_):
    keywords = [words[i] for i in topic.argsort()[:-n_keywords-1:-1]]
    print(i, " ".join(keywords))
# afficher des examples

0 user password login auth job controller admin redirect timestamp laravel
1 page element elements apply part mobile find amp fix embed
2 delete tag home route env git master params old rout
3 message service microsoft azure warn send configure configuration pipeline publish
4 php search category result echo entry post param products function
5 feature arr age learn basic area turn course keyword reduce
6 event angular template root constructor events doc disable subscribe common
7 instance address cache book builder final rule listener webview management
8 point obj stack limit memory dict dictionary bite printf trace
9 total account share comment employee unique apps round summary permissions
10 self def let return func init class nil struct counter
11 cluster ignore star demo slide pygame rail ruby slider rat
12 access application config open issue register network allow set fine
13 table create program insert database answer question year company many
14 print line import output st

### Visualisation

In [11]:
def clean_tags(s):
    l = re.findall(r'\<(.*?)\>', s)
    return " ".join(l)

In [48]:
output = lda.transform(text_words) # Matrice documents-sujets
topic_names = ["S" + str(i) for i in range(lda.n_components)]
doc_names = ["D" + str(i) for i in range(len(df))]
df_topics = pd.DataFrame(np.round(output, 2), 
                         columns=topic_names, 
                         index=doc_names)
dominant_topic = np.argmax(df_topics.values, axis=1)
sorted_output = np.argsort(-output, axis=1)
df_topics['First_topic'] = sorted_output[:, 0]
df_topics['Second_topic'] = sorted_output[:, 1]
df_topics['Third_topic'] = sorted_output[:, 2]
df_topics['Tags'] = df['Tags'].apply(clean_tags).values

print("Extrait de la matrice documents-sujets :")
df_topics[['First_topic', 'Second_topic', 'Third_topic', 'Tags']].head(20)

Extrait de la matrice documents-sujets :


KeyError: "['Test'] not in index"

In [27]:
def show_topics(vectorizer, lda_model, n_words=5):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    i = 0
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
        i = i+1
    return topic_keywords

#topic_keywords = show_topics(cv, lda, n_words=10)        
topic_keywords = show_topics(fv, lda, n_words=10)        
df_keywords = pd.DataFrame(topic_keywords)
df_keywords.columns = [f'Terme {i+1}' for i in range(df_keywords.shape[1])]
df_keywords.index = [f'Sujet {i+1}' for i in range(df_keywords.shape[0])]
df_keywords

Unnamed: 0,Terme 1,Terme 2,Terme 3,Terme 4,Terme 5,Terme 6,Terme 7,Terme 8,Terme 9,Terme 10
Sujet 1,product,price,products,bar,foo,stock,typename,quantity,carousel,encrypt
Sujet 2,login,redirect,auth,password,user,session,cookie,oauth,authorization,flask
Sujet 3,train,model,shape,tensorflow,keras,layer,msg,fit,numpy,reduce
Sujet 4,category,httpd,west,complexity,album,digital,magento,imagepath,forums,applewebkit
Sujet 5,std,struct,stripe,rust,impl,pub,trait,europe,variants,unwrap
...,...,...,...,...,...,...,...,...,...,...
Sujet 71,multiprocessing,cuda,scrollbar,environ,shortest,prefetch,preferences,preference,preferably,preferable
Sujet 72,aaa,preference,preferably,preferable,prefer,pref,predictor,predictions,prediction,predict
Sujet 73,dataframe,row,column,sum,date,pandas,columns,matrix,datetime,group
Sujet 74,aaa,preference,preferably,preferable,prefer,pref,predictor,predictions,prediction,predict


In [12]:
pyLDAvis.enable_notebook()
# panel = pyLDAvis.sklearn.prepare(lda, text_tfidf, fv, mds='tsne')
panel = pyLDAvis.sklearn.prepare(lda, text_tfidf, fv, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Analyse non supervisée : LSA

In [None]:
fv = TfidfVectorizer(analyzer='word', min_df=20, max_df=0.3)
text_tfidf = fv.fit_transform(df['Text'])
print(f"Taille de la matrice Tfidf : {text_tfidf.shape}")

In [None]:
svd_model = sk.decomposition.TruncatedSVD(n_components=75, 
                                          algorithm='randomized', 
                                          n_iter=100, 
                                          random_state=SEED)
svd_model.fit(text_tfidf)
len(svd_model.components_)

In [None]:
def show_topics(vectorizer, model, n_words=5):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    i = 0
    for topic_weights in model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
        i = i+1
    return topic_keywords

topic_keywords = show_topics(fv, svd_model, n_words=10)        
df_keywords = pd.DataFrame(topic_keywords)
df_keywords.columns = [f'Terme {i+1}' for i in range(df_keywords.shape[1])]
df_keywords.index = [f'Sujet {i+1}' for i in range(df_keywords.shape[0])]
df_keywords

## Analyse par plongement lexical [pas une priorité]

In [None]:
model = spacy.load("en_core_web_md")

In [None]:
doc = model("This is some text that I am processing with Spacy")
len(doc[3].vector)
type(doc), type(doc[3]), type(doc[3].vector)