# Catégorisation automatique de questions

## Initialisation

In [101]:
# Standard libraries
import os
import re
import sys

# External libraries
import bs4
import matplotlib 
import matplotlib.pyplot as plt
import nltk
import nltk.stem.porter
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import spacy
import sklearn as sk
import sklearn.decomposition
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.feature_selection
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.svm
import string

FILENAME = "P6 SO latest 50k.csv"

# Effacer les colonnes inutilisées pour libérer de la mémoire ?
DEL_UNUSED_COLS = False

SEED = 1911

In [2]:
df = pd.read_csv(FILENAME)
print(f"Taille des données : {df.shape}")
df.head(10)

Taille des données : (50000, 6)


Unnamed: 0,Id,CreationDate,Score,Body,Title,Tags
0,57560001,2019-08-19 15:43:13,0,<p>found the answer!</p>\n\n<pre><code>rdd = s...,,
1,57560002,2019-08-19 15:43:15,1,<p>I am trying to get the S3 Bucket size in aw...,Python Script to get the size of the S3 Bucket...,<python><amazon-web-services><amazon-s3><boto3...
2,57560003,2019-08-19 15:43:15,0,<p>I am trying to set up the virtual environme...,"The command "".\Scripts\activate"" is not working",<python>
3,57560005,2019-08-19 15:43:20,0,"<p>As we all know, we can automagicaly generat...",Is Java annotation processor capable of removi...,<java><annotations><conditional-compilation>
4,57560006,2019-08-19 15:43:25,0,<p>The GoDaddy package my friend got doesn't h...,Multer-sftp won't upload over port 21,<node.js><upload><sftp><multer>
5,57560007,2019-08-19 15:43:26,0,<p>Maybe it works when you initialize the OnCl...,,
6,57560008,2019-08-19 15:43:28,0,<p>I was coding these past two weeks and it wa...,Android studio all xml layout broke and now co...,<android><android-layout>
7,57560009,2019-08-19 15:43:32,1,"<p>According to you document sample, <code>con...",,
8,57560010,2019-08-19 15:43:33,3,"<p>Create a lookup table:</p>\n\n<p><a href=""h...",,
9,57560012,2019-08-19 15:43:36,-1,<p>I have a table I queried from SQL:</p>\n\n<...,Pandas - transform SQL style dataframe to columns,<pandas>


In [3]:
print("Nombre de valeurs manquantes par colonne :")
df.isna().sum(axis=0)

Nombre de valeurs manquantes par colonne :


Id                  0
CreationDate        0
Score               0
Body               30
Title           25558
Tags            25558
dtype: int64

In [4]:
mask = df['Tags'].isna()
print(f"Suppression de {sum(mask)} lignes.")
df = df[~mask]
print(f"Nouvelle taille des données : {df.shape}")

Suppression de 25558 lignes.
Nouvelle taille des données : (24442, 6)


## Analyse initiale

### Traitement des étiquettes

In [5]:
def tokenize_tags(t):
    tags_list = re.findall('\<(.*?)\>', t)   
    return tags_list

In [6]:
cv = CountVectorizer(analyzer='word', tokenizer=tokenize_tags)
tags = cv.fit_transform(df['Tags'])
print(f"Taille de la matrice de labels : {tags.shape}")

Taille de la matrice de labels : (24442, 9414)


In [7]:
type(print(f"Nombre d'étiquettes distinctes : {len(cv.get_feature_names())}"))
print(f"Nombre d'étiquettes attribuées : {tags.sum()}")

Nombre d'étiquettes distinctes : 9414
Nombre d'étiquettes attribuées : 69273


In [8]:
tags_s = pd.Series(np.squeeze(np.asarray(tags.sum(axis=0))), 
                   index=cv.get_feature_names())
tags_s.sort_values(inplace=True, ascending=False)
print("Labels les plus fréquents :")
tags_s.head(20)

Labels les plus fréquents :


python        2890
javascript    2580
java          1601
c#            1388
android       1241
php           1071
python-3.x     949
html           941
reactjs        768
r              699
node.js        653
angular        630
css            617
sql            585
mysql          517
c++            502
pandas         486
jquery         485
excel          445
swift          405
dtype: int64

In [9]:
print("Labels les moins fréquents :")
tags_s.tail(20)

Labels les moins fréquents :


grocery-crud            1
gravityforms            1
readdir                 1
graylog                 1
greasemonkey            1
greedy                  1
readonly                1
readme                  1
readline                1
grep-indesign           1
grid-layout             1
read-eval-print-loop    1
grid-search             1
gridex                  1
gridextra               1
readability             1
read-the-docs           1
gridsome                1
gridster                1
linqkit                 1
dtype: int64

In [104]:
def get_first_tag(s):
    tags = tokenize_tags(s)
    return tags[0]

df['Tag'] = df['Tags'].apply(get_first_tag)
df['TagCode'] = df['Tag'].factorize()[0]
tags_df = df[['Tag', 'TagCode']].drop_duplicates().sort_values('TagCode')
tag_to_id = dict(tags_df.values)
id_to_tag = dict(tags_df[['TagCode', 'Tag']].values)
df[['Tag', 'TagCode', 'Tags']].head(10)

Unnamed: 0,Tag,TagCode,Tags
1,python,0,<python><amazon-web-services><amazon-s3><boto3...
2,python,0,<python>
3,java,1,<java><annotations><conditional-compilation>
4,node.js,2,<node.js><upload><sftp><multer>
6,android,3,<android><android-layout>
9,pandas,4,<pandas>
14,python,0,<python><bash><git><anaconda><conda>
15,api,5,<api><filenet-p8><filenet-process-engine>
17,vue.js,6,<vue.js><axios>
19,flutter,7,<flutter><dart><flutter-layout>


In [89]:
TRESHOLD = 50
tag_counts = df['Tag'].value_counts() 
print(f"Nombre de labels uniques : {len(tag_counts)}")
print(f"Nombre de labels apparaissant moins de {TRESHOLD} fois : "
      f"{sum(tag_counts < TRESHOLD)}")

Nombre de labels uniques : 1787
Nombre de labels apparaissant moins de 50 fois : 1729


In [90]:
del_labels = tag_counts[tag_counts < TRESHOLD].index.values
del_mask = df['Tag'].isin(del_labels.tolist())
print(f"Nombre de lignes à effacer : {sum(del_mask)} sur {len(del_mask)}")

Nombre de lignes à effacer : 5353 sur 24442


In [13]:
print("Labels restant après simplification :")
" ".join(tag_counts[tag_counts >= TRESHOLD].index.values)

Labels restant après simplification :


'python javascript java c# php android r c++ python-3.x sql angular html node.js ios excel reactjs mysql c swift flutter django css jquery react-native sql-server laravel linux ruby-on-rails amazon-web-services docker azure powershell wordpress typescript json git vue.js spring-boot bash mongodb regex scala windows postgresql spring asp.net go arrays oracle vba apache-spark vb.net google-apps-script firebase elasticsearch kubernetes jenkins ruby'

### Préparation du texte

In [14]:
stop_words = set(nltk.corpus.stopwords.words("english"))
new_words = ["using", "trying", "running", "want", "except", "guys", "get", 
             "code", "run", "might", "tried", "whenever", "current", "gives", 
             "name", "try", "must", "know", "looks", "problem", "problems",
             "anyone", "without", "the"]
stop_words = stop_words.union(new_words)
stop_words = stop_words.union(list(string.punctuation))

In [15]:
replace_list = [('vt100', 'vthundred'),
                ('port 25', 'porttwentyfive'),
                ('port 8000', 'porteightthousand'),
                ('port 8080', 'porteightyeighty'),
                ('2D', 'twodim'),
                ('3D', 'threedim'),
                ('vt100', 'vthundred'),
                ('c++11', 'cppeleven'),
                ('c++', 'cpp'),
                ('g++', 'gpp'),
                ('s3', 'sthree'),
                ('x64', 'xsixtyfour')
               ]

regex_list = [(r'\br\b', 'RSoftware'),
              (r'\s.net\b', 'dotnet')]

def replace_words_containing_non_alphas(s):
    for w1, w2 in replace_list:
        s = s.replace(w1, w2)
    for w1, w2 in regex_list:
        s = re.sub(w1, w2, s)
    return s
                
s = "The SMTP protocol is assigned the port 25."
replace_words_containing_non_alphas(s)

'The SMTP protocol is assigned the porttwentyfive.'

In [16]:
def tokenize_text(s): 
    # Suppression des balises HTML
    cleaner = lambda text: bs4.BeautifulSoup(text, 'html.parser').get_text()
    s = cleaner(s).lower()
    # Élimine chiffres et caractères spéciaux des mots importants
    s = replace_words_containing_non_alphas(s)
    # Suppression des caractères non alphabétiques
    s = re.sub(r'[^a-zA-Z]', ' ', s)    
    # Suppression des mots de 1 ou 2 lettres
    s = re.sub(r'\b[a-z]{1,2}\b', '', s)
    # Suppression des mots dans `stop_words`
    s = [w for w in nltk.tokenize.word_tokenize(s) if w not in stop_words]    
    # Lemmatisation
    stemmed = []
    lemmatizer = nltk.stem.WordNetLemmatizer() 
    for item in s:
        stemmed.append(lemmatizer.lemmatize(item, pos='v'))
    return ' '.join(stemmed)    

example = "The time zone (+10) was configured under Regional Settings."\
          "C++11 is a version of the standard for the language C++."\
          "The R website is http://www.r-project.net. "\
          ".NET Framework (pronounced as dot net) is a software framework"
tokenize_text(example)

'time zone configure regional settings cppeleven version standard language cpp RSoftware website http www RSoftware project net dotnet framework pronounce dot net software framework'

In [17]:
df['Text'] = (df['Title'] + ' ' + df['Body']).apply(tokenize_text)
df[['Title', 'Body', 'Text']].to_csv('processed_text.csv', header=False)
df.head(20)

Unnamed: 0,Id,CreationDate,Score,Body,Title,Tags,Tag,Text
1,57560002,2019-08-19 15:43:15,1,<p>I am trying to get the S3 Bucket size in aw...,Python Script to get the size of the S3 Bucket...,<python><amazon-web-services><amazon-s3><boto3...,python,python script size sthree bucket csv sthree bu...
2,57560003,2019-08-19 15:43:15,0,<p>I am trying to set up the virtual environme...,"The command "".\Scripts\activate"" is not working",<python>,python,command script activate work set virtual envir...
3,57560005,2019-08-19 15:43:20,0,"<p>As we all know, we can automagicaly generat...",Is Java annotation processor capable of removi...,<java><annotations><conditional-compilation>,java,java annotation processor capable remove annot...
4,57560006,2019-08-19 15:43:25,0,<p>The GoDaddy package my friend got doesn't h...,Multer-sftp won't upload over port 21,<node.js><upload><sftp><multer>,node.js,multer sftp upload port godaddy package friend...
6,57560008,2019-08-19 15:43:28,0,<p>I was coding these past two weeks and it wa...,Android studio all xml layout broke and now co...,<android><android-layout>,android,android studio xml layout break contain java c...
9,57560012,2019-08-19 15:43:36,-1,<p>I have a table I queried from SQL:</p>\n\n<...,Pandas - transform SQL style dataframe to columns,<pandas>,pandas,pandas transform sql style dataframe columns t...
14,57560017,2019-08-19 15:44:05,0,<p>I want to learn Data Science and so have us...,Stuck when setting up to use anaconda with VS ...,<python><bash><git><anaconda><conda>,python,stick set use anaconda integrate git terminal ...
15,57560019,2019-08-19 15:44:16,0,<p>I am confused with the various elements ava...,"What is the difference between queueelement , ...",<api><filenet-p8><filenet-process-engine>,api,difference queueelement stepelement workobject...
17,57560021,2019-08-19 15:44:19,0,"<p>Here is my code: \nSo is this, but now foll...",Why my Vue Js created method not working?,<vue.js><axios>,vue.js,vue create method work follow block app data c...
19,57560023,2019-08-19 15:44:27,0,<p>I am doing the search bar and I want to put...,How to position yourself at the end of a Row?,<flutter><dart><flutter-layout>,flutter,position end row search bar put filter icon en...


In [18]:
# Suppression des colonnes inutilisées
if DEL_UNUSED_COLS:
    df.drop(columns=['Body', 'Title', 'Tags'], inplace=True, errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24442 entries, 1 to 49999
Data columns (total 8 columns):
Id              24442 non-null int64
CreationDate    24442 non-null object
Score           24442 non-null int64
Body            24442 non-null object
Title           24442 non-null object
Tags            24442 non-null object
Tag             24442 non-null object
Text            24442 non-null object
dtypes: int64(2), object(6)
memory usage: 1.7+ MB


## Analyse non supervisée : LSA

In [19]:
# On ne conserve que les labels fréquents
df1 = df[~del_mask].copy()
# Utilisation de `min_df` nécessaire pour réduire la quantité de mots
fv = TfidfVectorizer(analyzer='word', min_df=0.001)
body_tfidf = fv.fit_transform(df1['Text'])
print(f"Taille de la matrice Tfidf : {body_tfidf.shape}")

Taille de la matrice Tfidf : (19089, 4168)


In [20]:
svd_model = sk.decomposition.TruncatedSVD(n_components=20, 
                                          algorithm='randomized', 
                                          n_iter=100, 
                                          random_state=122)
svd_model.fit(body_tfidf)
len(svd_model.components_)

20

## Classement mono-label

### Représentation des messages en matrice TFIDF

In [111]:
# On ne conserve que les labels fréquents
df1 = df[~del_mask].copy()
# Utilisation de `min_df` nécessaire pour réduire la quantité de mots
fv = TfidfVectorizer(analyzer='word', ngram_range=(1,2), 
                     min_df=0.001, stop_words='english')
text_tfidf = fv.fit_transform(df1['Text'])
print(f"Taille de la matrice Tfidf : {body_tfidf.shape}")

Taille de la matrice Tfidf : (19089, 10344)


In [143]:
X = text_tfidf
y = df1['TagCode']
X_tr, X_t, y_tr, y_t = sk.model_selection.train_test_split(X, y, 
                                                           test_size=0.3,
                                                           random_state=SEED)
X_tr.shape, y_tr.shape, X_t.shape, y_t.shape

((13362, 9798), (13362,), (5727, 9798), (5727,))

### Mots et bi-grammes les plus corrélés avec chaque catégorie

In [139]:
n_terms = 3
for label, id in [(id_to_tag[id], id) for id in df1['TagCode'].unique()]:
    chi2 = sk.feature_selection.chi2(text_tfidf.toarray(), 
                                     df1['TagCode'] == id)
    indices = np.argsort(chi2[0])
    feature_names = np.array(fv.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]        
    print(label, ":", unigrams[-n_terms:], bigrams[-n_terms:])

python : ['dataframe', 'self', 'python'] ['nan nan', 'lib python', 'site package']
java : ['org', 'println', 'java'] ['org apache', 'override public', 'public void']
node.js : ['mongoose', 'node', 'nodejs'] ['process env', 'function err', 'req res']
android : ['gradle', 'layout', 'android'] ['android app', 'android studio', 'android layout']
vue.js : ['vuetify', 'nuxt', 'vue'] ['div template', 'vue use', 'store state']
flutter : ['dart', 'widget', 'flutter'] ['widget build', 'children widget', 'buildcontext context']
c++ : ['cout', 'cpp', 'std'] ['include iostream', 'std cout', 'std string']
javascript : ['console', 'var', 'javascript'] ['script src', 'document getelementbyid', 'console log']
ruby-on-rails : ['activerecord', 'ruby', 'rail'] ['end end', 'end def', 'ruby rail']
css : ['flex', 'menu', 'css'] ['max width', 'css file', 'text decoration']
django : ['charfield', 'queryset', 'django'] ['charfield max', 'model charfield', 'rest framework']
c# : ['wpf', 'writeline', 'public'] ['

### Modèle bayésien naïf multinomial 

In [145]:
model = sk.naive_bayes.MultinomialNB()
model.fit(X_tr, y_tr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [149]:
def print_scores(model, X, y, y_pred):
    score = sk.metrics.recall_score(y, y_pred, average='micro')    
    print(f"Rappel = {score:0.2f}")
    score = sk.metrics.precision_score(y, y_pred, average='micro')
    print(f"Précision = {score:0.2f}")
    score = sk.metrics.f1_score(y, y_pred, average='micro')
    print(f"Justesse = {score:0.2f}")
    score = sk.metrics.accuracy_score(y, y_pred)    
    print(f"F1 = {score:0.2f}")
    print()

In [150]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(model, X_tr, y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(model, X_t, y_t, y_pred)

*** Scores sur données d'entraînement
Rappel = 0.47
Précision = 0.47
Justesse = 0.47
F1 = 0.47

*** Scores sur données de test :
Rappel = 0.43
Précision = 0.43
Justesse = 0.43
F1 = 0.43



In [27]:
print(sk.metrics.classification_report(y_t, y_pred, 
                                       target_names=df1['Tag'].cat.categories))

  'precision', 'predicted', average, warn_for)


                     precision    recall  f1-score   support

amazon-web-services       0.00      0.00      0.00        45
            android       0.68      0.48      0.56       292
            angular       0.33      0.01      0.02       124
       apache-spark       0.00      0.00      0.00        16
             arrays       0.00      0.00      0.00        15
            asp.net       0.00      0.00      0.00        20
              azure       0.00      0.00      0.00        38
               bash       0.00      0.00      0.00        31
                  c       0.00      0.00      0.00        67
                 c#       0.57      0.59      0.58       396
                c++       0.81      0.35      0.49       144
                css       0.00      0.00      0.00        55
             django       0.00      0.00      0.00        47
             docker       1.00      0.03      0.05        37
      elasticsearch       0.00      0.00      0.00        20
              excel    

In [28]:
row = 0
v = body_tfidf[row, :]
predicted = model.predict(v)[0]
expected = df1.iloc[row]['TagCode']
print(f"Labels prédit et attendu pour ligne {row} : {predicted, expected}")

Labels prédit et attendu pour ligne 0 : (38, 38)


### Régression logistique

In [79]:
model = sk.linear_model.LogisticRegression(multi_class='multinomial', 
                                           solver='sag')
model.fit(X_tr, y_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [80]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(model, X_tr, y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(model, X_t, y_t, y_pred)

*** Scores sur données d'entraînement
Rappel = 0.70
Précision = 0.70
F1 = 0.70

*** Scores sur données de test :
Rappel = 0.58
Précision = 0.58
F1 = 0.58



In [31]:
print(sk.metrics.classification_report(y_t, y_pred, 
                                       target_names=df1['Tag'].cat.categories))

                     precision    recall  f1-score   support

amazon-web-services       0.66      0.47      0.55        45
            android       0.68      0.66      0.67       292
            angular       0.70      0.44      0.54       124
       apache-spark       0.67      0.12      0.21        16
             arrays       0.00      0.00      0.00        15
            asp.net       0.00      0.00      0.00        20
              azure       0.59      0.26      0.36        38
               bash       1.00      0.10      0.18        31
                  c       0.83      0.36      0.50        67
                 c#       0.53      0.70      0.60       396
                c++       0.84      0.72      0.78       144
                css       0.33      0.05      0.09        55
             django       0.38      0.13      0.19        47
             docker       0.78      0.49      0.60        37
      elasticsearch       0.71      0.25      0.37        20
              excel    

### SVM linéaire

#### Optimisation des hyperparamètres

In [87]:
base = sk.svm.LinearSVC(dual=False, random_state=SEED)
grid = {'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
#        'max_iter': [1000, 2000, 5000]
       }
model = sk.model_selection.GridSearchCV(base,
                                        param_grid=grid,
                                        cv=5,
                                        n_jobs=-1)
model.fit(X_tr, y_tr)
model.best_params_

{'C': 1, 'penalty': 'l1'}

#### Entraînement du modèle

In [81]:
model = sk.svm.LinearSVC(C=1.0, penalty='l1', max_iter=1000, dual=False)
model.fit(X_tr, y_tr)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
          verbose=0)

In [83]:
print("*** Scores sur données d'entraînement")
y_pred = model.predict(X_tr)
print_scores(model, X_tr, y_tr, y_pred)
print("*** Scores sur données de test :")
y_pred = model.predict(X_t)
print_scores(model, X_t, y_t, y_pred)

*** Scores sur données d'entraînement
Rappel = 0.84
Précision = 0.84
F1 = 0.84

*** Scores sur données de test :
Rappel = 0.62
Précision = 0.62
F1 = 0.62



## Analyse non supervisée : LDA

In [32]:
cv = CountVectorizer()
body_words = cv.fit_transform(df['Text'])
print(f"Taille de la matrice de vocabulaire : {body_words.shape}")

Taille de la matrice de vocabulaire : (24442, 127979)


Sans radicalisation : (24442, 361549)

In [33]:
# TODO : optimisation des hyperparamètres

In [34]:
lda = sk.decomposition.LatentDirichletAllocation(n_components=50, 
                                                 learning_method='online',
                                                 random_state=SEED)
lda.fit(body_words)
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=50, n_jobs=None,
                          perp_tol=0.1, random_state=1911,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [35]:
n_keywords = 15
words = cv.get_feature_names()
for i, topic in enumerate(lda.components_):
    keywords = [words[i] for i in topic.argsort()[:-n_keywords-1:-1]]
    print(i, " ".join(keywords))
# afficher des examples

0 command bin pip tex ubuntu keycloak exit buildevents buildexceptionreporter wheel sessionid systemd sdist error pickerview
1 value column option select character change number options list iterator name sudo integer array replace
2 twitter mutate chr patient lag enc risk zoo elementname simpledateformat lastrow xmlelement men dompdf dbl
3 file line python package lib users site module local program usr error txt year library
4 location param queue parameter lead blog area dashboard mod graphql course clock apollo htaccess paypal
5 logger companyid alloc tier assistant createddate lcd fixtures outfile sendmail itemscontrol binder postalcode responsedata viewstate
6 err req catalina mongoose gems mainwindow qtwidgets qtcore passport gem linestyle addwidget bokeh pyqt sftp
7 three transform game material scene texture unity assets vscode genre mesh coupon budget gameobject tinymce
8 grid box dart idx postgres shop customers transparent colour jupyter notebook buf viewmodel tkinter qty
9

## Analyse par plongement lexical

In [36]:
model = spacy.load("en_core_web_md")

In [37]:
doc = model("This is some text that I am processing with Spacy")
len(doc[3].vector)
type(doc), type(doc[3]), type(doc[3].vector)

(spacy.tokens.doc.Doc, spacy.tokens.token.Token, numpy.ndarray)