# **Topic Modeling avec BERTopic**

Auteurs : Tom LABIAUSSE - Pierre Ollivier - Amine CHERIF HAOUAT - Cyrine NABI

# 1. Imports & Connexion GG Drive


*   À chaque utilisation du notebook, il faut installer la librairie BERTopic avec la commande : *!pip install BERTopic*.
*   Pour ce faire, lancez une fois la cellule ci-dessous. Puis redémarrez le *runtime* comme demandé et relancez la cellule.

In [None]:
!pip install BERTopic

In [None]:
!pip install scikit-learn==1.0.1 # Version compatible avec BERTopic

In [None]:
from bertopic import BERTopic

In [None]:
# Modules classiques
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import cnames as dico_colors
COLOR_NAMES = list(dico_colors.keys())
import pickle
import random
import xlwt

# Modules pour NLP
from sentence_transformers import SentenceTransformer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
# from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
import scipy.cluster.hierarchy as hcluster
import umap
import hdbscan

# Depuis sklearn
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Connexion à un Google Drive
from google.colab import drive
drive.mount('/content/gdrive')
path_to_GGDrive = "/content/gdrive/MyDrive/"
print("> Connexion à Google Drive OK")

# 2. Chargement des données

Définition de la fonction *get_dataframe* permettant de charger une base de données Excel dans une dataframe Python.

In [None]:
def get_dataframe(file_name, column_text, column_title="Titre", lower=True, to_spaces=None, min_size=20, max_size=5000, verbose=True):
    """Load an Excel corpus into a dataframe and apply basic pre-processing.

    The columns 'EAN', ``column_title`` and ``column_text`` are kept and renamed
    to 'EAN', 'titre' and 'texte'. Rows with a missing value in any of them are
    dropped, titles and texts are cleaned, and a 'taille_texte' column holding
    the length of each cleaned text is added. Rows whose text is shorter than
    ``min_size`` or longer than ``max_size`` are then removed.

    Args:
        file_name: Path of the Excel file, handed to ``pd.read_excel``.
        column_text: Name of the input column holding the texts.
        column_title: Name of the input column holding the titles.
        lower: If True, titles and texts are lower-cased.
        to_spaces: Sub-strings (e.g. "\\n" or "\\t") replaced by a single space
            in titles and texts; surrounding spaces are stripped after each
            replacement. ``None`` (default) or an empty list disables this step.
        min_size: Minimum text length (in characters) kept.
        max_size: Maximum text length (in characters) kept.
        verbose: If True, print the final shape and the number of removed texts.

    Returns:
        The cleaned dataframe with columns 'EAN', 'titre', 'texte' and
        'taille_texte'. The index is not reset after the size filter.
    """
    # None instead of a mutable [] default; an absent list means "replace nothing"
    to_spaces = () if to_spaces is None else to_spaces

    input_df = pd.read_excel(file_name)
    # Explicit copy so that renaming / adding columns never touches input_df
    df = input_df[['EAN', column_title, column_text]].copy()

    # Column renaming (column_title -> titre ; column_text -> texte)
    df.columns = ['EAN', 'titre', 'texte']

    # Drop rows with missing values
    df = df.dropna().reset_index(drop=True)

    # Clean titles and texts column-wise instead of cell by cell
    for col in ('titre', 'texte'):
        # Cast to str so that e.g. a purely numeric title does not break the string methods
        cleaned = df[col].astype(str)
        for elt in to_spaces:
            cleaned = cleaned.str.replace(elt, " ", regex=False).str.strip(" ")
        if lower:
            cleaned = cleaned.str.lower()
        df[col] = cleaned

    # Length of each text, measured after cleaning
    df['taille_texte'] = df['texte'].str.len()

    # Remove texts whose size is out of the accepted range
    out_of_range = (df['taille_texte'] < min_size) | (df['taille_texte'] > max_size)
    n_removed = int(out_of_range.sum())
    df = df.drop(index=df.index[out_of_range.to_numpy()])

    if verbose:
        print('# Dataframe "{0}" de taille {1} chargee.'.format(file_name,df.shape))
        print("> {0} textes de taille <{1} ou >{2} supprimes\n".format(n_removed,min_size,max_size))

    return df

Définition de la fonction *merge_dataframe* permettant de concatener différentes dataframes Python.

In [None]:
def merge_dataframe(df_list):
    """Concatenate the dataframes of ``df_list`` into a single dataframe.

    Args:
        df_list: List of dataframes sharing the same columns.

    Returns:
        A new dataframe holding the rows of every input, in order, with a fresh
        0..n-1 index; ``None`` (after printing a message) when the list is empty.
    """
    if not df_list:
        print("Aucune dataframe.")
        return None
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows at every step.
    merged_df = pd.concat(df_list, ignore_index=True)
    print("# Fusion des corpus dans une dataframe de taille {0}".format(merged_df.shape))
    return merged_df

Chargement des dataframes et construction du dataset fusionné.

In [None]:
# Numbers of the corpora to load (1, 2 and/or 3); each selected corpus is read
# with get_dataframe and appended to corpus_list.
corpus_num = [1,2]
# Sub-strings replaced by a space in titles and texts
to_spaces = ["\n","\t","&nbsp;"]
corpus_list = []

if 1 in corpus_num: # CORPUS 1 (texts in column "Description sans html")
    corpus1 = get_dataframe(path_to_GGDrive + 'corpus 1.xlsx', column_text="Description sans html", to_spaces=to_spaces)
    corpus_list.append(corpus1)

if 2 in corpus_num: # CORPUS 2 (texts in column "Description")
    corpus2 = get_dataframe(path_to_GGDrive + 'corpus 2.xlsx', column_text="Description", to_spaces=to_spaces)
    corpus_list.append(corpus2)

if 3 in corpus_num: # CORPUS 3 (texts in column "resume")
    corpus3 = get_dataframe(path_to_GGDrive + 'corpus 3.xlsx', column_text="resume", to_spaces=to_spaces)
    corpus_list.append(corpus3)

# Concatenate the selected corpora into the training dataframe
data = merge_dataframe(corpus_list)

Définition de la fonction *get_docs* permettant de récupérer les données pour l'entraînement (concaténation éventuelle des titres aux textes)

In [None]:
def get_docs(data, add_titles=False):
    """Return the list of training documents taken from ``data``.

    Args:
        data: Dataframe with a 'texte' column (and a 'titre' column).
        add_titles: If True, each document becomes "<texte>. <titre>".

    Returns:
        A new list with one string per row of ``data``; ``data`` is not modified.
    """
    docs = data["texte"].to_list()
    if add_titles:
        # The original loop iterated over an undefined name and raised NameError
        docs = [text + ". " + title for text, title in zip(docs, data["titre"].to_list())]
    print("> Nombre de textes : ",len(docs))
    return docs

In [None]:
# DONNEES D'ENTRAINEMENT
docs = get_docs(data, add_titles=False)

# 3. Algorithme BERTopic

## 3.1 Entraîner un modèle BERTopic sur *docs*

### 3.1.1 Transformer CamemBERT

In [None]:
# Installation de la librairie flair
!pip install flair

In [None]:
# Import d'un document-Transformer
from flair.embeddings import TransformerDocumentEmbeddings

Initialisation d'un modèle BERTopic avec CamemBERT




In [None]:
camembert = TransformerDocumentEmbeddings('camembert-base')
BERT_model = BERTopic(embedding_model=camembert)

### 3.1.2 Transformer BERT

Initialisation d'un modèle BERTopic avec BERT (de Google)

*   language = *french* ou *multilingual*
*   verbose = True : affiche les details de l'execution
*   nr_topics = *None*, *'auto'* ou un nombre de topics fixé

In [None]:
BERT_model = BERTopic(language="multilingual", verbose=True, nr_topics=None)

### 3.1.3 Application de BERTopic au corpus *docs* (2/3 min)

In [None]:
doc2topics, probabilities = BERT_model.fit_transform(docs)
doc2topics = np.array(doc2topics)

## 3.2 Charger/Enregistrer un modèle BERTopic

Nom du fichier d'enregistrement/sauvegarde du modèle

In [None]:
model_file_name = "BERTopic_#1+2_t30_04-01-22"

Définition des fonctions de sauvegarde/chargement des fichiers de correspondances :

In [None]:
# DOC -> TOPIC

def save_doc2topic(model_file_name,np_topics):
    """Pickle the document-to-topic assignments of a model.

    The data is written to
    ``<path_to_GGDrive>BERT_models/<model_file_name>_Doc2Topic.txt`` (a binary
    pickle despite the ``.txt`` suffix) and a confirmation line is printed.
    """
    target_dir = path_to_GGDrive + "BERT_models/"
    target_name = model_file_name + "_Doc2Topic.txt"
    with open(target_dir + target_name, 'wb') as handle:
        pickle.dump(np_topics, handle)
    print("> Doc2Topic file '{0}' saved in '{1}'".format(target_name, target_dir))

def load_doc2topic(txt_file):
    """Load the document-to-topic assignments pickled for a model.

    Args:
        txt_file: Name of the model (without suffix); the file read is
            ``<path_to_GGDrive>BERT_models/<txt_file>_Doc2Topic.txt``.

    Returns:
        The unpickled document-to-topic assignments.
    """
    # Use the argument: the previous version ignored it and silently read the
    # file of the global 'model_file_name' instead.
    source_path = path_to_GGDrive + "BERT_models/" + txt_file + "_Doc2Topic.txt"
    # NOTE: unpickling can execute arbitrary code; only load files written by this notebook
    with open(source_path, 'rb') as fichier:
        return pickle.load(fichier)

In [None]:
# TOPIC -> BIG TOPIC

def save_topic2bigtopic(model_file_name,topic_clustering):
    """Pickle the topic-to-big-topic assignments of a model.

    The data is written to
    ``<path_to_GGDrive>BERT_models/<model_file_name>_Topic2Bigtopic.txt`` (a
    binary pickle despite the ``.txt`` suffix) and a confirmation line is printed.
    """
    target_dir = path_to_GGDrive + "BERT_models/"
    target_name = model_file_name + "_Topic2Bigtopic.txt"
    with open(target_dir + target_name, 'wb') as handle:
        pickle.dump(topic_clustering, handle)
    print("> Topic2Bigtopic file '{0}' saved in '{1}'".format(target_name, target_dir))

def load_topic2bigtopic(model_file_name):
    """Load the topic-to-big-topic assignments pickled for ``model_file_name``.

    Reads ``<path_to_GGDrive>BERT_models/<model_file_name>_Topic2Bigtopic.txt``
    and returns the unpickled object.
    """
    source_path = path_to_GGDrive + "BERT_models/" + model_file_name + "_Topic2Bigtopic.txt"
    # NOTE: unpickling can execute arbitrary code; only load files written by this notebook
    with open(source_path, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# DOC -> BIG TOPIC

def save_doc2bigtopic(model_file_name,doc2bigtopics):
    """Pickle the document-to-big-topic assignments of a model.

    The data is written to
    ``<path_to_GGDrive>BERT_models/<model_file_name>_Doc2Bigtopic.txt`` (a binary
    pickle despite the ``.txt`` suffix) and a confirmation line is printed.

    Args:
        model_file_name: Name of the model the assignments belong to.
        doc2bigtopics: Assignments to store (one big-topic id per document).
    """
    target_dir = path_to_GGDrive + "BERT_models/"
    target_name = model_file_name + "_Doc2Bigtopic.txt"
    with open(target_dir + target_name, 'wb') as fichier:
        pickle.dump(doc2bigtopics, fichier)
    # The message used to announce a "Topic2Bigtopic" file although a Doc2Bigtopic file is written
    print("> Doc2Bigtopic file '{0}' saved in '{1}'".format(target_name, target_dir))

def load_doc2bigtopic(model_file_name):
    """Load the document-to-big-topic assignments pickled for ``model_file_name``.

    Reads ``<path_to_GGDrive>BERT_models/<model_file_name>_Doc2Bigtopic.txt``
    and returns the unpickled object.
    """
    source_path = path_to_GGDrive + "BERT_models/" + model_file_name + "_Doc2Bigtopic.txt"
    # NOTE: unpickling can execute arbitrary code; only load files written by this notebook
    with open(source_path, 'rb') as handle:
        return pickle.load(handle)

Enregistrement dans *BERT_model*

In [None]:
# Sauvegarde du modèle BERTopic contenu dans model_file_name
BERT_model.save(path_to_GGDrive + "BERT_models/" + model_file_name)
print("> Model '{0}' saved in '{1}'".format(model_file_name,path_to_GGDrive + "BERT_models/"))

In [None]:
# Sauvegarde des correspondances document-topic
save_doc2topic(model_file_name,doc2topics)

Chargement dans *BERT_model*

In [None]:
# Chargement du modèle BERTopic contenu dans model_file_name
BERT_model = BERTopic.load(path_to_GGDrive + "BERT_models/" + model_file_name)

In [None]:
# Chargement des correspondances document-topic
doc2topics = load_doc2topic(model_file_name)

In [None]:
# Chargement d'éventuelles correspondances topic-bigtopic
topic_clustering = load_topic2bigtopic(model_file_name)

In [None]:
# Chargement d'éventuelles correspondances doc-bigtopic
doc2bigtopics = load_doc2bigtopic(model_file_name)

# 4. Visualisation des résultats

Aperçu général des topics

In [None]:
BERT_model.get_topic_info()

Aperçu des résultats

In [None]:
# Print the first nb_words key words of every topic of the model
nb_words = 10
print("#=================================================== TOPICS ===================================================#")
# Iterate over the topic ids the model actually holds instead of assuming they
# are exactly -1..n-2 (which breaks as soon as there is no outlier topic -1).
for topic_id in sorted(BERT_model.get_topics().keys()):
    print( "TOPIC {0} -> ".format(topic_id) + " ".join([word for word, _ in BERT_model.get_topic(topic_id)][:nb_words]) )
print("")

Répartition générale des documents dans les topics

In [None]:
def show_sizes(n_clusters, doc2clusters, show_outliers=False):
    """Plot the number of texts per cluster, from the largest to the smallest.

    Args:
        n_clusters: Number of clusters, outliers excluded; cluster ids are
            expected to be 0..n_clusters-1, with -1 for the outliers.
        doc2clusters: Cluster id of each text (doc2topics or doc2bigtopics).
        show_outliers: If True, the outlier cluster (-1) is included in the
            plot and in the number of texts reported in the title.
    """
    labels = np.asarray(doc2clusters)
    sizes = [int(np.count_nonzero(labels == c)) for c in range(n_clusters)]
    first_rank = 0
    if show_outliers:
        sizes.append(int(np.count_nonzero(labels == -1)))
        first_rank = -1
    # Outliers are handled explicitly: the previous version assumed that the
    # largest cluster was always the outlier cluster, and dropped/miscounted a
    # real cluster whenever that was not the case.
    sizes.sort(reverse=True)
    plt.plot(range(first_rank, n_clusters), sizes)
    plt.title('Répartition de {0} textes dans les {1} clusters'.format(sum(sizes),n_clusters))
    plt.xlabel("Clusters") ; plt.ylabel("Nombre de textes")
    plt.show()

In [None]:
show_sizes(BERT_model.get_topic_info().shape[0]-1, doc2topics)

Affichage des documents appartenant à un topic

In [None]:
def print_docs_in_topic(t):
    """Print the key words of topic ``t`` followed by every document assigned to it.

    Relies on the notebook globals ``BERT_model``, ``doc2topics`` and ``docs``.
    """
    print( "# TOPIC {0} : ".format(t) +  " ".join([elt[0] for elt in BERT_model.get_topic(t)]) ) ; print("")
    # Select the documents of topic t directly rather than building the index
    # lists of every topic (one full scan of doc2topics per topic) to use one.
    for doc_id in np.flatnonzero(np.asarray(doc2topics) == t):
        print("[doc ID : {0}] - {1}".format(doc_id,docs[doc_id]))

In [None]:
print_docs_in_topic(96)

Intertopic Distance Map

In [None]:
BERT_model.visualize_topics()

In [None]:
# Sauvegarder l'Intertopic Distance Map interactive
BERT_model.visualize_topics().write_html(path_to_GGDrive + "BERT_models/" + model_file_name + "_IDM.html")

Identification de topic par mot-clef

In [None]:
word = "crayon"

close_topics, similarity = BERT_model.find_topics(word, top_n=5)
for t,s in zip(close_topics,similarity):
    print("- {0}% = TOPIC {1} : ".format(int(100*s),t) +  " ".join([elt[0] for elt in BERT_model.get_topic(t)]) )

Classification d'un nouveau texte

In [None]:
txt_test = """
L'ère de l'intelligence artificielle est déjà là ! Les robots vont tous nous remplacer parce que les ordinateurs sont plus rapides que les humains.
"""

# Predict the topic of a single new text
[topic_test],proba_test = BERT_model.transform(txt_test)
# Report the confidence as an integer percentage, like the key-word search cell
# does, instead of printing the raw probability array followed by "%".
# NOTE(review): takes the largest returned probability — confirm this is the
# intended value when the model returns one probability per topic.
confidence_test = "?" if proba_test is None else int(100 * float(np.max(proba_test)))
print("TEXTE : " + txt_test)
print( "PREDICTION : {0}% - TOPIC {1} : ".format(confidence_test,topic_test) +  " ".join([elt[0] for elt in BERT_model.get_topic(topic_test)]) )

# 5. Evaluation des résultats

Le code suivant, écrit par Maarten Grootendorst, est disponible à l'adresse : https://github.com/MaartenGr/BERTopic/issues/90

In [None]:
def compute_coherence(docs,topics):
    """Compute the C_v coherence of the topics of the global ``BERT_model``.

    Args:
        docs: The documents the model was trained on.
        topics: Topic id of each document (same length as ``docs``).

    Returns:
        The value returned by gensim's ``CoherenceModel.get_coherence()`` for
        ``coherence='c_v'``.
    """
    # Build one "super document" per topic
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = BERT_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenise with the analyzer of the model's own vectorizer.
    # (The former unused call to vectorizer.get_feature_names() was dropped:
    # that method no longer exists in recent scikit-learn releases.)
    analyzer = BERT_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Key words of every real topic; the outlier topic -1 is skipped explicitly
    # rather than assuming that it is always present among 'topics'.
    topic_ids = [int(topic) for topic in documents_per_topic.Topic.values if topic != -1]
    topic_words = [[word for word, _ in BERT_model.get_topic(topic)]
                   for topic in topic_ids]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')

    return coherence_model.get_coherence()

In [None]:
coherence = compute_coherence(docs,doc2topics)
print("Coherence C_v : ",coherence)

# 6. Réduction du nombre de topics (avec les *topic embeddings*)

## 6.1 Réalisation de la réduction sur un modèle BERTopic

*   La méthode suivante s'inspire du code permettant de construire l'intertopic-map disponible à l'adresse : https://github.com/MaartenGr/BERTopic/blob/master/bertopic/plotting/_topics.py

6.1.1 - Projection des topics en 2D avec UMAP

In [None]:
def get_topic_embeddings(bert_model,n_umap=2):
    """Return the topic embeddings of ``bert_model`` reduced to ``n_umap`` dimensions.

    The rows of the model's c-TF-IDF matrix (one per topic) are min-max scaled
    and projected with UMAP (``n_neighbors=2``, Hellinger metric).

    Args:
        bert_model: A fitted BERTopic model.
        n_umap: Number of output dimensions.

    Returns:
        Array with one row per topic and ``n_umap`` columns.
    """
    # Use the model passed as argument (the global BERT_model was read by mistake)
    n_topics = len(bert_model.get_topics())
    topic_emb = bert_model.c_tf_idf.toarray()[:n_topics]
    topic_emb = MinMaxScaler().fit_transform(topic_emb)
    topic_emb = umap.UMAP(n_neighbors=2, n_components=n_umap, metric='hellinger').fit_transform(topic_emb)
    return topic_emb

In [None]:
topic_emb = get_topic_embeddings(BERT_model)[1:] # On ne s'interesse pas au topic -1

6.1.2 Réalisation d'un clustering hiérarchique sur les topics (choix du *threshold* par optimisation du score de silhouette)

In [None]:
# Silhouette score of the hierarchical clustering for a range of distance thresholds
many_thld = np.arange(0.2,4.0,step=0.1).round(2)
s_scores =[]
for thld in many_thld:
    t_clustering = hcluster.fclusterdata(topic_emb, thld, criterion="distance")-1
    n_found = len(np.unique(t_clustering))
    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1 and
    # raises otherwise (e.g. a small threshold leaving every topic alone, or a
    # large one merging everything); such thresholds are scored NaN.
    if 2 <= n_found <= topic_emb.shape[0] - 1:
        s_scores.append(silhouette_score(topic_emb, t_clustering))
    else:
        s_scores.append(np.nan)

# Plot the silhouette score against the threshold
s_scores = np.array(s_scores)
plt.plot(many_thld,s_scores)
plt.title("Evolution du score de silhouette du clustering de topcis")
plt.show()

# Best threshold(s)
if np.all(np.isnan(s_scores)):
    raise ValueError("No threshold in 'many_thld' yields a clustering with a defined silhouette score")
best_score = np.nanmax(s_scores)
best_ind = list(np.where(s_scores==best_score)[0])
best_thlds = [many_thld[i] for i in best_ind]
print("Best silhouette_score = {0} with threshold in {1}".format(best_score,best_thlds))

# Pick the middle one of the best thresholds and run the final clustering
best_thld = best_thlds[len(best_thlds)//2]
print("> Choosen threshold : {0}".format(best_thld))
topic_clustering = hcluster.fclusterdata(topic_emb, best_thld, criterion="distance")-1

6.1.3 Affichage du clustering sur la projection des topics en 2D

In [None]:
def plot_topics(top_emb, clustering=None, dcolor="blue"):
    """Scatter-plot topics projected in 2D, optionally coloured by cluster.

    Args:
        top_emb: Array with one row per topic; columns 0 and 1 are plotted.
        clustering: Cluster label of each topic. ``None`` or an empty sequence
            (default) draws every topic with the colour ``dcolor``.
        dcolor: Colour used when no clustering is given.
    """
    n_topics = top_emb.shape[0]
    # Plot the embeddings received as argument (the global 'topic_emb' was read by mistake)
    x_top_emb = top_emb[:,0].reshape(-1)
    y_top_emb = top_emb[:,1].reshape(-1)
    if clustering is None or len(clustering) == 0: # No clustering
        top_colors = [dcolor] * n_topics
        title = "Topic embeddings en 2D"
    else:
        # Map each label to its rank among the distinct labels so that labels
        # need not be exactly 0..n_clusters-1 to index the colour list.
        cluster_ids, cluster_pos = np.unique(clustering, return_inverse=True)
        n_clusters = len(cluster_ids)
        colors = random.sample(COLOR_NAMES,n_clusters)
        top_colors = [colors[pos] for pos in np.ravel(cluster_pos)]
        title = "Topic embeddings en 2D [#{0}>>#{1}]".format(n_topics,n_clusters)
    plt.scatter(x_top_emb, y_top_emb, c=top_colors, marker="^")
    plt.title(title) ; plt.show()

In [None]:
plot_topics(topic_emb,topic_clustering)

6.1.4 - Association de chaque texte à un big topic. On définit le big topic -1 comme étant exactement le topic -1 (big topic des outliers).

In [None]:
# Every document inherits the big topic of its topic; outliers (topic -1) keep
# the value -1 they already have in the copy.
doc2bigtopics = np.copy(doc2topics)
in_a_topic = doc2topics != -1
doc2bigtopics[in_a_topic] = topic_clustering[doc2topics[in_a_topic]]
print(doc2bigtopics)

6.1.5 - Sauvegarde des correspondances [textes - big topics] et [topic - big topic]

In [None]:
save_topic2bigtopic(model_file_name,topic_clustering)

In [None]:
save_doc2bigtopic(model_file_name,doc2bigtopics)

## 6.2 - Visualisation des big topics

In [None]:
# Topics triés par BigTopic
big_topics = [list(np.where(topic_clustering==k)[0]) for k in range(0,np.max(topic_clustering)+1)]

6.2.1 Visualisation des textes par big topic

In [None]:
show_sizes(len(big_topics),doc2bigtopics) # Overall distribution of the texts per big topic ('show_sizes' is defined in section 4 of the notebook)

In [None]:
bt_num = 16
nb_txt = 200

print("> BIG TOPIC {0}".format(bt_num))
# Show at most nb_txt documents of the big topic, each truncated to 150 characters
k = 0 ; i = 0
for i, k in enumerate(np.flatnonzero(doc2bigtopics == bt_num)[:nb_txt], start=1):
    print("#{0} : {1}".format(i,docs[k][:150]))

6.2.2 - Visualisation des topics composant les big topics

In [None]:
# For every big topic, list its topics with their first nb_words key words
nb_words = 10
print("#=================================================== TOPICS of BIG TOPICS ===================================================#\n")
for bt, btopic in enumerate(big_topics):
    print("# BIG TOPIC {0} = {1}".format(bt,btopic))
    for t in btopic:
        key_words = [word for word, _ in BERT_model.get_topic(t)]
        print( "TOPIC {0} -> ".format(t) + " ".join(key_words[:nb_words]) )
    print("")

6.2.3 - Homogénéisation des big topics (par matrice TF ou TF-iDF)

In [None]:
# Indices of the texts belonging to each big topic
docs_per_bigtopic = [ list(np.where(doc2bigtopics==k)[0]) for k in range(0,np.max(doc2bigtopics)+1)]

# One "super-text" per big topic, made of all the texts it contains.
# dtype=object keeps plain Python strings: a default (fixed-width unicode)
# array would pad every super-text to the length of the longest one.
txt_of_bigtopic = np.array([ " ".join( [docs[doc_ind] for doc_ind in docs_per_bigtopic[bt_ind]] ) for bt_ind in range(0,len(big_topics)) ], dtype=object)

In [None]:
# Load a list of French stop words (one per line).
# 'with' closes the file even if reading fails; blank lines are skipped so
# that the empty string does not end up in the set.
# NOTE(review): assumes the file is UTF-8 encoded — confirm
with open(path_to_GGDrive + 'stop_words_french.txt', encoding="utf-8") as f:
    fr_stopwords = {line.strip() for line in f if line.strip()}
print("NB french stopwords :",len(fr_stopwords))

# Méthode de classement des mots TF (Term Frequency)
def c_TF(documents,my_stop_words):
    """Compute the term-frequency (TF) matrix of ``documents``.

    Args:
        documents: One string per cluster.
        my_stop_words: Words ignored by the vectorizer (any collection of
            strings, ``'english'`` or ``None``).

    Returns:
        ``(count_matrix, count)`` where ``count_matrix`` has shape
        (num_words, num_clusters), each column being the word counts of a
        cluster divided by its total number of counted words, and ``count`` is
        the fitted ``CountVectorizer``.
    """
    # CountVectorizer documents 'stop_words' as a list; recent scikit-learn
    # releases reject other collections such as a set.
    if my_stop_words is not None and not isinstance(my_stop_words, (str, list)):
        my_stop_words = list(my_stop_words)
    count = CountVectorizer(stop_words=my_stop_words).fit(documents)
    t = count.transform(documents).toarray() # num_clusters x num_words
    w = t.sum(axis=1) # num_clusters
    count_matrix = np.divide(t.T, w) # num_words x num_clusters
    return count_matrix, count

# Méthode de classement des mots TF_iDF (Term Frequency - inverse Document Frequency)
def c_TF_IDF(documents,my_stop_words):
    """Compute a TF-IDF-like matrix of ``documents``.

    For each word and cluster the value is ``tf * idf`` with ``tf`` the count of
    the word in the cluster divided by the cluster's total count, and
    ``idf = log(len(documents) / total count of the word over all clusters)``.

    Args:
        documents: One string per cluster.
        my_stop_words: Words ignored by the vectorizer (any collection of
            strings, ``'english'`` or ``None``).

    Returns:
        ``(count_matrix, count)`` where ``count_matrix`` has shape
        (num_words, num_clusters) and ``count`` is the fitted ``CountVectorizer``.
    """
    # CountVectorizer documents 'stop_words' as a list; recent scikit-learn
    # releases reject other collections such as a set.
    if my_stop_words is not None and not isinstance(my_stop_words, (str, list)):
        my_stop_words = list(my_stop_words)
    count = CountVectorizer(stop_words=my_stop_words).fit(documents)
    t = count.transform(documents).toarray() # num_clusters x num_words
    w = t.sum(axis=1) # num_clusters
    tf = np.divide(t.T, w) # num_words x num_clusters
    D = len(documents) # scalar: number of clusters
    sum_t = t.sum(axis=0) # num_words
    idf = np.log(np.divide(D, sum_t)).reshape(-1, 1) # num_words x 1
    count_matrix = np.multiply(tf, idf) # num_words x num_clusters
    return count_matrix, count

def extract_top_n_words_per_topic(tf_idf, count, n_labels, n=10, inf_size=3):
    """Return the most representative words of each of the ``n_labels`` texts.

    For every label the ``n`` best-scored words are selected, then the words
    shorter than ``inf_size`` characters are discarded, so a label may end up
    with fewer than ``n`` words.

    Args:
        tf_idf: Score matrix of shape (num_words, num_labels).
        count: Fitted vectorizer providing the vocabulary.
        n_labels: Number of labels; they are numbered 0..n_labels-1.
        n: Number of best-scored words selected before the length filter.
        inf_size: Minimum length (in characters) of the words kept.

    Returns:
        Dict mapping each label to a list of ``(word, score)`` pairs sorted by
        decreasing score.

    NOTE(review): a later cell of the notebook defines another function with
    this name and a different signature; running it replaces this one.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    get_words = getattr(count, "get_feature_names_out", None) or count.get_feature_names
    words = get_words()
    scores = tf_idf.T
    indices = scores.argsort()[:, -n:]
    top_n_words = {}
    for i, label in enumerate(np.arange(0,n_labels)):
        ranked = [(words[j], scores[i][j]) for j in indices[i]][::-1]
        top_n_words[label] = [pair for pair in ranked if len(pair[0]) >= inf_size]
    return top_n_words

In [None]:
count_matrix, count = c_TF(txt_of_bigtopic, my_stop_words=fr_stopwords)
top_n_words = extract_top_n_words_per_topic(count_matrix, count, n_labels=len(big_topics), n=20)

In [None]:
nb_words = 10
print("#=================================================== WORDS of BIG TOPICS ===================================================#")
for bt in range(0,len(big_topics)):
    btopic = big_topics[bt]
    # print("# BIG TOPIC {0} = {1}".format(bt,btopic))
    print( " BIG TOPIC {0} -> ".format(bt) + " ".join([word for [word,_] in top_n_words[bt]][:min(nb_words,len(top_n_words[bt]))]) )

# 7. Sauvegarde du Topic Modeling (EXCEL)

Nécessite au minimum les deux éléments suivants :
*   Dataframe des données à l'origine de l'entraînement du modèle : *data*
*   Liste de correspondance docs-topics (*doc2topics*) ou docs-bigtopics (*doc2bigtopics*)



In [None]:
# Données d'entraînement du modèle
data

In [None]:
# Mots descriptifs de chaque topic
nb_words = 10
topic2words = [" ".join([word for [word,_] in BERT_model.get_topic(t)][:nb_words]) for t in range(0,max(doc2topics)+1)]

In [None]:
# Mots descriptifs de chaque big topic (necessite de realiser l'homogeneisation des big topics en amont -> section 6.2.3 du notebook)
nb_words = 10
bigtopic2words = [" ".join([word for [word,_] in top_n_words[bt]][:min(nb_words,len(top_n_words[bt]))]) for bt in range(0,len(top_n_words))]

Définition de la fonction *save_docs_topcis* permettant de sauvegarder dans un fichier .xls les correspondances docs-topics ou docs-bigtopics

In [None]:
def save_docs_topcis(topic_type, docs2numtopics, top2words):
    """Export the topic modelling to ``<model_file_name>_DATA_TOPICS.xls``.

    Sheet 'topic_modeling' holds one row per row of the global ``data``
    (EAN, title, text, topic number); sheet 'topic_visualisation' holds one row
    per entry of ``top2words`` (topic number, description). Nothing is written,
    and an error message is printed, when ``topic_type`` is neither 'topic' nor
    'bigtopic'.
    """
    if topic_type not in ["topic","bigtopic"]:
        print("ERREUR > topic_type doit être 'topic' ou 'bigtopic'")
        return

    # Excel workbook
    workbook = xlwt.Workbook()

    # Sheet 1: documents and their topic
    doc_sheet = workbook.add_sheet('topic_modeling')
    for col, header in enumerate(['EAN', 'TITRE', 'TEXTE', 'TOPIC']):
        doc_sheet.write(0, col, header)
    for row in range(data.shape[0]):
        doc_sheet.write(row + 1, 0, int(data.iloc[row, 0])) # EAN
        doc_sheet.write(row + 1, 1, data.iloc[row, 1]) # title
        doc_sheet.write(row + 1, 2, data.iloc[row, 2]) # text
        doc_sheet.write(row + 1, 3, int(docs2numtopics[row])) # topic

    # Sheet 2: description of each topic
    topic_sheet = workbook.add_sheet('topic_visualisation')
    for col, header in enumerate(['TOPIC', 'DESCRIPTION']):
        topic_sheet.write(0, col, header)
    for num in range(len(top2words)):
        topic_sheet.write(num + 1, 0, num) # topic
        topic_sheet.write(num + 1, 1, top2words[num]) # description

    workbook.save(path_to_GGDrive + "BERT_models/" + model_file_name + "_DATA_TOPICS.xls")

In [None]:
save_docs_topcis("bigtopic", doc2bigtopics, bigtopic2words)

# 8. Réduction du nombre de topics (avec la méthode BERTopic)

*   **ATTENTION !** : Réduire le nombre de topics modifie de manière irréversible le modèle entraîné. Il est impossible de revenir aux anciens topics une fois la réduction effectuée.



8.1 Réduction

In [None]:
# REDUCTION
new_doc2topic_, new_probabilities = BERT_model.reduce_topics(docs, doc2topics, nr_topics=24) # probabilities is optional
new_doc2topic = np.array(new_doc2topic_)
del(new_doc2topic_)

In [None]:
new_doc2topic, _ = BERT_model.reduce_topics(docs, doc2topics, nr_topics=30)

In [None]:
BERT_model.get_topic_info()

In [None]:
# Overview of the topics: first nb_words key words of every topic of the model
nb_words = 10
print("#=================================================== TOPICS ===================================================#")
# Iterate over the topic ids the model actually holds instead of assuming they
# are exactly -1..n-2 (which breaks as soon as there is no outlier topic -1).
for topic_id in sorted(BERT_model.get_topics().keys()):
    print( "TOPIC {0} -> ".format(topic_id) + " ".join([word for word, _ in BERT_model.get_topic(topic_id)][:nb_words]) )

In [None]:
# Intertopic distance map
BERT_model.visualize_topics()

8.2 Visualisation TF des mots par topics (nécessite les fonctions *c_TF* et *extract_top_n_words_per_topic* de 6.2)

In [None]:
new_doc2topic = doc2topics

In [None]:
# Indices of the texts belonging to each new topic
docs_per_newtopic = [ list(np.where(new_doc2topic==k)[0]) for k in range(0,np.max(new_doc2topic)+1)]

# One "super-text" per new topic, made of all the texts it contains.
# dtype=object keeps plain Python strings: a default (fixed-width unicode)
# array would pad every super-text to the length of the longest one.
txt_per_newtopic = np.array([ " ".join( [docs[doc_ind] for doc_ind in docs_per_newtopic[bt_ind]] ) for bt_ind in range(0,len(docs_per_newtopic)) ], dtype=object)

In [None]:
count_matrix, count = c_TF(txt_per_newtopic, my_stop_words=fr_stopwords)
top_n_words = extract_top_n_words_per_topic(count_matrix, count, n_labels=len(docs_per_newtopic), n=20)

In [None]:
nb_words = 10
print("#=================================================== NEW TOPICS ===================================================#")
for nt in range(0,len(docs_per_newtopic)):
    print( "NEW TOPIC {0}".format(nt) + " -> " + " ".join([word for [word,_] in top_n_words[nt]][:min(nb_words,len(top_n_words[nt]))]) )

8.3 Visualisation des documents par topic

In [None]:
# Aperçu des documents par topic
def print_new_docs_in_topic(t):
    """Print the key words of topic ``t`` followed by every document assigned to it.

    Relies on the notebook globals ``BERT_model``, ``new_doc2topic`` and ``docs``.
    """
    print( "# TOPIC {0} : ".format(t) +  " ".join([elt[0] for elt in BERT_model.get_topic(t)]) ) ; print("")
    # Select the documents of topic t directly rather than building the index
    # lists of every topic (one full scan of new_doc2topic per topic) to use one.
    for doc_id in np.flatnonzero(np.asarray(new_doc2topic) == t):
        print("[doc ID : {0}] - {1}".format(doc_id,docs[doc_id]))

In [None]:
print_new_docs_in_topic(17)

8.4 Calcul du score de cohérence

In [None]:
# Evaluation du topic modeling
coherence = compute_coherence(docs,new_doc2topic)
print("Coherence C_v : ",coherence)

8.5 Sauvegarde

In [None]:
model_file_name = "BERTopic_#3_t24_13-01-22"

In [None]:
BERT_model.save(path_to_GGDrive + "BERT_models/" + model_file_name)
print("> Model '{0}' saved in '{1}'".format(model_file_name,path_to_GGDrive + "BERT_models/"))

In [None]:
save_doc2topic(model_file_name,new_doc2topic)

# 9. Algorithme BERT + UMAP + HDBSCAN

## 9.1 Exécution des trois algorithmes successifs

In [None]:
# Creation des document-embeddings
my_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
embeddings = my_model.encode(docs)

In [None]:
# Reduction de la dimensions des projections des documents avec UMAP
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5,
                            min_dist=0.0,
                            metric='cosine').fit_transform(embeddings)

In [None]:
# Realisation du clustering sur les projections d'UMAP avec HDBSCAN
clustering = hdbscan.HDBSCAN(min_cluster_size=50, 
                          metric='euclidean', 
                          cluster_selection_method='eom'
                          ).fit(umap_embeddings)
print(f"Number of clusters = {np.unique(clustering.labels_).shape[0] - 1}")

## 9.2 Aperçu des topics

Taille des topics

In [None]:
# Correspondances document-topic
my_doc2topic = clustering.labels_

# Nombre de documents par topic
topic_size = np.bincount(my_doc2topic+1)
topic_ind = np.arange(-1,len(topic_size)-1)
pd.DataFrame(data = np.vstack((topic_ind,topic_size)).T, columns=['topic','size']).sort_values(by='size',ascending=False)

Mots représentatifs de chaque topic

In [None]:
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [None]:
docs_df = pd.DataFrame(docs, columns=['document'])
docs_df['topic'] = my_doc2topic
docs_per_topic = docs_df.groupby(['topic']).agg({'document': ' '.join})
docs_per_topic = docs_per_topic.reset_index()

In [None]:
def CTfIDF(documents):
    """Compute a TF-IDF-like matrix of ``documents`` using the ``fr_stop`` stop words.

    For each word and cluster the value is ``tf * idf`` with ``tf`` the count of
    the word in the cluster divided by the cluster's total count, and
    ``idf = log(len(documents) / total count of the word over all clusters)``.

    Args:
        documents: One string per cluster.

    Returns:
        ``(tf_idf, count)`` where ``tf_idf`` has shape (num_words, num_clusters)
        and ``count`` is the fitted ``CountVectorizer``.
    """
    # CountVectorizer documents 'stop_words' as a list; recent scikit-learn
    # releases reject other collections such as a set.
    count = CountVectorizer(stop_words=list(fr_stop)).fit(documents)
    t = count.transform(documents).toarray() # num_clusters x num_words
    w = t.sum(axis=1) # num_clusters
    tf = np.divide(t.T, w) # num_words x num_clusters
    D = len(documents) # scalar: number of clusters
    sum_t = t.sum(axis=0) # num_words
    idf = np.log(np.divide(D, sum_t)).reshape(-1, 1) # num_words x 1
    tf_idf = np.multiply(tf, idf) # num_words x num_clusters
    return tf_idf, count

In [None]:
tf_idf, count = CTfIDF(docs_per_topic['document'].values)

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` best-scored words of each topic.

    Args:
        tf_idf: Score matrix of shape (num_words, num_topics).
        count: Fitted vectorizer providing the vocabulary.
        docs_per_topic: Object whose ``topic`` attribute lists the topic ids in
            the same order as the columns of ``tf_idf``.
        n: Number of words kept per topic.

    Returns:
        Dict mapping each topic id to a list of ``(word, score)`` pairs sorted
        by decreasing score.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    get_words = getattr(count, "get_feature_names_out", None) or count.get_feature_names
    words = get_words()
    labels = list(docs_per_topic.topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=10)

In [None]:
top_n_words

In [None]:
nb_words = 10
print("#=================================================== TOPICS ===================================================#")
for ind in range(0,len(topic_size)):
    print( "TOPIC {0} -> ".format(ind-1) + " ".join([word for [word,_] in top_n_words[ind-1][:nb_words]]) )

## 9.3 Affichage des documents appartenant à un topic

In [None]:
# Indices of the documents of every topic, from topic -1 (position 0) upwards
docs_per_topic = [ list(np.flatnonzero(my_doc2topic==k)) for k in range(-1,np.max(my_doc2topic)+1)]

def print_docs_in_topic_(t):
    """Print the key words of topic ``t`` followed by every document assigned to it.

    Relies on the notebook globals ``top_n_words``, ``nb_words``,
    ``docs_per_topic`` and ``docs``.
    """
    key_words = [word for word, _ in top_n_words[t][:nb_words]]
    print( "TOPIC {0} : ".format(t) + " ".join(key_words) ) ; print("")
    # Topic t is stored at position t+1 because the list starts with topic -1
    members = docs_per_topic[t+1]
    for doc_id in members:
        print("[doc ID : {0}] - {1}".format(doc_id,docs[doc_id]))

In [None]:
print_docs_in_topic_(14)