In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# possible variation de modèle :
# from bertopic.representation import KeyBERTInspired


In [2]:
import pandas as pd
# surveillance des GPU/RAm/etc.
import GPUtil

# vectorisation des textes
from sklearn.feature_extraction.text import CountVectorizer
import torch

# error ou warning : c:\Users\ehess\anaconda3\envs\my4env\lib\site-packages\umap\distances.py:1063: 
# NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. 
# The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. 
# See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.@numba.jit()

In [3]:
import spacy
from thinc.api import set_gpu_allocator, require_gpu, prefer_gpu, use_pytorch_for_gpu_memory, set_active_gpu
set_gpu_allocator("pytorch")
require_gpu() 
set_active_gpu(0)
if prefer_gpu():
    print("Using GPU!")
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    use_pytorch_for_gpu_memory()

    print("GPU Usage")
    GPUtil.showUtilization()

Using GPU!
GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  4% |  9% |


In [4]:
# pour un ordi avec GPU
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    print("Using GPU with spacy!")
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
    print("GPU Usage")
    GPUtil.showUtilization()

Using GPU with spacy!
GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  3% |  9% |


In [2]:
# df = pd.read_pickle("pickle_sentences_sentences.pkl") 
df = pd.read_csv("df_CC_cleaned.csv", sep = ",", encoding = "utf-8", dtype= str)
# df = pd.read_csv("df_CC_lemmas_postag.csv", sep = ",", encoding = "utf-8", dtype= str)

In [None]:
# après recherches, les NaN qu'on retrouve ici sont produites au moment du split et sont des espaces.

print("y a t il des nan? ->",df.isnull().values.any())
print("combien y a t il de nan? ->",df.isnull().values.sum())
print("où sont les null? ->\n",df.isnull().sum())
df[df.isnull().T.any()] # pour afficher les lignes qui contiennent des null

# on choisit de supprimer les rangs contenant une NaN.
# l'index est remis à jour, on conserve les information liées aux contributions
df.dropna(inplace = True, ignore_index = True)

print("y a t il des nan? ->",df.isnull().values.any())
print("combien y a t il de nan? ->",df.isnull().values.sum())
print("où sont les null? ->\n",df.isnull().sum())


In [8]:
# randomize all lines or a portion (0.02% ~= 480 000 sentences)
df = df.sample(frac= 0.4)
df = df.astype(str)
df["lemmas_no_stopwords"].to_csv("40_percent_for_100_clustersize_randomized_corpus.csv")
# turn sentences column into a list
# text_list = df.phrases.tolist()
text_list = df.lemmas_no_stopwords.tolist()


# delete dataframe
del df

GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  0% |  4% |


In [9]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [10]:
print(device)

cuda:0


In [11]:
# dl embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device = "cuda") # il trouve la device

# commande bash:  nvidia-smi -l 2 pour connaitre l'activité du gpu en temps réel

In [12]:
# Step 1 - Extract embeddings
# the trained Stence Transformer embedding produced with the La Javaness procedure, from CamemBert didn t work :

# embedding_model = SentenceTransformer(os.path.join('/Users/ehess/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large/', 'sentence_bert_config.json'))
# '/Users/ehess/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large'

# pour telecharger :
# from transformers.pipelines import pipeline
# embedding_model = pipeline("feature-extraction", model="dangvantuan/sentence-camembert-large")


In [13]:
# Step 1 - Extract embeddings
# WARNING : execute in ~15mn on sentences or 1 hour on contributions

embeddings = embedding_model.encode(text_list, batch_size=128, show_progress_bar=True)

Batches:   0%|          | 0/700 [00:00<?, ?it/s]

In [14]:
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  0% | 17% |


In [15]:
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')


In [16]:

# Step 3 - Cluster reduced embeddings
# min_cluster_size parameter allow us to choose how cluster are precise or larges
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom')
#hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [17]:
# Step 4 - Tokenize topics

# vectorizer_model= CountVectorizer(tokenizer=lemmatize)
vectorizer_model = CountVectorizer(ngram_range=(1,2))


In [18]:

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()


In [19]:
# Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model = the previous model

# representation_model = KeyBERTInspired()

In [20]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"


In [21]:
# HDBscan : test avec 5 taille de clusters

# taille 1 pour avoir tous les topics générés

# grid_parametres = [None,40,50,80,100]
grid_parametres = [100]



for size in grid_parametres : 
    print(f"run {size} cluster size try")
    # new HDBSCAN with new size cluster limit
    if size == None : 
        hdbscan_model = HDBSCAN(metric='euclidean', cluster_selection_method='eom')
    else :
        hdbscan_model = HDBSCAN(min_cluster_size= size, metric='euclidean', cluster_selection_method='eom')

    topic_model = BERTopic(
    top_n_words = 40, # 10 is default, we ask more words for qualitative evaluation
    n_gram_range = (1,3), # (1,1) is default, we ask a larger range
    nr_topics= "auto",
    low_memory = True, 
    calculate_probabilities = False,
    embedding_model=embedding_model,          # Step 1 - Extract embeddings
    umap_model=umap_model,                    # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
    verbose = True
)
    GPUtil.showUtilization()
    print(1)
    topics, probs = topic_model.fit_transform(text_list)
    GPUtil.showUtilization()
    print(2)

    dd = [[item[0] if item[0] else pd.NA for item in topics]for topics in topic_model.get_topics().values()]
    GPUtil.showUtilization()
    print(3)

    ded = pd.DataFrame(dd)
    freq = topic_model.get_topic_info()
    GPUtil.showUtilization()
    print(4)


    output = pd.concat([freq, ded], axis=1)
    output.to_csv(f'echantillon_10000_dossier_topics/words_size_{size}_cluster_BERTopic.csv', index=False)
    GPUtil.showUtilization()
    print(5)


    topic_model.save(f"echantillon_10000_dossier_topics/size_{size}_cluster_model_saved")
    




run 100 cluster size try
| ID | GPU | MEM |
------------------
|  0 |  0% | 17% |
1


Batches:   0%|          | 0/2800 [00:00<?, ?it/s]

2023-07-03 17:24:54,819 - BERTopic - Transformed documents to Embeddings
2023-07-03 17:25:48,174 - BERTopic - Reduced dimensionality
2023-07-03 17:25:52,722 - BERTopic - Clustered reduced embeddings
2023-07-03 17:27:04,464 - BERTopic - Reduced number of topics from 59 to 59


| ID | GPU | MEM |
------------------
|  0 |  0% | 17% |
2
| ID | GPU | MEM |
------------------
|  0 |  0% | 17% |
3
| ID | GPU | MEM |
------------------
|  0 |  0% | 17% |
4
| ID | GPU | MEM |
------------------
|  0 |  0% | 17% |
5


  self._set_arrayXarray(i, j, x)


: 

In [None]:
topic_model = BERTopic.load("echantillon_10000_dossier_topics/size_30_cluster_model_saved")
df_used_by_model = pd.read_csv("randomized_40_percent_of_corpus.csv", sep = ",", encoding = "utf-8", dtype= str)
text_list = df_used_by_model.tolist()
freq = topic_model.get_topic_info()
print(freq.head(11))

In [32]:
# WARNING : all visualization tools are very heavy on RAM


fig = topic_model.visualize_barchart(top_n_topics=30)
fig.show()
fig.write_html("echantillon_10000_dossier_topics/size_30_cluster_barchart_topics.html")

In [34]:
# voir le détail des mots associés à un topic (defini par son num)
topic_model.get_topic(0)

[('il', 0.007070121588766829),
 ('faut', 0.0066121358873354255),
 ('faut il', 0.005641509961233225),
 ('est', 0.0045349400210977255),
 ('qu', 0.004507410099045476),
 ('faire', 0.004433543258651229),
 ('vous', 0.004392656876228051),
 ('comment', 0.0041767477362925414),
 ('citoyens', 0.0038521590783650163),
 ('non', 0.003598917636373227),
 ('pays', 0.003555641536660977),
 ('france', 0.0035445696091787656),
 ('français', 0.003302800299405365),
 ('vote', 0.0033000332571366964),
 ('élus', 0.003198314564655236),
 ('impôts', 0.0031251215453745934),
 ('président', 0.003088044927521465),
 ('qu il', 0.0029506933504021707),
 ('un', 0.002950087420717281),
 ('référendum', 0.002901842082570292),
 ('on', 0.002893446803246412),
 ('impôt', 0.0028393993462230973),
 ('dépenses', 0.0028163000204135683),
 ('nombre', 0.002799589193040574),
 ('république', 0.0027981434454988677),
 ('une', 0.0027587089100708853),
 ('démocratie', 0.0027575791773963923),
 ('trop', 0.002710687720352136),
 ('bien', 0.002685481753

In [35]:

# Two options :

# Run the visualization with the original embeddings
# topic_model.visualize_documents(text_list, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively: 
# reduced_embeddings = reducer(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# here i am reducing embeddings and creating a list of topics id for lowering the time of the map computation
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [36]:
# map of the topics on similarity

figure_topic = topic_model.visualize_documents(text_list, reduced_embeddings=reduced_embeddings)
# images_topic_echantillon

# topic_model.visualize_documents(text_list, embeddings)

figure_topic.write_html("echantillon_10000_dossier_topics/30_figure_topic_docs.html")

In [None]:
figure_topic.show()

In [33]:
# affiche tous les topics et leurs mots + scores
topic_model.get_topics()

{-1: [('est', 0.004197167521441983),
  ('qu', 0.0040315089696356336),
  ('faire', 0.0033999225234716455),
  ('france', 0.0033296646230078428),
  ('il', 0.0031385060467027143),
  ('vie', 0.0029432352353566385),
  ('retraites', 0.0028619389988845624),
  ('un', 0.0027680181013419927),
  ('français', 0.0027037239917640384),
  ('non', 0.0026813025962449384),
  ('état', 0.0026685227422978746),
  ('retraite', 0.0026169766146064053),
  ('pays', 0.0026087921468878287),
  ('bien', 0.0025465368154685515),
  ('une', 0.0025341062469401094),
  ('ans', 0.002533949282115569),
  ('faut', 0.002488050047001365),
  ('travail', 0.0024683884391676248),
  ('pouvoir', 0.0024542533868702645),
  ('suppression', 0.00241944089991349),
  ('salaires', 0.0023258895166473298),
  ('on', 0.002300697266687274),
  ('personnes', 0.0022194810484827375),
  ('en', 0.002216885399860198),
  ('services', 0.0022106151853563774),
  ('président', 0.0022102157655341857),
  ('2019', 0.0021641705654279956),
  ('impôts', 0.00215526926

In [None]:
fig = topic_model.visualize_topics()
fig.show()

In [53]:
fig.write_html("intertopic_distancemap_BERTopics.html")

In [59]:
first_100_topics = list(range(100))

In [27]:
hierarchical_topics = topic_model.hierarchical_topics(text_list)


100%|██████████| 140/140 [00:58<00:00,  2.38it/s]


In [28]:
hierarchie = topic_model.visualize_hierarchy()

In [29]:
hierarchie.show()