## INTRODUCTION




L’objectif de cette section est d’explorer les **structures latentes** du corpus sans recours aux `Tags` comme variable cible.  
Nous cherchons à comprendre comment les questions se regroupent naturellement selon leur contenu sémantique, afin d’éventuellement **enrichir ou automatiser les suggestions de tags**.

Les techniques utilisées ici incluent :

- **LDA (Latent Dirichlet Allocation)** : pour extraire des *topics* latents et comprendre les thématiques présentes dans les questions
- **Méthodes de Clustering (KMeans, DBSCAN, etc.)** : pour segmenter les questions selon leur similarité vectorielle

Ces approches permettent :
- Une lecture qualitative des **sous-communautés thématiques**
- Une aide à la navigation ou à la classification implicite du corpus
- Une base de travail pour construire des outils de **suggestion de tags intelligents**


## IMPORTS

In [1]:
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import Sparse2Corpus
import joblib
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
from collections import defaultdict

## 1. MODELE LDA

### **1.1. CHARGEMENT DES DONNEES**

In [2]:
# --- CHARGEMENT DES DONNEES
import pandas as pd
df_corpus = pd.read_parquet("data/processed/full_explo_wo.parquet")
corpus = df_corpus["clean_title_body"].tolist()  # Texte fusionné nettoyé

import scipy.sparse
import pickle
X_bow = scipy.sparse.load_npz("models/bow/X_bow_full.npz")
with open("models/bow/vocab_bow_full.pkl", "rb") as f:
    vocab = pickle.load(f)


### **1.2. PREPARATION DES DONNEES**

In [3]:

# --- CREATION DU DICTIONNAIRE GENSIM
id2word = corpora.Dictionary()
id2word.id2token = dict(enumerate(vocab))
id2word.token2id = {v: k for k, v in id2word.id2token.items()}

# --- CONVERSION SPARSE MATRIC → FORMAT GENSIM
corpus_gensim = Sparse2Corpus(X_bow, documents_columns=False)


### **1.3. PARAMETRAGE ET ENTRAINEMENT DU MODELE LDA**

In [4]:

# --- PARAMETRES DU MODELE
num_topics = 10
random_state = 42

# --- ENTRAINEMENT
lda_model = LdaModel(
    corpus=corpus_gensim,
    id2word=id2word,
    num_topics=num_topics,
    random_state=random_state,
    passes=10,
    chunksize=100,
    alpha='auto',
    per_word_topics=True
)

print("# --- Modèle LDA entraîné avec", num_topics, "topics")

# --- AFFICHAGE DES 5 MOTS LES PLUS REPRESENTATIFS PAR TOPIC
for i in range(num_topics):
    print(f"\n# ---  Topic {i}:")
    print([word for word, prob in lda_model.show_topic(i, topn=5)])


# --- Modèle LDA entraîné avec 10 topics

# ---  Topic 0:
['url', 'web', 'client', 'javascript', 'json']

# ---  Topic 1:
['datum', 'time', 'point', 'em', 'end']

# ---  Topic 2:
['android', 'python', 'application', 'app', 'task']

# ---  Topic 3:
['number', 'pre', 'include', 'template', 'difference']

# ---  Topic 4:
['sql', 'query', 'header', 'configuration', 'sort']

# ---  Topic 5:
['thread', 'event', 'copy', 'download', 'custom']

# ---  Topic 6:
['user', 'server', 'database', 'post', 'connection']

# ---  Topic 7:
['image', 'text', 'button', 'ios', 'description']

# ---  Topic 8:
['java', 'reference', 'blockquote', 'windows', 'date']

# ---  Topic 9:
['gt', 'key', 'php', 'item', 'form']


### **1.4. VISUALISATION DES TOPICS**

#### **1.4.1. DISTRIBUTION DES TERMES DANS LES TOPICS**

In [5]:



# --- PREPARATION DES DONNEES POUR LA VISUALISATION
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus_gensim, id2word)

# --- AFFICHAGE
pyLDAvis.display(vis_data)


#### **1.4.2. ATTRIBUTION DU TOPIC DOMINANT A CHAQUE DOCUMENT**

In [6]:
print(df_corpus.columns.tolist())


# --- ATTRIBUTION DU TOPIC DOMINANT A CHAQUE DOCUMENT
topic_assignments = []
for doc_bow in corpus_gensim:
    topic_probs = lda_model.get_document_topics(doc_bow)
    top_topic = sorted(topic_probs, key=lambda x: x[1], reverse=True)[0][0]
    topic_assignments.append(top_topic)

# --- AJOUT AU DATAFRAME
df_corpus["dominant_topic"] = topic_assignments
# --- TODO 
# Après attribution du dominant_topic
# (df_corpus contient déjà les colonnes binaires des tags)
import joblib
mlb_full = joblib.load("models/tags/multilabel_binarizer_full.pkl")
topic_to_tags = (
    df_corpus.groupby("dominant_topic")[mlb_full.classes_]
    .sum()  # somme des 1 par tag dans le topic
    .apply(lambda row: row.sort_values(ascending=False).head(3).index.tolist(), axis=1)
)
display(topic_to_tags)
import joblib
from pathlib import Path
# S'assurer que le répertoire existe
Path("models/lda").mkdir(parents=True, exist_ok=True)
joblib.dump(topic_to_tags, "models/lda/topic_to_tags.pkl")
from pathlib import Path
import joblib
# --- sauvegardes :
# 1) Modèle LDA
lda_model.save("models/lda/lda_model.gensim")
# 2) Dictionnaire (lexique utilisé pour doc2bow)
id2word.save("models/lda/id2word.dict")
# 3) Mapping topic -> tags fréquents (déjà calculé)
Path("models/lda").mkdir(parents=True, exist_ok=True)
joblib.dump(topic_to_tags, "models/lda/topic_to_tags.pkl")
# --- FIN TODO


# --- APERCU
df_corpus[["PostId", "dominant_topic", "clean_title_body"]].head()

df_corpus.to_csv("../data/processed/corpus_topic_assignments.csv", index=False)





dominant_topic
0    [javascript, java, jquery]
1            [python, c#, java]
2         [android, python, c#]
3              [c++, python, c]
4               [sql, c#, java]
5               [c#, c++, .net]
6            [c#, python, java]
7    [ios, objective-c, iphone]
8               [java, c#, c++]
9         [php, c#, javascript]
dtype: object

#### **1.4.3. CLASSEMENT DES QUESTIONS PAR TOPIC**

In [7]:


# --- STRUCTURE POUR STOCKER LES MEILLEURES QUESTIONS PAR TOPIC
top_docs_by_topic = defaultdict(list)

# --- BOUCLE SUR CHAQUE DOCUMENT ( document = ligne du corpus)
for i, doc_bow in enumerate(corpus_gensim):
    topic_probs = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    sorted_topics = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    top_topic, top_score = sorted_topics[0]
    
    # --- STOCKER L'INDEX DU DOCUMENT + SON SCORE SI LE QUOTA N'EST PAS ATTEINT
    if len(top_docs_by_topic[top_topic]) < 3:
        top_docs_by_topic[top_topic].append((i, top_score))

# --- AFFICHAGE
for topic_id in range(num_topics):
    print(f"\n# --- Topic {topic_id} — mots clés :", [word for word, _ in lda_model.show_topic(topic_id, topn=5)])
    for i_doc, score in top_docs_by_topic[topic_id]:
        print(f"# --- Score {score:.3f} — Question:")
        print(df_corpus.loc[i_doc, "clean_title_body"][:250], "...\n")



# --- Topic 0 — mots clés : ['url', 'web', 'client', 'javascript', 'json']
# --- Score 0.755 — Question:
reactive programming different event drive programming learn reactive programming functional reactive programming javascript wikipedia reactive imperative oorp functional event drive reactive reactive programming relate promises alternative event dri ...

# --- Score 0.271 — Question:
password hashing reason jbcrypt plan password hash web application suppose reason ul maven jbcrypt bcrypt downer maven possible jbcrypt password hash local available reason low number ...

# --- Score 0.210 — Question:
reshape datum frame matrix long format x x b x b matrix form feed heatmap plot b x reshape manual able ...


# --- Topic 1 — mots clés : ['datum', 'time', 'point', 'em', 'end']
# --- Score 0.670 — Question:
cluster datum pd pd j b j b plot dataframe index plot axis ax ax ax concat dataset image description order etc guess trick found hr image description cluster order visually x label r

### 1.5. ÉVALUATION ET EXPLOITATION MATRICIELLE DU MODÈLE LDA

#### 1.5.1. SEPARATION DU CORPUS EN TRAIN/TEST

In [None]:
# from sklearn.model_selection import train_test_split
# from gensim.matutils import Sparse2Corpus

# # 1. Séparer les indices des documents (80% train / 20% test ici)
# train_idx, test_idx = train_test_split(
#     range(X_bow.shape[0]),
#     test_size=0.2,
#     random_state=42
# )

# # 2. Extraire les sous-matrices CSR
# X_bow_train = X_bow[train_idx, :]
# X_bow_test  = X_bow[test_idx, :]

# # 3. Convertir en format Gensim (liste de tuples (id_mot, fréquence))
# corpus_train = Sparse2Corpus(X_bow_train, documents_columns=False)
# corpus_test  = Sparse2Corpus(X_bow_test, documents_columns=False)

# print(f"✅ Corpus séparé : {len(train_idx)} documents en train / {len(test_idx)} en test")


In [8]:
# ---------------
# --- CORRECTIF
# ---------------
# 📦 Imports
from sklearn.model_selection import train_test_split
from gensim import corpora

# 🗂 Hypothèse : ton DataFrame complet s'appelle df_corpus 
# et possède une colonne 'clean_title_body' (texte nettoyé)
# et un objet 'mlb_full' déjà fit sur les tags pour créer tags_cols

# 1️⃣ Séparation en DataFrames train/test
df_train, df_test = train_test_split(
    df_corpus,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# 2️⃣ Construction du vocabulaire (Dictionary Gensim)
texts_train = [txt.split() for txt in df_train["clean_title_body"]]
texts_test  = [txt.split() for txt in df_test["clean_title_body"]]

dictionary = corpora.Dictionary(texts_train)  # uniquement sur le train
vocab = [dictionary[i] for i in range(len(dictionary))]  # pour index → mot

# 3️⃣ Conversion BoW
corpus_train = [dictionary.doc2bow(text) for text in texts_train]
corpus_test  = [dictionary.doc2bow(text) for text in texts_test]

# 🔍 Contrôles rapides
print(f"Train docs: {len(df_train)} | Test docs: {len(df_test)}")
print(f"Vocab size: {len(dictionary)}")
print("Exemple BoW train[0]:", corpus_train[0][:10])


Train docs: 7272 | Test docs: 1819
Vocab size: 20503
Exemple BoW train[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 2)]


#### 1.5.2. ENTRAINEMENT DU LDA SUR LE CORPUS TRAIN

In [10]:
from gensim.models import LdaModel

lda_model = LdaModel(
    corpus=corpus_train,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    chunksize=100,
    alpha='auto',
    per_word_topics=True
)

print("✅ LDA entraîné sur corpus Train")


✅ LDA entraîné sur corpus Train


#### 1.5.3 Matrices principales issues du LDA

In [11]:
import numpy as np

# M_topics_words : shape (num_topics, vocab_size)
M_topics_words = lda_model.get_topics()

# M_train_quest_topics : shape (n_train_docs, num_topics)
M_train_quest_topics = np.zeros((len(corpus_train), num_topics))
for i, bow in enumerate(corpus_train):
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
        M_train_quest_topics[i, topic_id] = prob

# M_train_quest_words = M_train_quest_topics × M_topics_words
M_train_quest_words = M_train_quest_topics @ M_topics_words

M_topics_words.shape, M_train_quest_topics.shape, M_train_quest_words.shape


((10, 20503), (7272, 10), (7272, 20503))

#### 1.5.4 Prédiction sur le corpus Test

In [12]:
# M_test_quest_topics
M_test_quest_topics = np.zeros((len(corpus_test), num_topics))
for i, bow in enumerate(corpus_test):
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
        M_test_quest_topics[i, topic_id] = prob

# M_test_quest_words
M_test_quest_words = M_test_quest_topics @ M_topics_words


#### 1.5.5 Application d'un seuil pour filtrer les associations faibles

In [14]:
SEUIL_TOPIC = 0.05
SEUIL_WORD = 0.01

M_train_quest_topics_thresh = np.where(M_train_quest_topics >= SEUIL_TOPIC, M_train_quest_topics, 0)
M_test_quest_topics_thresh  = np.where(M_test_quest_topics  >= SEUIL_TOPIC, M_test_quest_topics, 0)

M_train_quest_words_thresh = np.where(M_train_quest_words >= SEUIL_WORD, M_train_quest_words, 0)
M_test_quest_words_thresh  = np.where(M_test_quest_words  >= SEUIL_WORD, M_test_quest_words, 0)


#### 1.5.6 Mesures orientées métier

In [20]:
# --- Taux de couverture des tags réels ---
from sklearn.metrics import accuracy_score
import unidecode

tags_cols = mlb_full.classes_

# Fonction de normalisation des mots
def normalize(text):
    return unidecode.unidecode(text.lower().strip())

# Recouvrement = proportion des tags d'origine retrouvés dans les mots-clés extraits
def taux_couverture_tags(M_q_w, df_subset):
    topn = 3
    # Indices des top mots par question
    top_word_idx = np.argsort(M_q_w, axis=1)[:, -topn:]

    # Récupération des top mots normalisés
    top_words_per_doc = [
        [normalize(vocab[i]) for i in row[::-1]]  # les plus probables en premier
        for row in top_word_idx
    ]

    couverture = []
    for top_words, tags in zip(top_words_per_doc, df_subset[tags_cols].values):
        mots_tags = set([normalize(t) for t, val in zip(tags_cols, tags) if val == 1])
        intersection = set(top_words) & mots_tags
        couverture.append(len(intersection) / len(mots_tags) if mots_tags else 0)
    return np.mean(couverture)

# Calcul des taux de couverture
cov_train = taux_couverture_tags(M_train_quest_words_thresh, df_train)
cov_test  = taux_couverture_tags(M_test_quest_words_thresh, df_test)

print(f"📊 Couverture tags — Train : {cov_train:.2%} | Test : {cov_test:.2%}")


📊 Couverture tags — Train : 1.13% | Test : 1.04%


#### 1.5.7 Taux de couverture des mots originaux

In [21]:
def taux_couverture_mots(M_q_w, df_subset):
    top_word_idx = np.argmax(M_q_w, axis=1)
    top_words = [vocab[i] for i in top_word_idx]
    cov = []
    for text, word in zip(df_subset["clean_title_body"], top_words):
        cov.append(1 if word in text.split() else 0)
    return np.mean(cov)

cov_words_train = taux_couverture_mots(M_train_quest_words_thresh, df_train)
cov_words_test  = taux_couverture_mots(M_test_quest_words_thresh, df_test)

print(f"📊 Couverture mots — Train : {cov_words_train:.2%} | Test : {cov_words_test:.2%}")


📊 Couverture mots — Train : 17.41% | Test : 17.10%


#### 1.5.8 Vérification qualitative sur échantillon

In [22]:
N = 3
echantillon_idx = np.random.choice(len(df_test), N, replace=False)

for idx in echantillon_idx:
    texte = df_test.iloc[idx]["clean_title_body"]
    mots_cles_idx = np.argsort(M_test_quest_words_thresh[idx])[::-1][:5]
    mots_cles = [vocab[i] for i in mots_cles_idx if M_test_quest_words_thresh[idx, i] > 0]
    print("\n--- Question ---\n", texte[:250], "...")
    print("Mots-clés extraits :", mots_cles)



--- Question ---
 dynamic javascript tell dynamic username abcd customer uname firstname username firstname username reffere inside dynamic ...
Mots-clés extraits : ['user']

--- Question ---
 rail choice database end end rail cover recommend controller rail wonder prefer ...
Mots-clés extraits : ['image']

--- Question ---
 gradle duplicate zipexception integrate zendesk mobile sdk maven anymore clash picasso execution task app packagealldebugclassesformultidex gt zipexception duplicate com squareup picasso app zendesk support appcompat zendesk picasso different exclude ...
Mots-clés extraits : ['datum']


## 2.MODELE MINIBATCHKMEANS

In [None]:
from scipy.sparse import load_npz

X_tfidf = load_npz("models/tfidf/X_tfidf_full.npz")
X_dense = X_tfidf.toarray()  # nécessaire pour GMM

import numpy as np

# Si les vecteurs sont en sparse matrix → conversion vers dense
X_dense = X_tfidf.toarray()
print("✅ Vecteurs convertis → shape :", X_dense.shape)
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA

X_reduced = PCA(n_components=50).fit_transform(X_dense)
kmeans = MiniBatchKMeans(n_clusters=8, batch_size=256, random_state=42)
labels = kmeans.fit_predict(X_reduced)

import pandas as pd
df_clusters_kmeans = pd.DataFrame({
    "document": df_corpus["clean_title_body"],
    "cluster": labels
})

cluster_counts = df_clusters_kmeans["cluster"].value_counts().sort_index()
print("📊 Répartition des documents par cluster :")
print(cluster_counts)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

for c in sorted(df_clusters_kmeans["cluster"].unique()):
    cluster_text = " ".join(df_clusters_kmeans[df_clusters_kmeans["cluster"] == c]["document"])
    wc = WordCloud(width=800, height=400, background_color="white", colormap="tab10").generate(cluster_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Cluster {c} — Wordcloud")
    plt.show()


In [None]:
from sklearn.manifold import TSNE

X_2d = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X_reduced)
plt.figure(figsize=(10, 6))
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='tab10', alpha=0.7)
plt.title("📍 Projection TSNE des clusters MiniBatchKMeans")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.colorbar(label="Cluster")
plt.show()


## ANNEXES