# Topic modeling 
Objective : identify topics emerging from comments

##  Required packages

In [28]:
#import warnings
#warnings.filterwarnings('ignore')

import pandas as pd
import gensim

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models

from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer



## Load data

In [29]:
# load the data
df = pd.read_csv("data_for_tpm.csv")

# check nan values
print(df.isna().sum())

print(df.info())

Unnamed: 0            0
comment               0
rating                0
comment_cleaned       0
comment_lemmatized    0
comment_stem          0
stem_len              0
comment_topics        0
cmt_length            0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22462 entries, 0 to 22461
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          22462 non-null  int64 
 1   comment             22462 non-null  object
 2   rating              22462 non-null  int64 
 3   comment_cleaned     22462 non-null  object
 4   comment_lemmatized  22462 non-null  object
 5   comment_stem        22462 non-null  object
 6   stem_len            22462 non-null  int64 
 7   comment_topics      22462 non-null  object
 8   cmt_length          22462 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.5+ MB
None


In [30]:
# split data into training and test
from sklearn.model_selection import train_test_split
X = df["comment_topics"]
y= df["rating"]

doc_train, doc_test, y_train, y_test = train_test_split(X, y, 
                                                        stratify=y, 
                                                        test_size=0.25)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [31]:
# Join all the comment_topics into a single doc : doc_to_topics
doc_to_topics = [cmt.split() for cmt in doc_train]

# put data into a gensim dictionnary and then convert it to a Bag of Words, 
# that is couple word/"frequency of occurences"

dictionary = gensim.corpora.Dictionary(doc_to_topics)

dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=1000)

bow_corpus = [dictionary.doc2bow(doc) for doc in doc_to_topics]


### LDA implementation

In [18]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics = 10, 
                                       id2word = dictionary, 
                                       passes = 1000)

In [19]:
topics = []
for idx, topic in lda_model.print_topics(-1) :
    print("Topic: {} -> Words: {}".format(idx, topic))
    topics.append(topic)

Topic: 0 -> Words: 0.268*"prix" + 0.084*"rapport" + 0.029*"frais" + 0.028*"paiement" + 0.025*"linge" + 0.019*"meuble" + 0.019*"maison" + 0.018*"réduction" + 0.018*"correct" + 0.017*"port"
Topic: 1 -> Words: 0.061*"efficace" + 0.041*"housse" + 0.039*"couette" + 0.029*"draps" + 0.025*"belle" + 0.024*"satisfait" + 0.024*"redire" + 0.023*"acheté" + 0.021*"aussi" + 0.019*"sérieux"
Topic: 2 -> Words: 0.059*"redoute" + 0.029*"articles" + 0.020*"retour" + 0.019*"site" + 0.015*"trouve" + 0.015*"dommage" + 0.014*"vêtements" + 0.014*"pratique" + 0.012*"autres" + 0.011*"passé"
Topic: 3 -> Words: 0.184*"conforme" + 0.073*"rapidement" + 0.061*"expédition" + 0.059*"article" + 0.045*"description" + 0.038*"attentes" + 0.026*"ravie" + 0.025*"réception" + 0.024*"colis" + 0.022*"vite"
Topic: 4 -> Words: 0.066*"taille" + 0.039*"correspond" + 0.037*"couleur" + 0.029*"site" + 0.027*"commandé" + 0.027*"photo" + 0.024*"petit" + 0.020*"parfaite" + 0.020*"article" + 0.020*"agréable"
Topic: 5 -> Words: 0.077*"ser

### Model coherence

Les mesures de cohérence évaluent le degré de similitude sémantique entre les mots les mieux notés dans le topics. Ces mesures aident à faire la distinction entre les topics sémantiquement interprétables et les topics dû à des inférences statistiques.

Pour un bon modèle LDA la cohérence doit être comprise entre 0.4 et 0.7 au-delà et en dessous le modèle est très probablement erroné. La cohérence pour un modèle LDA est calculée en procédant aux étapes suivantes :

- Segmentation : création de paires de mots à partir de sous-ensembles ;
- Calcul des probabilités : calcul probabilité d’occurrence d’un mot ;
- Mesure de confirmation : vérification « dans quelle mesure » un sous-ensemble de mots supporte un autre sous-ensemble de mots dans chaque paire ;
- Agrégation : agrégation de toutes les valeurs calculées à l’étape précédente en une seule valeur qui est notre score final de cohérence de sujet.

Source: La revue IA 
https://larevueia.fr/latent-dirichlet-allocation-topic-modeling-en-python/

In [20]:
coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=doc_to_topics, 
                                     dictionary=dictionary)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.41560903153425494


xxx

### Save Results

In [21]:
all_topic_model = []
for i in range(len(topics)):
  str = topics[i].split(' + ')
  topic_model = []
  for j in range(10):
    weight = str[j][0:5]
    word = str[j][7:len(str[j])-1]
    topic_model.append((weight, word))
  all_topic_model.append(topic_model)

In [22]:
df_topic_model = pd.DataFrame(all_topic_model)
df_topic_model.rename(index = {0: "Topic 1", 1: "Topic 2", 
                               2: "Topic 3", 3: "Topic 4", 
                               4: "Topic 5", 5: "Topic 6", 
                               6: "Topic 7", 7: "Topic 8", 
                               8: "Topic 9", 9: "Topic 10"})

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic 1,"(0.268, prix)","(0.084, rapport)","(0.029, frais)","(0.028, paiement)","(0.025, linge)","(0.019, meuble)","(0.019, maison)","(0.018, réduction)","(0.018, correct)","(0.017, port)"
Topic 2,"(0.061, efficace)","(0.041, housse)","(0.039, couette)","(0.029, draps)","(0.025, belle)","(0.024, satisfait)","(0.024, redire)","(0.023, acheté)","(0.021, aussi)","(0.019, sérieux)"
Topic 3,"(0.059, redoute)","(0.029, articles)","(0.020, retour)","(0.019, site)","(0.015, trouve)","(0.015, dommage)","(0.014, vêtements)","(0.014, pratique)","(0.012, autres)","(0.011, passé)"
Topic 4,"(0.184, conforme)","(0.073, rapidement)","(0.061, expédition)","(0.059, article)","(0.045, description)","(0.038, attentes)","(0.026, ravie)","(0.025, réception)","(0.024, colis)","(0.022, vite)"
Topic 5,"(0.066, taille)","(0.039, correspond)","(0.037, couleur)","(0.029, site)","(0.027, commandé)","(0.027, photo)","(0.024, petit)","(0.020, parfaite)","(0.020, article)","(0.020, agréable)"
Topic 6,"(0.077, service)","(0.040, temps)","(0.032, client)","(0.031, colis)","(0.028, date)","(0.025, redoute)","(0.022, prévue)","(0.020, jours)","(0.017, prévu)","(0.016, heure)"
Topic 7,"(0.133, choix)","(0.057, site)","(0.056, produits)","(0.046, redoute)","(0.036, articles)","(0.030, depuis)","(0.026, facilité)","(0.022, achats)","(0.020, grand)","(0.019, cliente)"
Topic 8,"(0.162, envoi)","(0.084, retour)","(0.074, article)","(0.055, facilité)","(0.049, achat)","(0.048, problème)","(0.038, contente)","(0.036, efficacité)","(0.034, apprécié)","(0.022, aucun)"
Topic 9,"(0.071, colis)","(0.058, relais)","(0.040, point)","(0.031, reçu)","(0.017, commandé)","(0.014, domicile)","(0.014, livré)","(0.012, lendemain)","(0.012, article)","(0.010, bonjour)"
Topic 10,"(0.157, produits)","(0.122, articles)","(0.081, délai)","(0.077, délais)","(0.062, conformes)","(0.037, suivi)","(0.030, respecté)","(0.025, commandés)","(0.023, respect)","(0.022, ultra)"


### Results Visualization

In [23]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)

In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,comment,rating,comment_cleaned,comment_lemmatized,comment_stem,stem_len,comment_topics,cmt_length
0,0,"Facilte, sollicitations dosées...qualité..on p...",4,facilte sollicitations dosées qualité pourrait...,facilte sollicitation doser qualité pouvoir ce...,facilt sollicit dos qualit pouvoir cepend amél...,558,facilte sollicitations dosées qualité pourrait...,642
1,1,Livraison impeccable avec possibilité de prend...,4,livraison impeccable possibilité prendre samed...,livraison impeccable possibilité prendre samed...,livraison impecc possibil prendr samed matin b...,461,livraison impeccable possibilité prendre samed...,488
2,2,Achat d'un canapé La Redoute intérieurs le 6/9...,4,achat canapé redoute intérieurs site redoute a...,achat canapé redoute intérieur site redoute ab...,achat canap redout intérieur sit redout abord ...,426,achat canapé redoute intérieurs site redoute a...,459
3,3,je commande depuis longtemps chez la redoute; ...,4,commande depuis longtemps redoute appréciais c...,commande depuis longtemps redoute appréciai ch...,command depuis longtemp redout appréci choix r...,370,commande depuis longtemps redoute appréciais c...,417
4,4,"Comme d'habitude, aucun souci et rapidité d'ex...",5,habitude aucun souci rapidité expédition colis...,habitude aucun souci rapidité expédition colis...,habitud aucun souc rapid expédit colis bravo d...,366,habitude aucun souci rapidité expédition colis...,407


### Topic modeling on comments text cleaned and stemmed

In [32]:
# split data into training and test
from sklearn.model_selection import train_test_split
X = df["comment_stem"]
y= df["rating"]

doc_train, doc_test, y_train, y_test = train_test_split(X, y, 
                                                        stratify=y, 
                                                        test_size=0.25)

# Join all the comment_topics into a single doc : doc_to_topics
doc_to_topics = [cmt.split() for cmt in doc_train]

# put data into a gensim dictionnary and then convert it to a Bag of Words, 
# that is couple word/"frequency of occurences"

dictionary = gensim.corpora.Dictionary(doc_to_topics)

dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=1000)

bow_corpus = [dictionary.doc2bow(doc) for doc in doc_to_topics]

# train LDA model
lda_model2 = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics = 10, 
                                       id2word = dictionary, 
                                       passes = 1000)

# Model coherence
coherence_model_lda = CoherenceModel(model=lda_model2, 
                                     texts=doc_to_topics, 
                                     dictionary=dictionary)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

# save result
all_topic_model = []
for i in range(len(topics)):
  str = topics[i].split(' + ')
  topic_model = []
  for j in range(10):
    weight = str[j][0:5]
    word = str[j][7:len(str[j])-1]
    topic_model.append((weight, word))
  all_topic_model.append(topic_model)

# save results into a dataframe
df_topic_model = pd.DataFrame(all_topic_model)
df_topic_model.rename(index = {0: "Topic 1", 1: "Topic 2", 
                               2: "Topic 3", 3: "Topic 4", 
                               4: "Topic 5", 5: "Topic 6", 
                               6: "Topic 7", 7: "Topic 8", 
                               8: "Topic 9", 9: "Topic 10"})

# visualize results
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model2, bow_corpus, dictionary)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Coherence Score:  0.5083791059590385


In [33]:
# visualize results
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model2, bow_corpus, dictionary)

In [45]:
topics

['0.268*"prix" + 0.084*"rapport" + 0.029*"frais" + 0.028*"paiement" + 0.025*"linge" + 0.019*"meuble" + 0.019*"maison" + 0.018*"réduction" + 0.018*"correct" + 0.017*"port"',
 '0.061*"efficace" + 0.041*"housse" + 0.039*"couette" + 0.029*"draps" + 0.025*"belle" + 0.024*"satisfait" + 0.024*"redire" + 0.023*"acheté" + 0.021*"aussi" + 0.019*"sérieux"',
 '0.059*"redoute" + 0.029*"articles" + 0.020*"retour" + 0.019*"site" + 0.015*"trouve" + 0.015*"dommage" + 0.014*"vêtements" + 0.014*"pratique" + 0.012*"autres" + 0.011*"passé"',
 '0.184*"conforme" + 0.073*"rapidement" + 0.061*"expédition" + 0.059*"article" + 0.045*"description" + 0.038*"attentes" + 0.026*"ravie" + 0.025*"réception" + 0.024*"colis" + 0.022*"vite"',
 '0.066*"taille" + 0.039*"correspond" + 0.037*"couleur" + 0.029*"site" + 0.027*"commandé" + 0.027*"photo" + 0.024*"petit" + 0.020*"parfaite" + 0.020*"article" + 0.020*"agréable"',
 '0.077*"service" + 0.040*"temps" + 0.032*"client" + 0.031*"colis" + 0.028*"date" + 0.025*"redoute" + 0.