<a href="https://colab.research.google.com/github/OpetherMB/grand-debat/blob/main/topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from pprint import pprint
import pickle


nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import  word_tokenize


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LsiModel
from gensim.models import CoherenceModel,HdpModel


# spacy for lemmatization
import spacy


# Plotting tools
!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim 
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
pyLDAvis.enable_notebook()

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


import re
import os
pd.set_option('max_colwidth', 400)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 5.3MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/66/89/479de0afbbfb98d1c4b887936808764627300208bb771fcd823403645a36/funcy-1.15-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=68da68d304d53ce1f3767ea9d4a5de6022359aa07f1402c61617641338579680
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Su

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab Notebooks/grand debat
!pwd

/content/drive/MyDrive/Colab Notebooks/grand debat
/content/drive/MyDrive/Colab Notebooks/grand debat


In [4]:
# local lib

import util_topicModelling as  tpc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Prepare data already cleaned and grouped

In [5]:
cleaned_data = pd.read_csv('clean_environement.csv', low_memory=False, dtype = str)
cleaned_data.fillna('')
cleaned_data.head()

Unnamed: 0.1,Unnamed: 0,authorZipCode,title,problems,reponse_probleme
0,0,97231,transition ecologique,,
1,1,57000,surpopulation,problemes auxquels trouve confronte ensemble planete denoncent parfait desordre gilets jaunes france dus avant surpopulation mondiale cette population passee milliards habitants milliards montera bientot milliards vers progres communication village mondial chaque individu fin fond asie fin fond afrique passant quartiers campagnes pays aspire vivre peu bl mer mal lotis concitoyens logement nour...,problemes auxquels trouve confronte ensemble planete denoncent parfait desordre gilets jaunes france dus avant surpopulation mondiale cette population passee milliards habitants milliards montera bientot milliards vers progres communication village mondial planete chaque individu fin fond asie fin fond afrique passant quartiers campagnes pays aspire vivre peu bl mer mal lotis concitoyens logem...
2,2,34140,climat,dereglements climatiques crue secheresse,eviter inondations obliger riverains communes nettoyer secteur
3,3,17400,pollution air eau,pollution air,taxer gros pollueurs entreprises bateaux marchands routiers etc
4,4,35430,economie vs ecologie,biodiversite disparition certaines especes,changer mode vie impulser nouvelle economie agricole


## Topic Modeling with Title column

### Build Model with non grouped data

In [6]:
list(cleaned_data['title'][2470:2475])

['vraie transition ecologique',
 'energies alternatives',
 'voiture avion paquebots portes contenairs',
 'mobilite',
 nan]

In [7]:
cleaned_data.fillna('',inplace=True)
count_nan = len(cleaned_data['title']) - cleaned_data['title'].count()
count_nan

0

# Using LDA

lets create a simple topic model with 8 topics using  Latent Dirichlet Allocation

### LDA with title column

In [8]:
data = list(cleaned_data['title'])
lda_model, corpus, id2word = tpc.get_topics(data, n_topics=8)

[(0,
  '0.056*"transports" + 0.031*"developper" + 0.030*"vie" + 0.029*"enfants" + '
  '0.029*"energies" + 0.027*"avenir" + 0.025*"taxe" + 0.020*"terre" + '
  '0.019*"carbone" + 0.015*"ecologiques"'),
 (1,
  '0.167*"ecologie" + 0.034*"taxer" + 0.026*"etre" + 0.025*"pollueurs" + '
  '0.023*"economie" + 0.014*"responsable" + 0.014*"citoyen" + '
  '0.012*"deplacements" + 0.012*"modele" + 0.011*"rendre"'),
 (2,
  '0.029*"agir" + 0.026*"climat" + 0.013*"temps" + 0.013*"ferroutage" + '
  '0.013*"toutes" + 0.013*"grand" + 0.013*"societe" + 0.013*"sante" + '
  '0.012*"conscience" + 0.012*"oui"'),
 (3,
  '0.041*"urgence" + 0.032*"climatique" + 0.029*"priorite" + 0.028*"produits" '
  '+ 0.022*"electrique" + 0.020*"contribution" + 0.019*"economique" + '
  '0.015*"voiture" + 0.013*"rechauffement" + 0.013*"mettre"'),
 (4,
  '0.064*"planete" + 0.035*"consommation" + 0.025*"reduire" + '
  '0.023*"vehicules" + 0.018*"limiter" + 0.017*"interdire" + 0.017*"sauver" + '
  '0.016*"education" + 0.014*"preser

In [9]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

on peut voir les different cluster quir represente les topic et ainsi  la frequence elvee des terms ...  ecologie , pollution, envirenment

## Topic Modeling For Problems column

In [10]:
cleaned_problems = cleaned_data['problems']
cleaned_problems.fillna('',inplace=True)
lda_model, corpus, id2word = tpc.get_topics(cleaned_problems, n_topics=5)

[(0,
  '0.021*"problemes" + 0.018*"importants" + 0.014*"lies" + 0.012*"ressources" '
  '+ 0.011*"autres" + 0.007*"propositions" + 0.006*"produits" + 0.006*"points" '
  '+ 0.006*"naturelles" + 0.005*"planete"'),
 (1,
  '0.398*"pollution" + 0.359*"air" + 0.026*"eau" + 0.014*"sols" + '
  '0.009*"energie" + 0.007*"fois" + 0.006*"impossible" + 0.005*"degradation" + '
  '0.005*"cause" + 0.005*"urgence"'),
 (2,
  '0.252*"biodiversite" + 0.238*"disparition" + 0.235*"especes" + '
  '0.225*"certaines" + 0.000*"accelerant" + 0.000*"ruralite" + '
  '0.000*"entrepreneuriat" + 0.000*"obsolescence" + 0.000*"programmee" + '
  '0.000*"obsolesence"'),
 (3,
  '0.024*"climatique" + 0.020*"dechets" + 0.016*"probleme" + 0.016*"erosion" + '
  '0.016*"toutes" + 0.015*"environnement" + 0.014*"dereglement" + '
  '0.014*"littoral" + 0.013*"important" + 0.013*"etre"'),
 (4,
  '0.249*"climatiques" + 0.248*"dereglements" + 0.233*"secheresse" + '
  '0.233*"crue" + 0.000*"analyser" + 0.000*"tremblements" + 0.000*"dis

In [11]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

La colonne des problèmes a donné de meilleurs résultats que le title qui semble logique puisque c'est celui qui contient le plus probablement des problèmes.

## Topic Modeling For Problems response column



In [12]:
data = list(cleaned_data['reponse_probleme'])
lda_model, corpus, id2word = tpc.get_topics(data, n_topics=5)

[(0,
  '0.011*"monde" + 0.010*"planete" + 0.010*"animaux" + 0.009*"nature" + '
  '0.008*"o" + 0.008*"especes" + 0.008*"autre" + 0.007*"aides" + 0.007*"hui" + '
  '0.007*"aujourd"'),
 (1,
  '0.018*"eau" + 0.016*"vie" + 0.011*"ville" + 0.010*"travail" + 0.009*"mieux" '
  '+ 0.007*"enfants" + 0.007*"qualite" + 0.006*"projets" + 0.006*"chaque" + '
  '0.005*"loi"'),
 (2,
  '0.018*"etre" + 0.011*"france" + 0.011*"pays" + 0.010*"ecologique" + '
  '0.008*"transition" + 0.008*"etat" + 0.008*"niveau" + 0.007*"cette" + '
  '0.007*"politique" + 0.006*"meme"'),
 (3,
  '0.026*"produits" + 0.016*"interdire" + 0.016*"consommation" + '
  '0.015*"limiter" + 0.015*"agriculture" + 0.015*"favoriser" + '
  '0.014*"energies" + 0.013*"production" + 0.013*"reduire" + 0.012*"arreter"'),
 (4,
  '0.024*"transports" + 0.018*"pollution" + 0.017*"transport" + 0.017*"taxer" '
  '+ 0.016*"entreprises" + 0.013*"developper" + 0.013*"vehicules" + '
  '0.012*"commun" + 0.011*"voiture" + 0.010*"villes"')]

Perplexity:  -8.

In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

Après avoir construit des sujets en utilisant différentes colonnes, la colonne "probleme" a donné les meilleurs résultats et les sujets les plus compréhensibles. De plus, elle présentait le meilleur facteur de cohérence.

### NOTE : ce qui serais interessant c'est de faire une comparaison lda avec la version clean et version lemmetized ... à faire dans le future