<a href="https://colab.research.google.com/github/ProfAI/nlp00/blob/master/7%20-%20Topic%20modelling/topic_modelling_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic modelling

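Dataset: ABC News headlines (one row per headline, with `publish_date` and `headline_text` columns). Source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/SYBGZL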

In [14]:
!wget https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv

--2019-04-16 16:05:50--  https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54096356 (52M) [text/plain]
Saving to: ‘abcnews-date-text.csv’


2019-04-16 16:05:51 (160 MB/s) - ‘abcnews-date-text.csv’ saved [54096356/54096356]



In [22]:
import pandas as pd
from tqdm import tqdm

# Load the downloaded CSV into a DataFrame and preview its first five rows.
csv_path = "abcnews-date-text.csv"
headlines_df = pd.read_csv(csv_path)
headlines_df.head(5)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [16]:
# Keep a 10% random sample of the rows, seeded so the same rows are drawn each time.
# NOTE(review): this rebinds headlines_df, so executing the cell twice in the same
# kernel samples the already-sampled frame.
headlines_df = headlines_df.sample(random_state=0, frac=0.1)
headlines_df.shape

(108217, 2)

In [18]:
# Choose the text column to model. "headline_processed" only exists if a
# preprocessing step has added it; the frame produced by read_csv carries the raw
# text in "headline_text", so fall back to that instead of raising KeyError.
text_column = (
    "headline_processed"
    if "headline_processed" in headlines_df.columns
    else "headline_text"
)

# Plain array of headline strings, followed by a three-item preview.
headlines = headlines_df[text_column].values
headlines[:3]

array(['labor attacks nationals mp over milk link',
       'coraki public school hip hop video', 'dairy record'], dtype=object)

## Bag of Words

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts: English stop words are dropped and the vocabulary is
# capped at 5000 terms. The last line shows the shape of the resulting matrix.
bow = CountVectorizer(stop_words="english", max_features=5000)
X = bow.fit_transform(headlines)
X.shape

(110366, 5000)

In [0]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Fit a 10-topic LDA model on the bag-of-words matrix; fit_transform also returns
# the per-document topic weights.
# random_state is fixed so the fitted topics are the same on every run; without it
# the model starts from a different random initialisation each time.
lda = LDA(n_components=10, max_iter=10, random_state=0, verbose=True)
data = lda.fit_transform(X)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


array([[0.02500519, 0.025     , 0.025     , ..., 0.025     , 0.02500579,
        0.52500008],
       [0.01667313, 0.01668382, 0.01666667, ..., 0.01666695, 0.01666964,
        0.51664108],
       [0.3756053 , 0.02000043, 0.02      , ..., 0.02000127, 0.02000513,
        0.02      ],
       ...,
       [0.01667285, 0.01666707, 0.183333  , ..., 0.01666667, 0.01666711,
        0.01666667],
       [0.01666695, 0.01666667, 0.01666667, ..., 0.01666667, 0.5166664 ,
        0.3499999 ],
       [0.05      , 0.05      , 0.05      , ..., 0.54999104, 0.05      ,
        0.05      ]])

In [0]:
# First ten vocabulary terms of the bag-of-words vectorizer.
# get_feature_names() has been removed from scikit-learn; get_feature_names_out()
# is its replacement. tolist() keeps the output a plain Python list, as before.
bow.get_feature_names_out()[:10].tolist()

['10', '100', '1000', '100m', '10m', '11', '12', '13', '14', '15']

In [0]:
# Shape of the first topic's row in the fitted components matrix.
lda.components_[0, :].shape

(5000,)

In [0]:
# Indices of the ten largest weights of the first topic. argsort orders ascending,
# so the last ten positions hold the heaviest entries (largest one last).
topic = lda.components_[0]
ranking = topic.argsort()
top_words = ranking[-10:]

In [0]:
# Print the vocabulary term behind each selected index.
# The vocabulary is resolved once rather than on every iteration, and through
# get_feature_names_out() because get_feature_names() has been removed from
# scikit-learn.
feature_names = bow.get_feature_names_out()
for i in top_words:
  print(feature_names[i])

threat
pay
port
open
deal
opposition
house
health
school


In [0]:
# Number of top terms to list per topic.
n_words = 15

# Resolve the vocabulary once instead of once per printed word, and through
# get_feature_names_out() because get_feature_names() has been removed from
# scikit-learn.
feature_names = bow.get_feature_names_out()

for index, topic in enumerate(lda.components_):
  print("\nTOPIC %d - %d parole più popolari" % (index+1, n_words))
  # argsort is ascending, so the last n_words indices are the heaviest terms;
  # tolist() prints them as plain Python strings.
  print(feature_names[topic.argsort()[-n_words:]].tolist())


TOPIC 1 - 15 parole più popolari

TOPIC 2 - 15 parole più popolari
['says', 'report', 'risk', 'tasmanian', 'workers', 'climate', 'work', 'continue', 'debate', 'continues', 'final', 'study', 'missing', 'change', 'search']

TOPIC 3 - 15 parole più popolari
['residents', 'urges', 'rural', 'help', 'flood', 'guilty', 'public', 'calls', 'sa', 'farmers', 'urged', 'qld', 'home', 'govt', 'water']

TOPIC 4 - 15 parole più popolari
['mp', 'ban', 'government', 'industry', 'australian', 'iraq', 'talks', 'pm', 'claims', 'election', 'labor', 'says', 'minister', 'govt', 'australia']

TOPIC 5 - 15 parole più popolari
['bomb', 'pakistan', 'plane', 'gas', 'kills', 'china', 'bail', 'people', 'car', 'attack', 'crash', 'injured', 'woman', 'dead', 'killed']

TOPIC 6 - 15 parole più popolari
['race', 'long', 'nrl', 'media', 'return', 'man', 'face', 'told', 'drug', 'faces', 'charges', 'sydney', 'accused', 'court', 'interview']

TOPIC 7 - 15 parole più popolari
['lead', 'station', 'car', 'rate', 'cut', 'queens

## TF-IDF

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features with the same settings as the bag-of-words run:
# English stop words dropped, vocabulary capped at 5000 terms.
# NOTE: X is rebound here, replacing the bag-of-words matrix.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf.fit_transform(headlines)
X.shape

(110366, 5000)

In [0]:
# Refit a 10-topic LDA model, this time on the TF-IDF matrix (rebinds lda and data).
# random_state is fixed so the fitted topics are the same on every run; without it
# the model starts from a different random initialisation each time.
lda = LDA(n_components=10, max_iter=10, random_state=0, verbose=True)
data = lda.fit_transform(X)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [0]:
# Number of top terms to list per topic.
n_words = 15

# This model was fitted on the TF-IDF matrix, so its component columns must be
# mapped through the TF-IDF vectorizer's vocabulary (not the bag-of-words one).
# The vocabulary is resolved once, via get_feature_names_out() because
# get_feature_names() has been removed from scikit-learn.
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(lda.components_):
  print("\nTOPIC %d - %d parole più popolari" % (index+1, n_words))
  # argsort is ascending, so the last n_words indices are the heaviest terms;
  # tolist() prints them as plain Python strings.
  print(feature_names[topic.argsort()[-n_words:]].tolist())


TOPIC 1 - 15 parole più popolari
['film', 'drug', 'students', 'new', 'train', 'art', 'sri', 'cleared', '2014', '2015', 'broken', 'festival', 'hill', 'hour', 'country']

TOPIC 2 - 15 parole più popolari
['mark', 'tour', 'new', 'political', 'drum', 'minister', 'john', 'markets', 'abbott', 'nrn', 'takes', 'australia', 'climate', 'says', 'change']

TOPIC 3 - 15 parole più popolari
['river', 'bush', 'bushfire', 'flood', 'new', 'flu', 'korea', 'trial', 'asylum', 'cyclone', 'east', 'west', 'coast', 'south', 'north']

TOPIC 4 - 15 parole più popolari
['united', 'grand', 'election', 'cup', 'labor', 'cattle', 'released', 'carbon', 'party', 'aussies', 'tax', 'victory', 'final', 'trade', 'win']

TOPIC 5 - 15 parole più popolari
['development', 'councils', 'regional', 'group', 'energy', 'pacific', 'study', 'water', 'farm', 'urged', 'new', 'funds', 'plan', 'govt', 'council']

TOPIC 6 - 15 parole più popolari
['assault', 'child', 'search', 'jailed', 'charges', 'death', 'woman', 'sex', 'accused', 'mi

## Visualizzare il modello

In [0]:
!pip install pyldavis

Collecting pyldavis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 14.6MB/s 
Collecting funcy (from pyldavis)
  Downloading https://files.pythonhosted.org/packages/47/a4/204fa23012e913839c2da4514b92f17da82bf5fc8c2c3d902fa3fa3c6eec/funcy-1.11-py2.py3-none-any.whl
Building wheels for collected packages: pyldavis
  Building wheel for pyldavis (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyldavis
Installing collected packages: funcy, pyldavis
Successfully installed funcy-1.11 pyldavis-2.1.2


In [0]:
import pyLDAvis

# Newer pyLDAvis releases ship the scikit-learn adapter as pyLDAvis.lda_model;
# older ones expose it as pyLDAvis.sklearn. Try the new name first and fall back,
# so the cell works with either (ModuleNotFoundError is an ImportError).
try:
    import pyLDAvis.lda_model as sklearn_lda_vis
except ImportError:
    import pyLDAvis.sklearn as sklearn_lda_vis

# Render the prepared visualisation inline when it is the cell's last expression.
pyLDAvis.enable_notebook()

# Build the interactive topic view from the TF-IDF LDA model, its document-term
# matrix and the vectorizer that produced it, using t-SNE for the topic layout.
sklearn_lda_vis.prepare(lda, X, tfidf, mds='tsne')

ModuleNotFoundError: ignored