# Latent Dirichlet Allocation

In [1]:
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the raw corpus. header=None tells pandas the file has no header row,
# so the first line is read as data instead of being used for column names.
data = pd.read_csv('data', sep=",", header=None)

# Label the column explicitly; a one-element list only fits a one-column frame.
data.columns = ['text']

# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...
...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...


The data is a collection of emails that are not labelled. Let's try to extract topics from them!

## Preprocessing 

Cleaning up and storing the cleaned text in a new dataframe column "clean_text".

In [3]:
def clear_punctuation_lower(text):
    """Return ``text`` with ASCII punctuation deleted and letters lower-cased.

    Every character in ``string.punctuation`` is removed outright; nothing is
    inserted in its place, so pieces joined only by punctuation are merged
    (e.g. ``"a.b"`` becomes ``"ab"``). Whitespace, including newlines, is kept.
    """
    import string

    # One translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table).lower()

In [4]:
# Keep the original message in `text` and store its punctuation-free,
# lower-cased version next to it in `clean_text`.
data['clean_text'] = data['text'].apply(clear_punctuation_lower)
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,from gldcunixbcccolumbiaedu gary l dare\nsubje...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,from atterlepvelaacsoaklandedu cardinal ximene...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,from minerkuhubccukansedu\nsubject re ancient ...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,from atterlepvelaacsoaklandedu cardinal ximene...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,from vzhivovsuperiorcarletonca vladimir zhivov...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,from jerrybeskimocom jerry kaufman\nsubject re...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,from golchowyalchemychemutorontoca gerald olch...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,from jaynemmaltguildorg jayne kulikauskas\nsub...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,from sclarkepasutorontoca susan clark\nsubject...


In [5]:
def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split with NLTK's ``word_tokenize``; every token that is not
    an exact (case-sensitive) member of ``stopwords.words('english')`` is kept,
    and the kept tokens are returned joined by single spaces.
    """
    # NOTE(review): this set is rebuilt on every call; when applying the
    # function to a whole column it may be worth building it once outside.
    english_stops = set(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(text):
        if token not in english_stops:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [6]:
# Replace each cleaned message with its stop-word-free version.
data['clean_text'] = data['clean_text'].apply(remove_stopwords)
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,gldcunixbcccolumbiaedu gary l dare subject sta...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,minerkuhubccukansedu subject ancient books org...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,vzhivovsuperiorcarletonca vladimir zhivov subj...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,jerrybeskimocom jerry kaufman subject prayers ...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,golchowyalchemychemutorontoca gerald olchowy s...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,jaynemmaltguildorg jayne kulikauskas subject q...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,sclarkepasutorontoca susan clark subject picks...


In [7]:
# Remove numbers: the string is scanned character by character, so every
# digit character is dropped (including digits embedded in longer tokens)
# while all other characters, spaces included, are kept as they are.
data['clean_text'] = data['clean_text'].apply(
    lambda text: ''.join(char for char in text if not char.isdigit()))
data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,gldcunixbcccolumbiaedu gary l dare subject sta...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,minerkuhubccukansedu subject ancient books org...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,atterlepvelaacsoaklandedu cardinal ximenez sub...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,vzhivovsuperiorcarletonca vladimir zhivov subj...
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,jerrybeskimocom jerry kaufman subject prayers ...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,golchowyalchemychemutorontoca gerald olchowy s...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,jaynemmaltguildorg jayne kulikauskas subject q...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,sclarkepasutorontoca susan clark subject picks...


## Latent Dirichlet Allocation model

👇 Train an LDA model to extract potential topics.

In [8]:
N_TOPICS = 3   # number of latent topics to extract
SEED = 42      # fixed seed so the topics are reproducible across kernel restarts

# Turn the cleaned messages into a TF-IDF document-term matrix.
# fit_transform learns the vocabulary and vectorises in a single pass;
# `vectorizer` stays fitted so later cells can transform new text with it.
# NOTE(review): LDA is usually fitted on raw term counts; consider comparing
# against a count-based vectorizer.
vectorizer = TfidfVectorizer()
data_vectorized = vectorizer.fit_transform(data['clean_text'])

# Without random_state the fit starts from a different random initialisation
# on every run, so topic numbering and word weights change between runs.
model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=SEED)
model = model.fit(data_vectorized)

## Visualize potential topics

👇 The function to print the words associated with the potential topics is already made for you. You just have to pass the correct arguments!

In [9]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model : object
        Must expose ``components_``, iterated row by row; each row holds one
        weight per vocabulary entry for a single topic.
    vectorizer : object
        Must expose ``get_feature_names_out()``; entry ``i`` of its result is
        used as the word for column ``i`` of ``model.components_``.
    n_top_words : int, default 10
        How many words to show per topic. Must be non-negative.

    Returns
    -------
    None
        For each topic, prints a ``Topic <idx>:`` header followed by a list of
        ``(word, weight)`` tuples ordered from strongest to weakest.

    Raises
    ------
    ValueError
        If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    # Fetch the vocabulary once instead of once per printed word.
    feature_names = vectorizer.get_feature_names_out()

    for idx, topic in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("Topic %d:" % (idx))
        # Cast to built-in str/float so the printed tuples do not depend on
        # how the installed NumPy version represents its scalar types.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])
        
# Show the ten strongest words of each topic, using the LDA model and the
# vectorizer fitted in the previous section.
print_topics(model, vectorizer)

Topic 0:
[('god', 29.648549795832732), ('jesus', 18.386177877342753), ('would', 16.874340914198072), ('people', 16.29487738070984), ('one', 15.105899804424338), ('church', 14.533699364868031), ('christians', 13.76563566152305), ('bible', 13.36165881855179), ('believe', 12.977136755241732), ('christian', 12.387040122113254)]
Topic 1:
[('gakwrscom', 0.7990913160079914), ('stueven', 0.6352463682798939), ('mailnews', 0.5938953547016033), ('returnpath', 0.5938953546964545), ('dohertyldcsglaacuk', 0.5938953546874411), ('xmailer', 0.5938953546861707), ('dohertylukacgladcs', 0.5938953546817509), ('dohertyl', 0.5938953546026603), ('moyacfnyuedu', 0.5524287282611464), ('contriversy', 0.5524287282462503)]
Topic 2:
[('game', 19.18368261473907), ('team', 19.13970786650248), ('hockey', 18.09006747322452), ('nhl', 13.271708660861679), ('play', 12.971973958682964), ('players', 12.807647179695472), ('go', 12.359677222840975), ('university', 12.080040233255213), ('nntppostinghost', 11.758749778002173), 

## Predict topic of new text

👇 You can now use your LDA model to predict the topic of a new text. First, use your vectorizer to vectorize the example. Then, use your LDA model to predict the topic of the vectorized example.

In [10]:
example = ["i'm going to play soccer on sunday"]

# Reuse the already-fitted vectorizer so the example is mapped onto the same
# vocabulary the model was trained on.
example_vectorized = vectorizer.transform(example)

# One row per input document; each row holds that document's topic weights.
lda_vectors = model.transform(example_vectorized)

# Loop over whatever number of topics the model has instead of hard-coding
# indices 0-2, so this cell keeps working if n_components is changed.
for topic_idx, weight in enumerate(lda_vectors[0]):
    print(f"topic {topic_idx} :", weight)

topic 0 : 0.11988865139826066
topic 1 : 0.11319302243744522
topic 2 : 0.7669183261642941
