# Topic Modeling Lab

### To-do list:
- Use scikit-learn because
    - It has both LDA and NMF
    - It is the same API they'll use for many stats and ML tasks
    - There's a good example guide here: https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    
- Steps 
    - preprocessing (use 'clean' data from lab notebook 1)
    - fitting LDA
    - viewing, interpreting topics
    - cross tabs of topics and personal attributes of interest
    - fitting NMF
    - viewing, interpreting results
    - comparing LDA & NMF topics (deal with alignment)
    - cross tabs
    - compare LDA & NMF cross tabs (to undermine the notion of a single, objective truth)
    
- Functionality
    - Display topics found
    - Undo stems

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from bs4 import BeautifulSoup
import re

In [None]:
# Load the cleaned profiles from a tab-separated file.
profiles = pd.read_csv('data/clean_profiles.tsv', sep='\t')
# Work on a random subset of at most 5000 rows (presumably to keep the model
# fitting fast). DataFrame.sample draws without replacement by default and
# raises ValueError when asked for more rows than exist, so cap the request
# at the number of rows actually loaded.
profiles = profiles.sample(min(5000, len(profiles)))
profiles.head(2)

In [None]:
def clean(text):
    """Return ``text`` with HTML markup removed, lowercased, and junk stripped.

    The markup is parsed with BeautifulSoup's 'lxml' parser and only the
    visible text is kept. After lowercasing, every occurrence of the
    substrings 'http', 'www' and '\\nnan' is deleted, in that order.
    """
    lowered = BeautifulSoup(text, 'lxml').get_text().lower()

    # Plain substring removal: only the listed fragments disappear, the
    # remainder of any URL they were part of stays in the text.
    for junk in ('http', 'www', '\nnan'):
        lowered = lowered.replace(junk, '')

    return lowered

# Store the cleaned version of every profile's text in a new 'clean' column.
profiles['clean'] = profiles['text'].apply(clean)
profiles.head(2)

In [None]:
# Number of topics to extract (the NMF model later in the notebook reuses it).
ntopics = 5
# The cleaned profile texts, one string per row of `profiles`.
documents = profiles.clean.values

print("Vectorizing text by word counts...")
# Raw term counts. English stop words are dropped, as are terms that occur in
# more than 95% of the documents or in fewer than 2 documents; at most 1000
# terms are kept.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=1000,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    tf_feature_names = tf_vectorizer.get_feature_names_out()
except AttributeError:
    tf_feature_names = tf_vectorizer.get_feature_names()

print('Performing LDA on vectors...')
# Online variational Bayes, 10 passes at most, using every available core.
lda = LatentDirichletAllocation(n_components=ntopics,
                                max_iter=10,
                                learning_method='online',
                                n_jobs=-1).fit(tf)

print('Done!')

In [None]:
# Number of top-weighted terms to print for each topic.
nshow = 10

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            per-term weights for each topic.
        feature_names: Sequence mapping a term index to its string.
        no_top_words: How many terms to print per topic.

    One line is printed per topic, listing its terms from the highest weight
    downwards.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it to rank the heaviest first.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[i] for i in ranked[:no_top_words]]
        print("Topic", topic_idx, ":  ", " ".join(top_terms))
                        
# Print the nshow highest-weighted terms of each LDA topic.
display_topics(lda, tf_feature_names, no_top_words=nshow)

In [None]:
print("Vectorizing text by TF-IDF...")
# TF-IDF weights with the same vocabulary filters as the count vectorizer:
# English stop words removed, terms in more than 95% of the documents or in
# fewer than 2 documents dropped, at most 1000 terms kept.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=1000,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

print('Performing NMF on vectors...')
# The single `alpha` argument was removed in scikit-learn 1.2 in favour of
# alpha_W / alpha_H. The newer penalties are multiplied by n_features (for W)
# and n_samples (for H), so divide the old value by those factors to keep the
# regularization strength that alpha=.1 used to give.
n_samples, n_features = tfidf.shape
try:
    nmf = NMF(n_components=ntopics,
              alpha_W=.1 / n_features, alpha_H=.1 / n_samples,
              l1_ratio=.5, init='nndsvd')
except TypeError:
    # Releases before scikit-learn 1.0 only accept the unscaled `alpha`.
    nmf = NMF(n_components=ntopics, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf = nmf.fit(tfidf)

print('Done!')

In [None]:
# Print the nshow highest-weighted terms of each NMF topic.
display_topics(nmf, tfidf_feature_names, no_top_words=nshow)