# Parameters tuning for LDA

In [3]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet
import re

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/stefanoperenzoni/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
comments_category_filtered = pd.read_csv("../data/GB_comments_filtered.csv")

  and should_run_async(code)


In [3]:
# Bag of words
vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
LDA = LatentDirichletAllocation(n_components=3,random_state=1, learning_method="online")

dtm = vect.fit_transform(comments_category_filtered['comment_text'])

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #0 : 
['think', 'time', 'did', 'better', 'im', 'sounds', 'make', 'god', 'people', 'thank', 'shit', 'beautiful', 'new', 'great', 'video', 'really', 'good', 'music', 'song', 'like']
topic #1 : 
['tell', 'heart', 'oh', 'want', 'way', 'nyou', 'little', 'girl', 'time', 'nand', 'let', 'right', 'll', 'got', 'just', 'fuck', 'don', 'like', 'know', 'ni']
topic #2 : 
['album', 'don', 'perfect', 'day', 'views', 'hear', 'awesome', 'voice', 'wait', 'omg', 'fucking', 'man', 'guys', 'songs', 'video', 'best', 'amazing', 'just', 'song', 'love']


In [4]:
# tf-idf
vect = TfidfVectorizer(stop_words='english')
LDA = LatentDirichletAllocation(n_components=3,random_state=1, learning_method="online")

dtm = vect.fit_transform(comments_category_filtered['comment_text'])

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['watch', 'oh', 'new', 'really', 'nice', 'cool', 'way', 'views', 'want', 'god', 'just', 'sounds', 'better', 'music', 'voice', 'good', 'video', 'beautiful', 'song', 'like']
topic #2 : 
['rock', 'wtf', 'baby', 'taylor', 'fighting', 'yes', 'dance', 'reminds', 'ass', 'mv', 'tho', 'trending', 'waiting', 'sam', 'loved', 'tom', 'girl', 'perfect', 'wait', 'omg']
topic #3 : 
['time', 've', 'really', 'im', 'man', 'thank', 'wow', 'fucking', 'awesome', 'good', 'video', 'like', 'music', 'just', 'shit', 'great', 'best', 'amazing', 'song', 'love']


In [8]:
#Bag of words parameter tuning

'''
n_components=5,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1

Saw 'lyrics' for the first time.
'''

# Bag of words
vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
LDA = LatentDirichletAllocation(n_components=5,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

dtm = vect.fit_transform(comments_category_filtered['comment_text'])

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['com', 'youtube', 'hope', 'chester', 'makes', 'sound', 'watch', 'look', 'fucking', 'im', 'think', 'god', 'thank', 'time', 'people', 'like', 'way', 'shit', 'amazing', 'good']
topic #2 : 
['nigga', 'black', 'white', 'big', 'ass', 'crying', 'want', 'dance', 'rip', 'lot', 'long', 'wanna', 'baby', 'need', 'll', 'little', 'real', 'just', 'know', 'new']
topic #3 : 
['fan', 'yeah', 've', 'right', 'come', 'happy', 'life', 'nyou', 'know', 'wait', 'omg', 'nand', 'let', 'day', 'fuck', 'like', 'got', 'don', 'ni', 'just']
topic #4 : 
['beat', 'hear', 'lol', 'thing', 'want', 'views', 'world', 'album', 'say', 'sounds', 'make', 'good', 'really', 'best', 'voice', 'great', 'music', 'song', 'like', 'video']
topic #5 : 
['version', 've', 'days', 'proud', 'sam', 'times', 'guys', 'favorite', 'really', 'thanks', 'listening', 'stop', 'perfect', 'heart', 'girl', 'better', 'songs', 'beautiful', 'song', 'love']


In [10]:
#Bag of words parameter tuning

'''
n_components=4,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1
'''

# Bag of words
vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
LDA = LatentDirichletAllocation(n_components=3,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

dtm = vect.fit_transform(comments_category_filtered['comment_text'])

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['make', 'sounds', 'god', 'thank', 'album', 'just', 'songs', 'voice', 'shit', 'new', 'amazing', 'beautiful', 'great', 'really', 'video', 'good', 'music', 'like', 'love', 'song']
topic #2 : 
['heart', 'time', 'nyou', 'little', 'want', 'let', 'girl', 'nand', 'way', 'real', 'll', 'got', 'right', 'just', 'don', 'love', 'fuck', 'like', 'know', 'ni']
topic #3 : 
['hard', 'happy', 'like', 'listening', 'got', 'stop', 'perfect', 'views', 'video', 'hear', 'man', 'time', 'omg', 'fucking', 'don', 'day', 'better', 'people', 'best', 'just']


In [14]:
#Guided LDA
import guidedlda

vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
dtm = vect.fit_transform(comments_category_filtered['comment_text'])

vocab = vect.vocabulary_
word2id = dict((v, idx) for idx, v in enumerate(vocab))

#Create topics
'''
seed_topic_list = [
    ["lyrics", "text", "write", "read"],
    ["video", "look", "watch", "dance"],
    ["voice", "hear", "sing", "sound"]
]
'''

seed_topic_list = [
    ["lyrics", "text"],
    ["video", "look"],
    ["voice", "hear"]
]

guidedLDA = guidedlda.GuidedLDA(n_topics=3, n_iter=100, random_state=1, refresh=20)

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id

#model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)
guidedLDA.fit(dtm, seed_topics=seed_topics, seed_confidence=0.15)

for index,topic in enumerate(guidedLDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(guidedLDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
INFO:guidedlda:n_documents: 51787
INFO:guidedlda:vocab_size: 13596
INFO:guidedlda:n_words: 385947
INFO:guidedlda:n_topics: 3
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -3871594
INFO:guidedlda:<20> log likelihood: -3131628
INFO:guidedlda:<40> log likelihood: -3106314
INFO:guidedlda:<60> log likelihood: -3093068
INFO:guidedlda:<80> log likelihood: -3082529
INFO:guidedlda:<99> log likelihood: -3074038
topic #1 : 
['guys', 'shit', 'album', 'don', 'songs', 'voice', 'new', 'best', 'fuck', 'beautiful', 'really', 'great', 'amazing', 'good', 'just', 'music', 'video', 'like', 'love', 'song']
topic #2 : 
['voice', 'album', 'think', 'don', 'new', 'beautiful', 'know', 'really', 'amazing', 'time', 'best', 'great', 'people', 'good', 'video', 'music', 'just', 'like', 'love', 'song']
topic #3 : 
['oh', 'right', 'let', 'little', 'want', 've', 'time', 'got', 'way', 'll', 'nyou', 'good', 'nand', 'don', 'know', 'just', 'song', 'like', 'love', 'ni']


ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.

In [28]:
# Preprocessing in vectoriser
# POS, lemmatiser
def get_wordnet_pos(word):
    '''tags parts of speech to tokens
    Expects a string and outputs the string and 
    its part of speech'''
    
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def word_lemmatizer(text):
    '''lemamtizes the tokens based on their part of speech'''
    
    lemmatizer = WordNetLemmatizer()
    text = lemmatizer.lemmatize(text, get_wordnet_pos(text))
    return text


def reflection_tokenizer(text):
    '''expects a string an returns a list of lemmatized tokens 
        and removes the stop words. Tokens are lower cased and 
        non- alphanumeric characters as well as numbers removed. '''
    text=re.sub(r'[\W_]+', ' ', text) #keeps alphanumeric characters
    text=re.sub(r'\d+', '', text) #removes numbers
    text = text.lower()
    tokens = [word for word in word_tokenize(text)]
    tokens = [word for word in tokens if len(word) >= 3]
    #removes smaller than 3 character
    tokens = [word_lemmatizer(w) for w in tokens]
    #tokens = [s for s in tokens if s not in stop_words]
    return tokens

  and should_run_async(code)


In [33]:
comments_category_filtered['lemmatize_token'] = comments_category_filtered['comment_text'].apply(reflection_tokenizer)

  and should_run_async(code)


In [50]:
ly_count = 0
for ser in comments_category_filtered['lemmatize_token']:
    ly_count += ser.count('lyric')

print(ly_count)

723
  and should_run_async(code)


In [52]:
vect = CountVectorizer(tokenizer=reflection_tokenizer, stop_words='english')
dtm = vect.fit_transform(comments_category_filtered['comment_text'])

  and should_run_async(code)


In [35]:
LDA = LatentDirichletAllocation(n_components=3,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['guy', 'watch', 'day', 'sound', 'people', 'think', 'listen', 'best', 'time', 'beautiful', 'amaze', 'great', 'really', 'good', 'just', 'make', 'music', 'video', 'like', 'song']
topic #2 : 
['sam', 'thank', 'yes', 'rip', 'vibe', 'track', 'come', 'boy', 'tom', 'use', 'look', 'chester', 'heart', 'miss', 'omg', 'new', 'like', 'wait', 'album', 'love']
topic #3 : 
['beat', 'perfect', 'want', 'gon', 'come', 'nyou', 'right', 'look', 'nand', 'say', 'real', 'don', 'let', 'girl', 'just', 'voice', 'know', 'shit', 'like', 'fuck']


In [36]:
LDA = LatentDirichletAllocation(n_components=5,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['think', 'guy', 'day', 'people', 'listen', 'best', 'beautiful', 'amaze', 'great', 'really', 'fuck', 'time', 'good', 'make', 'music', 'video', 'like', 'just', 'love', 'song']
topic #2 : 
['rip', 'track', 'big', 'use', 'boy', 'album', 'look', 'real', 'feel', 'right', 'lyric', 'heart', 'omg', 'sound', 'come', 'thank', 'love', 'wait', 'new', 'like']
topic #3 : 
['dude', 'isn', 'bitch', 'sure', 'remember', 'pretty', 'stream', 'vibe', 'nigga', 'just', 'cool', 'talk', 'thanks', 'damn', 'like', 'dance', 'look', 'perfect', 'girl', 'shit']
topic #4 : 
['cute', 'drop', 'nbut', 'way', 'kill', 'tom', 'long', 'baby', 'wan', 'say', 'want', 'live', 'beat', 'nyou', 'little', 'let', 'gon', 'nand', 'voice', 'know']
topic #5 : 
['www', 'wrong', 'kid', 'legend', 'soul', 'fight', 'care', 'twice', 'tear', 'die', 'word', 'version', 'need', 'lose', 'http', 'com', 'youtube', 'watch', 'work', 'don']


In [53]:
vect = CountVectorizer(tokenizer=reflection_tokenizer, stop_words='english')
dtm = vect.fit_transform(comments_category_filtered['comment_text'])

vocab = vect.vocabulary_
word2id = dict((v, idx) for idx, v in enumerate(vocab))

#Create topics
'''
seed_topic_list = [
    ["lyrics", "text", "write", "read"],
    ["video", "look", "watch", "dance"],
    ["voice", "hear", "sing", "sound"]
]
'''

seed_topic_list = [
    ["lyric", "text"],
    ["video", "look"],
    ["voice", "hear"]
]

guidedLDA = guidedlda.GuidedLDA(n_topics=3, n_iter=100, random_state=1, refresh=20)

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id

#model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)
guidedLDA.fit(dtm, seed_topics=seed_topics, seed_confidence=0.15)

for index,topic in enumerate(guidedLDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

  and should_run_async(code)
INFO:guidedlda:n_documents: 51787
INFO:guidedlda:vocab_size: 24866
INFO:guidedlda:n_words: 371852
INFO:guidedlda:n_topics: 3
INFO:guidedlda:n_iter: 100
INFO:guidedlda:<0> log likelihood: -3718277
INFO:guidedlda:<20> log likelihood: -3017133
INFO:guidedlda:<40> log likelihood: -2993393
INFO:guidedlda:<60> log likelihood: -2981483
INFO:guidedlda:<80> log likelihood: -2971585
INFO:guidedlda:<99> log likelihood: -2965144
topic #1 : 
['want', 'look', 'come', 'think', 'music', 'nyou', 'say', 'nand', 'way', 'video', 'feel', 'don', 'make', 'time', 'good', 'know', 'just', 'like', 'song', 'love']
topic #2 : 
['new', 'girl', 'know', 'beautiful', 'great', 'amaze', 'best', 'really', 'shit', 'look', 'come', 'good', 'music', 'make', 'just', 'video', 'fuck', 'like', 'love', 'song']
topic #3 : 
['people', 'new', 'voice', 'listen', 'best', 'guy', 'beautiful', 'amaze', 'time', 'sound', 'really', 'great', 'good', 'make', 'just', 'music', 'video', 'like', 'love', 'song']


In [56]:
#Bigrams or Trigrams
vect_2 = CountVectorizer(tokenizer=reflection_tokenizer, stop_words='english', ngram_range=(1,2))
dtm_2 = vect_2.fit_transform(comments_category_filtered['comment_text'])

  and should_run_async(code)


In [57]:
LDA_2 = LatentDirichletAllocation(n_components=5,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

LDA_2.fit(dtm_2)

for index,topic in enumerate(LDA_2.components_):
    print(f'topic #{index+1} : ')
    print([vect_2.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis_2 = pyLDAvis.sklearn.prepare(LDA_2, dtm_2, vect_2)
pyLDAvis.display(vis_2)

  and should_run_async(code)


KeyboardInterrupt: 

In [None]:
# Only NOUN in LDA

In [15]:
# Preprocessing in vectoriser
# POS, lemmatiser
def get_wordnet_pos_noun(word):
    '''tags parts of speech to tokens
    Expects a string and outputs the string and 
    its part of speech'''
    
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.VERB)


def word_lemmatizer_noun(text):
    '''lemamtizes the tokens based on their part of speech'''
    
    lemmatizer = WordNetLemmatizer()
    text = lemmatizer.lemmatize(text, get_wordnet_pos_noun(text))
    return text


def reflection_tokenizer_noun(text):
    '''expects a string an returns a list of lemmatized tokens 
        and removes the stop words. Tokens are lower cased and 
        non- alphanumeric characters as well as numbers removed. '''
    text=re.sub(r'[\W_]+', ' ', text) #keeps alphanumeric characters
    text=re.sub(r'\d+', '', text) #removes numbers
    text = text.lower()
    tokens = [word for word in word_tokenize(text)]
    tokens = [word for word in tokens if len(word) >= 3]
    #removes smaller than 3 character
    to_rtn_tokens = []
    for w in tokens:
        if get_wordnet_pos_noun(w) == wordnet.NOUN:
            to_rtn_tokens.append(word_lemmatizer_noun(w))
    #tokens = [word_lemmatizer(w) for w in tokens]
    #tokens = [s for s in tokens if s not in stop_words]
    return to_rtn_tokens

  and should_run_async(code)


In [16]:
comments_category_filtered['lemmatize_noun'] = comments_category_filtered['comment_text'].apply(reflection_tokenizer_noun)

  and should_run_async(code)


In [17]:
comments_category_filtered['lemmatize_noun'].head()

  and should_run_async(code)


0                                         [didn, song]
1                                     [alright, stair]
2    [ground, cry, protest, worshipper, science, lo...
3                                  [hope, exist, year]
4                                 [song, exist, floor]
Name: lemmatize_noun, dtype: object

In [18]:
vect = CountVectorizer(tokenizer=reflection_tokenizer_noun, stop_words='english')
dtm = vect.fit_transform(comments_category_filtered['comment_text'])

  and should_run_async(code)


In [19]:
LDA = LatentDirichletAllocation(n_components=5,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['hand', 'rapper', 'drop', 'sorry', 'ain', 'legend', 'tho', 'play', 'cute', 'yes', 'rip', 'track', 'lot', 'boy', 'thought', 'wow', 'awesome', 'girl', 'album', 'shit']
topic #2 : 
['hey', 'help', 'niggerfaggot', 'look', 'blm', 'end', 'channel', 'nigga', 'comment', 'lyric', 'http', 'damn', 'music', 'com', 'youtube', 'watch', 'wait', 'fuck', 'guy', 'video']
topic #3 : 
['hope', 'year', 'world', 'chester', 'don', 'time', 'hear', 'heart', 'omg', 'fan', 'think', 'need', 'view', 'life', 'thank', 'video', 'day', 'people', 'love', 'song']
topic #4 : 
['need', 'cause', 'tell', 'lol', 'yeah', 'wan', 'didn', 'stop', 'baby', 'perfect', 'nyou', 'gon', 'nand', 'want', 'look', 'right', 'feel', 'time', 'way', 'don']
topic #5 : 
['version', 'dance', 'sam', 'feel', 'time', 'cool', 'tom', 'favorite', 'band', 'heard', 'thanks', 'thing', 'video', 'man', 'god', 'sound', 'love', 'voice', 'beautiful', 'music']


In [20]:
LDA = LatentDirichletAllocation(n_components=4,
max_iter=10, 
learning_method='online',
random_state=1,
batch_size=128,
evaluate_every = -1,
n_jobs = -1)

LDA.fit(dtm)

for index,topic in enumerate(LDA.components_):
    print(f'topic #{index+1} : ')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(LDA, dtm, vect)
pyLDAvis.display(vis)

  and should_run_async(code)
topic #1 : 
['cute', 'version', 'niggerfaggot', 'blm', 'proud', 'yes', 'cause', 'track', 'cool', 'lot', 'yeah', 'band', 'heard', 'boy', 'love', 'nyou', 'girl', 'album', 'fuck', 'shit']
topic #2 : 
['singer', 'comment', 'beat', 'word', 'sam', 'nigga', 'end', 'tom', 'thanks', 'http', 'damn', 'com', 'youtube', 'wow', 'watch', 'wait', 'look', 'voice', 'guy', 'video']
topic #3 : 
['hope', 'chester', 'heart', 'listen', 'omg', 'fan', 'life', 'don', 'view', 'time', 'year', 'thank', 'day', 'people', 'sound', 'beautiful', 'video', 'music', 'love', 'song']
topic #4 : 
['wan', 'think', 'didn', 'stop', 'baby', 'perfect', 'lyric', 'hear', 'awesome', 'lol', 'gon', 'nand', 'thing', 'god', 'right', 'want', 'don', 'way', 'time', 'feel']
