In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords as stop_words
from textblob import Word
import pycountry
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import gensim
from gensim import corpora, models



import pandas as pd
import os
import re
import hdf5_getters as getters
import requests
from bs4 import BeautifulSoup
import numpy as np
from collections import OrderedDict

import json

In [None]:
# Fetch the NLTK resources requested by this notebook, in a fixed order.
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
def get_language_full_name(isocode):
    """Return the lower-cased language name pycountry reports for ``isocode``.

    ``isocode`` is passed to ``pycountry.languages.get`` as ``alpha_2``.
    """
    language = pycountry.languages.get(alpha_2=isocode)
    full_name = language.name
    return full_name.lower()

In [None]:
# Per-language cache of stop-word sets, keyed by language code; starts empty.
stop_words_languages = dict()

In [None]:
def tokenize(text):
    """Wrap ``text`` in a ``TextBlob`` and return it."""
    blob = TextBlob(text)
    return blob

In [None]:
def remove_stopwords(blob, language):
    """Return the ``(word, tag)`` pairs of ``blob.tags`` that are not stop words.

    Each word is lower-cased and stripped of apostrophes before it is compared
    with the stop-word set; the normalized form is what gets returned.

    The stop-word set for ``language`` is built on first use from the NLTK
    stop-word corpus (the language name comes from ``get_language_full_name``)
    and cached in ``stop_words_languages``. For ``'en'`` a few extra entries
    are added to that set.
    """
    try:
        blocked = stop_words_languages[language]
    except KeyError:
        blocked = set(stop_words.words(get_language_full_name(language)))
        if language == 'en':
            # Extra entries on top of the NLTK English list.
            blocked.update(['na', 'gon', 'la', 'nt', 'i', '', "'"])
        stop_words_languages[language] = blocked

    normalized = (
        (word.lower().replace("'", ''), tag)
        for word, tag in blob.tags
    )
    return [(text, tag) for text, tag in normalized if text not in blocked]

In [None]:
def lemmatize(tokens):
    """Lemmatize ``(token, tag)`` pairs and return the list of lemmas.

    Tokens whose tag starts with ``"V"`` are lemmatized as verbs (for
    instance, removing -ing or -ed); every other token uses the default
    ``Word.lemmatize()`` call.
    """
    return [
        Word(token).lemmatize("v") if tag[0] == "V" else Word(token).lemmatize()
        for token, tag in tokens
    ]

In [None]:
def get_final_tokens(lyrics, language='en'):
    """Turn raw lyric strings into lists of lemmatized, stop-word-free tokens.

    Each lyric goes through ``tokenize`` -> ``remove_stopwords`` -> ``lemmatize``.

    Args:
        lyrics: iterable of lyric strings.
        language: language code forwarded to ``remove_stopwords`` to select
            the stop-word list. Defaults to ``'en'``, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A list with one list of lemmas per input lyric, in input order.
    """
    return [
        lemmatize(remove_stopwords(tokenize(lyric), language))
        for lyric in lyrics
    ]

In [None]:
def get_lyrics_corpus(texts):
    """Build a gensim dictionary and bag-of-words corpus from token lists.

    Returns a ``(dictionary, corpus)`` pair where ``corpus`` holds the
    ``dictionary.doc2bow`` result for each entry of ``texts``, in order.
    ``texts`` is iterated twice, so it must be re-iterable.
    """
    vocabulary = corpora.Dictionary(texts)
    bags_of_words = list(map(vocabulary.doc2bow, texts))
    return vocabulary, bags_of_words

In [None]:
# Load the lyrics table and index it by track_id (no in-place mutation, so
# re-running this cell always starts from the file contents).
lyrics_df = pd.read_csv('data/data_lyrics.csv').set_index(['track_id'])

# Keep only the rows whose `lang` column is 'en'. The explicit .copy() makes
# the result an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
lyrics_df = lyrics_df[lyrics_df.lang == 'en'].copy()


In [None]:
# Store the processed token list of every lyric in a new `tokens` column.
lyrics_df['tokens'] = get_final_tokens(lyrics_df['lyrics'].values)

In [None]:
# Collect the per-track token lists into a plain Python list.
texts = [list_tokens for list_tokens in lyrics_df['tokens']]

In [None]:
# Build the gensim dictionary and bag-of-words corpus through the helper
# defined earlier in this notebook instead of repeating its body here.
dictionary, corpus = get_lyrics_corpus(texts)

In [None]:
# LDA training parameters, named so they can be tuned in one place.
NUM_TOPICS = 50
LDA_PASSES = 10
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across "Restart & Run All".
LDA_RANDOM_STATE = 42

ldamodel = gensim.models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)

In [None]:
# Show one topic, described by three words.
print(ldamodel.print_topics(num_words=3, num_topics=1))

In [None]:
# Directory that holds the persisted topic model.
path = r'data/topics'
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)
# Derive the file path from `path` so the directory is spelled only once.
ldamodel.save(os.path.join(path, 'topics.lda'))

In [None]:
# Reload the persisted LDA model from disk.
ldamodel = models.LdaModel.load('data/topics/topics.lda')

In [None]:
# Bag-of-words vector for a one-word query.
ques_vec = dictionary.doc2bow(['hello'])
# (topic_id, probability) pairs for the query.
topic = ldamodel[ques_vec]
# The pairs are not sorted by probability, so topic[0] is not necessarily the
# best match; pick the pair with the highest probability explicitly.
best_topic_id = max(topic, key=lambda pair: pair[1])[0]
ldamodel.print_topic(best_topic_id)
