In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords as stop_words
from textblob import Word
import pycountry
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import gensim
from gensim import corpora, models



import pandas as pd
import os
import re
import hdf5_getters as getters
import requests
from bs4 import BeautifulSoup
import numpy as np
from collections import OrderedDict

import json

In [2]:
# Fetch the NLTK resources this notebook asks for; already-present packages
# are reported as up-to-date by NLTK.
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Left as the final expression so the cell still displays the call's result.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/arnaud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/arnaud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/arnaud/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/arnaud/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def get_language_full_name(isocode):
    """Return the lowercased language name for a two-letter language code.

    The code is looked up in ``pycountry.languages`` through its ``alpha_2``
    field and the ``name`` of the matching entry is lowercased.
    """
    language = pycountry.languages.get(alpha_2=isocode)
    return language.name.lower()

In [4]:
# Cache of stop-word sets keyed by language code; populated by remove_stopwords.
stop_words_languages = {}

# Collect the first column of the first-name table into a plain list.
names_pd = pd.read_csv('data/firstname.csv')
names = [row[0] for row in names_pd.values]

In [5]:
def tokenize(text):
    """Wrap ``text`` in a ``TextBlob`` so later steps can read its ``tags``."""
    blob = TextBlob(text)
    return blob

In [6]:
def remove_stopwords(blob, language):
    """Filter stop words out of a tagged blob.

    Each word is normalised (lowercased, apostrophes removed) and kept only if
    the normalised form is not in the stop-word set for ``language``. The set
    is built on first use and cached in ``stop_words_languages``.

    Parameters
    ----------
    blob : object exposing ``tags`` as ``(word, tag)`` pairs (a ``TextBlob``
        in this notebook).
    language : str
        Two-letter language code such as ``'en'``.

    Returns
    -------
    list of (str, str)
        ``(normalised_word, tag)`` pairs for the words that were kept.
    """
    def normalize(word):
        # Same normalisation for tokens and for the extra stop entries, so
        # that membership tests compare like with like.
        return word.lower().replace("'", '')

    if language not in stop_words_languages:
        stop_set = set(stop_words.words(get_language_full_name(language)))
        if language == 'en':
            stop_set |= {'na', 'gon', 'la', 'nt', 'i', '', "'"}
            # First names are normalised like the tokens; left verbatim
            # (e.g. capitalised) they would never match a lowercased token.
            stop_set |= {normalize(name) for name in names if isinstance(name, str)}
        # Store the set only once it is complete.
        stop_words_languages[language] = stop_set

    stop_set = stop_words_languages[language]
    tokens = []
    for word, tag in blob.tags:
        lower = normalize(word)
        if lower not in stop_set:
            tokens.append((lower, tag))
    return tokens

In [7]:
def lemmatize(tokens):
    """Lemmatize ``(token, tag)`` pairs and return them as one string.

    Tokens whose tag starts with "V" are lemmatized as verbs (for instance,
    stripping -ing or -ed); all others use the default lemmatization. Every
    non-empty lemma is appended preceded by a single space, so a non-empty
    result starts with a space.
    """
    pieces = []
    for token, tag in tokens:
        word = Word(token)
        lemma = word.lemmatize("v") if tag[0] == "V" else word.lemmatize()
        if lemma:
            pieces.append(' ' + lemma)
    return ''.join(pieces)

In [8]:
def get_final_tokens(lyrics):
    """Run the preprocessing pipeline on every lyric.

    Each lyric is tokenized, stripped of English ('en') stop words and
    lemmatized; the results are returned in input order.
    """
    return [
        lemmatize(remove_stopwords(tokenize(lyric), 'en'))
        for lyric in lyrics
    ]

In [9]:
def get_lyrics_corpus(texts):
    """Build a gensim dictionary from ``texts`` and the matching corpus.

    Returns a ``(dictionary, corpus)`` tuple where ``corpus`` holds the
    ``doc2bow`` result for each entry of ``texts``, in order.
    """
    dictionary = corpora.Dictionary(texts)
    corpus = list(map(dictionary.doc2bow, texts))
    return dictionary, corpus

In [10]:
# Load the lyrics table, index it by track id and keep only the rows whose
# `lang` is 'en'. The explicit copy makes lyrics_df an independent frame, so
# adding columns to it later does not raise SettingWithCopyWarning.
lyrics_df = (
    pd.read_csv('data/data_lyrics.csv')
    .set_index(['track_id'])
    .loc[lambda frame: frame.lang == 'en']
    .copy()
)


In [11]:
# Preprocess every lyric and store the result next to the raw text.
raw_lyrics = lyrics_df.lyrics.values
lyrics_df['tokens'] = get_final_tokens(raw_lyrics)

In [13]:
# Directory that receives one text file per track for MALLET.
path = r'mallet/lyrics'
# exist_ok avoids the race between checking for the directory and creating it.
os.makedirs(path, exist_ok=True)

In [14]:
# Write the preprocessed text of each track to <path>/<track index>.txt.
# Only the `tokens` column is needed, so iterate over that column alone.
for index, tokens in lyrics_df['tokens'].items():
    # Explicit UTF-8 so non-ASCII lyrics are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(os.path.join(path, f'{index}.txt'), "w", encoding='utf-8') as text_file:
        text_file.write(tokens)

Run the following command to import the exported lyric files into MALLET:


bin/mallet import-dir --input mallet/* --output lyrics.mallet --remove-stopwords --keep-sequence

Then run this command to train a 40-topic model on the imported data:


bin/mallet train-topics --input lyrics.mallet --num-topics 40 --num-iterations 1000 --optimize-interval 10 --output-topic-keys topics_composition.txt --output-doc-topics songs_composition.txt

In [None]:
# The same preprocessing is already run earlier in the notebook; only redo
# this expensive step when the column is not there yet.
if 'tokens' not in lyrics_df.columns:
    lyrics_df['tokens'] = get_final_tokens(lyrics_df.lyrics.values)

In [None]:
# Each `tokens` entry is a single space-separated string, while gensim needs
# every document as a list of tokens, so split each one on whitespace.
texts = [document.split() for document in lyrics_df['tokens']]

In [None]:
# Build the gensim dictionary and bag-of-words corpus with the helper above.
dictionary, corpus = get_lyrics_corpus(texts)

In [None]:
# Train a 2-topic LDA model on the bag-of-words corpus. The fixed seed makes
# retraining on a fresh kernel reproduce the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=1000,
    random_state=42,
)

In [None]:
# Show each topic with its 7 highest-weighted words.
topics_preview = ldamodel.print_topics(num_words=7)
print(topics_preview)

In [None]:
# Persist the trained model under data/topics.
path = r'data/topics'
# exist_ok avoids the race between checking for the directory and creating it.
os.makedirs(path, exist_ok=True)
ldamodel.save('data/topics/topics.lda')

In [None]:
# Reload the model from the file written by the save cell.
saved_model_file = 'data/topics/topics.lda'
ldamodel = gensim.models.LdaModel.load(saved_model_file)

In [None]:
# Infer the topic mixture of a one-word query document.
ques_vec = dictionary.doc2bow(['hello'])
topic = ldamodel[ques_vec]
# `topic` is a list of (topic_id, probability) pairs; its first entry is not
# necessarily the most likely one, so pick the pair with the highest probability.
best_topic_id = max(topic, key=lambda pair: pair[1])[0]
ldamodel.print_topic(best_topic_id)
