In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords as stop_words
from textblob import Word
import pycountry
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from gensim import corpora, models

import pandas as pd
import os
import re
import hdf5_getters as getters
import requests
from bs4 import BeautifulSoup
import numpy as np
from collections import OrderedDict

import json

In [None]:
# Fetch the NLTK resources this notebook asks for, in a fixed order.
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
def get_language_full_name(isocode):
    """Return the lower-cased language name pycountry reports for `isocode`.

    `isocode` is passed to pycountry as the `alpha_2` lookup key.
    """
    language = pycountry.languages.get(alpha_2=isocode)
    full_name = language.name
    return full_name.lower()

In [None]:
# Cache of stop-word sets keyed by language code; filled lazily by remove_stopwords.
stop_words_languages = dict()

In [None]:
def tokenize(text):
    """Wrap `text` in a TextBlob and return it."""
    blob = TextBlob(text)
    return blob

In [None]:
def remove_stopwords(blob, language):
    """Return the `(word, tag)` pairs of `blob` whose word is not a stop word.

    Each word from `blob.tags` is lower-cased and stripped of apostrophes
    before being compared with the stop-word set of `language`. The
    normalised word, not the original one, is what ends up in the result.
    The stop-word set is loaded on first use and cached in
    `stop_words_languages` under the `language` key.
    """
    try:
        stopword_set = stop_words_languages[language]
    except KeyError:
        stopword_set = set(stop_words.words(get_language_full_name(language)))
        stop_words_languages[language] = stopword_set

    normalized = ((word.lower().replace("'", ''), tag) for word, tag in blob.tags)
    return [(token, tag) for token, tag in normalized if token not in stopword_set]

In [None]:
def lemmatize(tokens):
    """Lemmatize every `(token, tag)` pair and return the list of lemmas.

    Tokens whose tag starts with "V" are lemmatized as verbs (for instance,
    dropping -ing or -ed); all others use the lemmatizer's default.
    """
    def to_lemma(token, tag):
        # Only verbs get an explicit part-of-speech argument.
        pos_args = ("v",) if tag[0] == "V" else ()
        return Word(token).lemmatize(*pos_args)

    return [to_lemma(token, tag) for token, tag in tokens]

In [None]:
def get_final_tokens(lyrics):
    """Run tokenize -> remove_stopwords ('en') -> lemmatize on each lyric.

    Returns one list of lemmas per element of `lyrics`, in the same order.
    """
    def process(lyric):
        blob = tokenize(lyric)
        kept = remove_stopwords(blob, 'en')
        return lemmatize(kept)

    return [process(lyric) for lyric in lyrics]

In [None]:
def get_word_freq(texts):
    """Count how often each token occurs across all token lists in `texts`.

    Returns a dict mapping token -> number of occurrences.
    """
    counts = {}
    for document in texts:
        for term in document:
            counts[term] = counts.get(term, 0) + 1
    return counts

In [None]:
def get_lyrics_corpus(texts):
    """Build a gensim dictionary from `texts` and the matching bag-of-words corpus.

    Returns `(dictionary, corpus)`, where `corpus` holds one
    `dictionary.doc2bow(text)` result per entry of `texts`.
    """
    dictionary = corpora.Dictionary(texts)
    corpus = list(map(dictionary.doc2bow, texts))
    return dictionary, corpus

In [None]:
def get_sentiment(lyrics):
    """Return TextBlob's sentiment polarity score for the text `lyrics`."""
    return TextBlob(lyrics).sentiment.polarity

In [None]:
# Load the lyrics table and index it by track id.
lyrics_df = pd.read_csv('data/data_lyrics.csv')
lyrics_df.set_index(['track_id'], inplace=True)

# Keep English lyrics only. The explicit copy makes the filtered frame
# independent of the one read from disk, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
lyrics_df = lyrics_df[lyrics_df.lang == 'en'].copy()

# genres_list    : every genre, in order of first appearance.
# genres_indices : genre -> list of track ids tagged with that genre.
# The 'genres' column holds '&'-separated genre names.
genres_list = []
genres_indices = {}
for index, genres in lyrics_df['genres'].items():
    for genre in genres.split('&'):
        if genre not in genres_indices:
            # First time this genre is seen: the dict lookup replaces a
            # linear scan of genres_list.
            genres_indices[genre] = []
            genres_list.append(genre)
        genres_indices[genre].append(index)

In [None]:
# Store the cleaned, lemmatized token list of every lyric next to its row.
lyrics_df['tokens'] = get_final_tokens(lyrics_df['lyrics'].values)

In [None]:
# Score every lyric with get_sentiment and attach the scores as a new column.
all_lyrics = lyrics_df['lyrics'].values
sentiments = [get_sentiment(text) for text in all_lyrics]
lyrics_df['sentiment'] = sentiments

In [None]:
# Per-genre export. For each genre this writes, under data/final_data/<genre>/:
#   sentiments.csv : one "year,average sentiment" row per year
#   <word>.csv     : one "year,relative frequency" row per year in which the
#                    word occurs ('/' in the word is replaced by '-')
# Rows are collected in memory and every file is written once in 'w' mode, so
# re-running this cell rewrites the files instead of appending duplicate rows.
genres_words = {}
for genre, indices in genres_indices.items():
    curr_df = lyrics_df.loc[indices]

    # Vocabulary of the genre over all its songs (including those with year == 0).
    genres_words[genre] = list(get_word_freq(curr_df.tokens.values).keys())

    directory = 'data/final_data/' + genre
    os.makedirs(directory, exist_ok=True)

    # file name -> rows, in the same order the rows used to be appended.
    rows_by_file = {}
    # Songs with year == 0 are left out of the yearly statistics.
    for year in curr_df[curr_df.year != 0].year.sort_values().unique():
        curr_year_df = curr_df[curr_df.year == year]
        freqs = get_word_freq(curr_year_df.tokens.values)
        word_count = sum(freqs.values())

        year_sentiments = curr_year_df.sentiment.values
        sentiment_avg = sum(year_sentiments) / float(len(year_sentiments))
        rows_by_file.setdefault('sentiments.csv', []).append(
            str(year) + ',' + str(sentiment_avg))

        for word, freq in freqs.items():
            file_name = word.replace('/', '-') + '.csv'
            rows_by_file.setdefault(file_name, []).append(
                str(year) + ',' + str(freq / float(word_count)))

    for file_name, rows in rows_by_file.items():
        with open(directory + '/' + file_name, 'w') as output_file:
            output_file.write('\n'.join(rows) + '\n')

In [None]:
# Persist the genre -> vocabulary mapping as JSON.
with open('data/final_data/genres_words.json', 'w') as genres_words_file:
    json.dump(genres_words, genres_words_file)

In [None]:
# Persist the list of genres as JSON.
with open('data/final_data/genres_list.json', 'w') as genres_list_file:
    json.dump(genres_list, genres_list_file)