In [None]:
# This files contains the functions for training, applying and evaluating the Naïve Bayes algorithm.
# The function "apply_multinomial_NB_lyrics" is copied in apply_nb.py. The values of the P_t_c and P_c dictionaries
# were taken from this notebook.

In [4]:
import csv

def genre_to_information_from_csv(file):
    '''
    Input: A csv file.
    Output: A dictionary genre_to_all_song_lyrics that links a genre to all its songtext. A dictionary genre_to_amount_of_songs
    that links a genre to the amount of songs in the corresponding genre. A dictionary genre_to_pairs which links a genre to
    all the (artist, song_title) pairs of that genre. A dictionary pair_to_lyrics that links a (artist, song_title) pair to
    the corresponding lyrics.
    '''
    genre_to_all_song_lyrics = {}
    genre_to_amount_of_songs = {}
    genre_to_pairs = {}
    pair_to_lyrics = {}
    with open(file, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        for i,row in enumerate(reader):
            # Skip row one and two (table headers)
            if i is 0:
                continue
            if i is 1:
                continue
            if i == 5000:
                break
            song_title = row[0]
            artist = row[2]
            genre = row[3]
            # Since the lyrics in the csv have already been normalized, we don't need to remove stopwords or lowercase
            lyrics = row[5]
            pair_to_lyrics[(artist, song_title)] = lyrics
            lyrics_new = lyrics.split()
            if genre in genre_to_pairs:
                genre_to_pairs[genre].append((artist, song_title))
            else:
                genre_to_pairs[genre] = [(artist, song_title)]
            if genre in genre_to_all_song_lyrics:
                genre_to_all_song_lyrics[genre].extend(lyrics_new)
            else:
                genre_to_all_song_lyrics[genre] = lyrics_new
            if genre in genre_to_amount_of_songs:
                genre_to_amount_of_songs[genre] += 1
            else:
                genre_to_amount_of_songs[genre] = 1
    return genre_to_all_song_lyrics, genre_to_amount_of_songs, genre_to_pairs, pair_to_lyrics

         
           
genre_to_all_song_lyrics, genre_to_amount_of_songs, genre_to_pairs, pair_to_lyrics = genre_to_information_from_csv('output.csv')

# Get all the genres in the training set
genres = []
for genre in genre_to_all_song_lyrics:
    genres.append(genre)
training_set_length = 0
for genre in genres:
    training_set_length += genre_to_amount_of_songs[genre]
    print(genre, ':', genre_to_amount_of_songs[genre])

Pop : 847
HipHop : 353
Rock : 2120
Metal : 535
Other : 270
Country : 172
Jazz : 316
Electronic : 204
Folk : 103
RB : 8
Indie : 70


In [5]:
# Get the vocabulary of the training set
def get_vocabulary(genre_to_all_song_lyrics):
    v = []
    for genre, lyrics in genre_to_all_song_lyrics.items():
        v.extend(lyrics)
    v = set(v)
    return v

In [6]:
# Train the Naive Base algorithm on the training set
def train_multinomial_NB(genre_to_all_song_lyrics, training_set_length, genre_to_amount_of_songs):
    P_t_c_dict = {}
    P_c_dict = {}
    v = get_vocabulary(genre_to_all_song_lyrics)
    size_vocab = len(v)
    # N is the amount of songs in the training set
    N = training_set_length
    for genre in genre_to_all_song_lyrics:
        # N_c is the amount of songs in a genre
        N_c = genre_to_amount_of_songs[genre]
        P_c = N_c/N
        # If by chance there aren't any documents of a genre and the P_c is 0, we assign a very low 
        # value to P_c, so there are no math errors when we take the log10 of P_c later on.
        if N_c is 0:
            P_c = 0.0000001
            print(genre, 'does not occur in documents')
        P_c_dict[genre] = P_c
        # text_c is all the text of all the songs in one genre
        text_c = genre_to_all_song_lyrics[genre]
        for term in v:
            # Get the occurences of a term in all the text of a class
            T_ct = text_c.count(term)
            P_t_c = (T_ct+1)/(len(text_c)+size_vocab)
            P_t_c_dict[(term, genre)] = P_t_c
    return P_t_c_dict, P_c_dict, v

P_t_c_dict, P_c_dict, vocabulary = train_multinomial_NB(genre_to_all_song_lyrics, training_set_length, genre_to_amount_of_songs)

print("Finished training on the data")

Finished training on the data


In [7]:
with open('out_full_voc.txt', 'w', encoding="utf-8") as f:
    print(P_t_c_dict, P_c_dict, file=f) 

In [8]:
from math import log10
import operator

# Tests the Naive Base algorithm on the testing set
def apply_multinomial_NB(P_t_c_dict, P_c_dict, pair_to_lyrics, artist, song, genres):
    class_score_dict = {}
    # Get song lyrics of song
    lyrics = pair_to_lyrics[(artist, song)]
    # Calculate the probability of the doc occuring in each one of the ministerie
    for genre in genres:
        score = log10(P_c_dict[genre])
        for term in lyrics:
            if (term, genre) in P_t_c_dict:
                P_t_c = P_t_c_dict[(term, genre)]
                score += log10(P_t_c)
        class_score_dict[genre] = score
    high_score_class = max(class_score_dict.items(), key=operator.itemgetter(1))[0]
    high_score_score = max(class_score_dict.items(), key=operator.itemgetter(1))[1]
    return class_score_dict, high_score_class, high_score_score

In [9]:
from math import log2
    
# Calculates the MI value of a term, minsterie pair    
def mi_utility(genre_to_pairs, pair_to_lyrics, term, genre):
    # Keeps track of the amount of times a term occurs in a doc that belongs to the given genre
    N11 = 0
    # Keeps track of the amount of times a term does not occur in a doc that belongs to the given genre
    N01 = 0
    # Keeps track of the amount of times a term occurs in a doc that does not belong to the given genre
    N10 = 0
    # Keeps track of the amount of times a term does not occur in a doc that does not belong to the given genre
    N00 = 0
    pairs = genre_to_pairs[genre]
    for pair, lyrics in pair_to_lyrics.items():
        if term in lyrics:
            # Test if the doc occurs in the right genre
            if pair in pairs:
                N11 += 1
            else:
                N10 += 1
        else:
            if pair in pairs:
                N01 += 1
            else:
                N00 += 1
                
    # Make sure we never divide by 0 and get a math error, even if we have small sets by assigning a value of 1
    # if there are no occurences.
    if N01 is 0:
        N01 = 1
    if N10 is 0:
        N10 = 1
    if N11 is 0:
        N11 = 1
    if N00 is 0:
        N00 = 1            
        
    N = N11 + N01 + N10+ N00
    N1_ = N10 + N11
    N0_ = N01 + N00
    N_0 = N10 + N00
    N_1 = N01 + N11
   
    utility = ( (N11/N) *log2((N*N11)/(N1_*N_1)) ) + ((N01/N) * log2((N*N01)/(N0_*N_1))) + ((N10/N) * log2((N*N10)/(N1_*N_0)) ) + ( (N00/N) * log2((N*N00)/(N0_*N_0)) ) 
    return utility

import operator

# Prints the top 10 words of each class with the highest mutual information score
def print_top_10_of_each_class(genre_to_pairs, pair_to_lyrics, vocabulary, genres):
    print("Top 10 words")
    for genre in genres:
        term_mi_score_dict = {}
        for term in vocabulary:
            mi = mi_utility(genre_to_pairs, pair_to_lyrics, term, genre)
            term_mi_score_dict[term] = mi
            #print(term, ":", mi)
        print()
        print("Genre:", genre)
        print("-----------------------------------------------------")
        for i in range(1,11):
            # Get the key and score of the highest score value in the term_mi_score_dict
            max_key = max(term_mi_score_dict.items(), key=operator.itemgetter(1))[0]
            max_score = max(term_mi_score_dict.items(), key=operator.itemgetter(1))[1]
            # Remove the highest score from term_mi_score_dict
            del term_mi_score_dict[max_key]
            print(i, ':', max_key, '=', max_score)
            
print_top_10_of_each_class(genre_to_pairs, pair_to_lyrics, vocabulary, genres)    

Top 10 words

Genre: Pop
-----------------------------------------------------
1 : beyonc = 0.02462988498266326
2 : yonce = 0.018917672082933074
3 : beyonce = 0.017369575589855683
4 : ã© = 0.009125062170106994
5 : baby = 0.008735502570473851
6 : sabe = 0.007807499136484656
7 : dead = 0.007725700028504642
8 : ï = 0.007695313106103039
9 : îµï = 0.007695313106103039
10 : î¼ = 0.007695313106103039

Genre: HipHop
-----------------------------------------------------
1 : nigga = 0.08082699910810585
2 : shit = 0.06853962660001647
3 : bitch = 0.06432685282279366
4 : hit = 0.061367467251866425
5 : bit = 0.05787348357481208
6 : eazy = 0.05330702832003523
7 : itch = 0.05174015334489744
8 : fuck = 0.04908111214980347
9 : get = 0.04543579557725375
10 : bi = 0.04301657580068088

Genre: Rock
-----------------------------------------------------
1 : nigga = 0.018780645228035506
2 : ä± = 0.017484905868811813
3 : ya = 0.014254217417838354
4 : verse = 0.01317194280908594
5 : bitch = 0.011357700294139012

KeyboardInterrupt: 

In [10]:
from math import log10
import operator


# Tests the Naive Base algorithm on the testing set by giving it lyrics
# This function is used in guess_genre.php
def apply_multinomial_NB_lyrics(P_t_c_dict, P_c_dict, lyrics, genres):
    # First the input is cleaned
    punctuation = re.compile('[{}]+'.format(re.escape(p)))
    stop = set(stopwords.words('english'))
    lyrics = punctuation.sub('', lyrics)
    lyrics = lyrics.split()
    lyrics_new = []
    for w in lyrics:
        if len(w) > 2:
            w = w.lower()
            if w not in stop:
                lyrics_new.append(w)
    class_score_dict = {}
    lyrics_new = lyrics.split()
    # Calculate the probability of the doc occuring in each one of the genres
    for genre in genres:
        score = log10(P_c_dict[genre])
        for term in lyrics_new:
            if (term, genre) in P_t_c_dict:
                P_t_c = P_t_c_dict[(term, genre)]
                score += log10(P_t_c)
        class_score_dict[genre] = score
    high_score_class = max(class_score_dict.items(), key=operator.itemgetter(1))[0]
    high_score_score = max(class_score_dict.items(), key=operator.itemgetter(1))[1]
    return class_score_dict, high_score_class, high_score_score



In [11]:
import csv

def testset_info_from_csv(file, P_t_c_dict, P_c_dict, genres):
    pair_to_genre_testset = {}
    genre_to_pairs_testset = {}
    genre_to_pairs_answer = {}
    with open(file, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        for i,row in enumerate(reader):
            # Skip data that was used in the training set
            if i < 5000:
                continue
            if i == 6000:
                break
            song = row[0]
            artist = row[2]
            genre = row[3]
            # Since the lyrics in the csv have already been normalized, we don't need to remove stopwords or lowercase
            lyrics = row[5]
            class_score_dict, test_genre, high_score_score = apply_multinomial_NB_lyrics(P_t_c_dict, P_c_dict, lyrics, genres)
            pair_to_genre_testset[(artist, song)] = test_genre
            if test_genre in genre_to_pairs_testset:
                genre_to_pairs_testset[test_genre].append((artist, song))
            else:
                genre_to_pairs_testset[test_genre] = [(artist, song)]
            if genre in genre_to_pairs_answer:
                genre_to_pairs_answer[genre].append((artist, song))
            else: 
                genre_to_pairs_answer[genre] = [(artist, song)]
    return genre_to_pairs_testset, pair_to_genre_testset, genre_to_pairs_answer

genre_to_pairs_testset, pair_to_genre_testset, genre_to_pairs_answer = testset_info_from_csv('output.csv', P_t_c_dict, P_c_dict, genres)

In [12]:
def get_TP_FN_FP_of_class(pair_to_genre_testset, genre_to_pairs_answer, genre, genre_to_pairs_testset):
    total_TP = 0
    total_FN = 0
    total_FP = 0
    if genre in genre_to_pairs_answer:
        pairs_in_class = genre_to_pairs_answer[genre]
        for pair in pairs_in_class:
            test_genre = pair_to_genre_testset[pair]
            if test_genre is genre:
                total_TP += 1
            else:
                # total_FN is incremented when the doc is actually in a class but the predication is that it's not
                total_FN += 1
    # We collect all docs that belong to the class as marked in the testset
    if genre in genre_to_pairs_testset:
        pairs_in_class_test = genre_to_pairs_testset[genre]
        for pair in pairs_in_class_test:
            # If the doc is not actually a part of the given class we increment total_FP
            if pair not in pairs_in_class_test:
                total_FP += 1
    # To avoid division by zero, we make the values 1 if there are no occurences
    if total_TP is 0:
        total_TP = 1
    if total_FN is 0:
        total_FN = 1
    if total_FP is 0:
        total_FP = 1
    return total_TP, total_FN, total_FP

def calculate_P_R_F1(TP,FN,FP):
    P = TP/(TP+FP)
    R = TP/(TP+FN)
    F1 = (2*P*R)/(P+R)
    return P,R,F1

def microaverage(pair_to_genre_testset, genre_to_pairs, genre_to_pairs_testset, genres):
    agg_TP = 0
    agg_FN = 0
    agg_FP = 0
    for genre in genres:
        TP, FN, FP = get_TP_FN_FP_of_class(pair_to_genre_testset, genre_to_pairs_answer, genre, genre_to_pairs_testset)
        agg_TP += TP
        agg_FN += FN
        agg_FP += FP
    P, R, F1 = calculate_P_R_F1(agg_TP,agg_FN,agg_FP)
    return F1
                                           
                                    
                                           
                                        

In [13]:
def print_P_R_F1_per_class(genres, pair_to_genre_testset, genre_to_pairs_answer, genre_to_pairs_testset):
    for genre in genres:
        TP, FN, FP = get_TP_FN_FP_of_class(pair_to_genre_testset, genre_to_pairs_answer, genre, genre_to_pairs_testset)
        P, R, F1 = calculate_P_R_F1(TP,FN,FP)
        print("--------------------------------------")
        print("Genre: ", genre)
        print("--------------------------------------")
        print("Precision: ", P )
        print("Recall: ", R)
        print("F1-score: ", F1)
    ma = microaverage(pair_to_genre_testset, genre_to_pairs_answer, genre_to_pairs_testset, genres)
    print("Microaverage is:", ma)
    
print_P_R_F1_per_class(genres, pair_to_genre_testset, genre_to_pairs_answer, genre_to_pairs_testset)
                        

--------------------------------------
Genre:  Pop
--------------------------------------
Precision:  0.9473684210526315
Recall:  0.28125
F1-score:  0.43373493975903615
--------------------------------------
Genre:  HipHop
--------------------------------------
Precision:  0.9974489795918368
Recall:  0.8650442477876106
F1-score:  0.9265402843601895
--------------------------------------
Genre:  Rock
--------------------------------------
Precision:  0.9954954954954955
Recall:  0.7673611111111112
F1-score:  0.8666666666666667
--------------------------------------
Genre:  Metal
--------------------------------------
Precision:  0.9375
Recall:  0.17857142857142858
F1-score:  0.30000000000000004
--------------------------------------
Genre:  Other
--------------------------------------
Precision:  0.5
Recall:  0.125
F1-score:  0.2
--------------------------------------
Genre:  Country
--------------------------------------
Precision:  0.5
Recall:  0.3333333333333333
F1-score:  0.4
-------

In [None]:
# Train the Naive Base algorithm on the training set, using the top_100 MI words as the vocabulary
def train_multinomial_NB_voc(genre_to_all_song_lyrics, training_set_length, genre_to_amount_of_songs,top_100):
    P_t_c_dict = {}
    P_c_dict = {}
    
    v = top_100
       
    size_vocab = len(v)
    # N is the amount of songs in the training set
    N = training_set_length
    for genre in genre_to_all_song_lyrics:
        # N_c is the amount of songs in a genre
        N_c = genre_to_amount_of_songs[genre]
        P_c = N_c/N
        # If by chance there aren't any documents of a genre and the P_c is 0, we assign a very low 
        # value to P_c, so there are no math errors when we take the log10 of P_c later on.
        if N_c is 0:
            P_c = 0.0000001
            print(genre, 'does not occur in documents')
        P_c_dict[genre] = P_c
        # text_c is all the text of all the songs in one genre
        text_c = genre_to_all_song_lyrics[genre]
        for term in v:
            # Get the occurences of a term in all the text of a class
            T_ct = text_c.count(term)
            P_t_c = (T_ct+1)/(len(text_c)+size_vocab)
            P_t_c_dict[(term, genre)] = P_t_c
    return P_t_c_dict, P_c_dict, v


In [None]:
import operator

# Returns a list of the top x words for all classes ranked on their mutual information 
def get_top_x_of_all_classes(genre_to_pairs, pair_to_lyrics, genre, genres, x):
    top_x = []
    for genre in genres:
        term_mi_score_dict = {}
        for term in vocabulary:
            mi = mi_utility(genre_to_pairs, pair_to_lyrics, term, genre)
            term_mi_score_dict[term] = mi
        for i in range(x):
            # Get the key of the highest score value in the term_mi_score_dict
            max_key = max(term_mi_score_dict.items(), key=operator.itemgetter(1))[0]
            # Remove the highest score from term_mi_score_dict
            del term_mi_score_dict[max_key]
            top_x.append(max_key)
    return top_x

top_100 = get_top_x_of_all_classes(genre_to_pairs, pair_to_lyrics, genre, genres, 100)
print("We got the top 100")
P_t_c_dict_100, P_c_dict_100, vocabulary = train_multinomial_NB_voc(genre_to_all_song_lyrics, training_set_length, genre_to_amount_of_songs, top_100)

# Evaluate the Naive Bayes using the top 100 words with the highest MI value per genre
genre_to_pairs_testset, pair_to_genre_testset, genre_to_pairs_answer = testset_info_from_csv('output.csv', P_t_c_dict_100, P_c_dict_100, genres)
print_P_R_F1_per_class(genres, pair_to_genre_testset, genre_to_pairs_answer, genre_to_pairs_testset)
                        