In [56]:
import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
import sklearn.metrics as metrics
from imblearn.pipeline import make_pipeline as mp_imb
import pickle

## 1. Functions for step 2

In [57]:
def get_lyrics_list(root_dir, artist_list):
    """
    Collect the lyrics of every song file stored in the given artist folders.

    Parameters
    ----------
    root_dir : str
        Directory that contains one sub-folder per artist. A trailing path
        separator is optional.
    artist_list : list of str
        Names of the artist sub-folders to read.

    Returns
    -------
    list of [str, str]
        2D list: 1. column: lyrics, 2. column: artist. The lyrics are
        lower-cased and every newline is replaced by a single space.
    """
    master_list = []
    for artist in artist_list:
        artist_dir = os.path.join(root_dir, artist)
        for file_name in os.listdir(artist_dir):
            file_path = os.path.join(artist_dir, file_name)
            # skip nested folders (e.g. .ipynb_checkpoints): they cannot be opened as text
            if not os.path.isfile(file_path):
                continue
            # the context manager closes the file handle as soon as the text is read
            with open(file_path) as lyrics_file:
                text = lyrics_file.read()
            text = text.replace('\n', ' ')  # replacing \n in the text with whitespace
            text = text.lower()
            master_list.append([text, artist])
    return master_list


In [58]:
def create_dataframe(data):
    """
    Wrap a 2D list of [lyrics, artist] pairs in a pandas DataFrame.

    The first element of every pair becomes the 'lyrics_X' column, the second
    one the 'artist_y' column. The new DataFrame is returned.
    """
    column_names = ['lyrics_X', 'artist_y']
    lyrics_frame = pd.DataFrame(data=data, columns=column_names)
    return lyrics_frame

In [59]:
def evaluate_model(ytrue, ypred, model, artist_pred_dict, artist_list):
    """
    Print the overall accuracy plus precision, recall and F1 score per artist.

    ytrue / ypred are only used for the accuracy. For every artist in
    artist_list, artist_pred_dict[artist][0] is handed to the metric functions
    as the true labels and artist_pred_dict[artist][1] as the predicted labels.
    model is the label printed as the headline. All values are rounded to
    3 decimals; nothing is returned.
    """
    print(f'{model}: ')
    overall_accuracy = round(metrics.accuracy_score(ytrue, ypred), 3)
    print(f'accuracy: {overall_accuracy}')
    # evaluated in this order for every artist: precision, recall, F1
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    for artist in artist_list:
        true_flags = artist_pred_dict[artist][0]
        pred_flags = artist_pred_dict[artist][1]
        precision, recall, f1_value = [round(scorer(true_flags, pred_flags), 3) for scorer in scorers]
        print(f"{artist}: precision: {precision}, recall: {recall}, F1 Score: {f1_value}")

In [61]:
def get_prediction_for_artist(yval, ypred, artist_list):
    """
    Build, for every artist, a pair of 1/0 lists from the true and predicted labels.

    Only the positions at which either the true label or the predicted label
    equals the artist are considered.

    returns a dictionary
    keys: artists,
    values: [true_flags, pred_flags]
    true_flags: 1 where the true label is the artist, otherwise 0
    pred_flags: 1 where the predicted label is the artist, otherwise 0
    Both lists have the same length and are what the TP, FP and FN counts
    behind the per-artist metrics are derived from.
    """
    flags_by_artist = {}
    for artist in artist_list:
        true_flags = []
        pred_flags = []
        for actual, predicted in zip(yval, ypred):
            actual_hit = actual == artist
            predicted_hit = predicted == artist
            # positions that do not involve this artist at all are left out
            if not (actual_hit or predicted_hit):
                continue
            true_flags.append(1 if actual_hit else 0)
            pred_flags.append(1 if predicted_hit else 0)
        flags_by_artist[artist] = [true_flags, pred_flags]
    return flags_by_artist

## 2. Create a dataframe with lyrics and corresponding artist

In [62]:
root_dir = '../data/songs3/'
# get the artist names from the directory names; entries of root_dir that are not
# directories (e.g. a stray .DS_Store file) are ignored so they are not treated as artists
artist_list = [entry for entry in os.listdir(root_dir)
               if os.path.isdir(os.path.join(root_dir, entry))]
# get a 2D list. axis 1: list with lyrics & artist for each song. axis 0: all songs of all artists
lyrics_list = get_lyrics_list(root_dir, artist_list)
# create a dataframe with artists and lyrics as columns
df_train = create_dataframe(lyrics_list)
# train/validation split: 10 % of the songs are held out, random_state fixes the shuffling
Xtrain, Xval, ytrain, yval = train_test_split(df_train['lyrics_X'], df_train['artist_y'], test_size=0.1, random_state=42)
# check if target data is balanced
pd.DataFrame(ytrain).value_counts()

artist_y     
Frank_Sinatra    199
Johnny_Cash      178
Eminem           147
Madonna          137
Bob_Marley       114
The_Kooks         94
Amy_Winehouse     85
dtype: int64

## 3. Create pipelines with gridsearch hyperparameter optimization

In [63]:
# Pipeline 1: tf-idf vectorizer -> RandomOverSampler (to balance the imbalanced
# lyric documents) -> Multinomial Naive Bayes classifier for the lyrics
pipeline1 = mp_imb(
    TfidfVectorizer(stop_words='english', ngram_range=(1, 1)),
    RandomOverSampler(
        random_state=42,
        # the four listed artists are each over-sampled to 160 songs
        sampling_strategy=dict.fromkeys(
            ['Amy_Winehouse', 'The_Kooks', 'Bob_Marley', 'Madonna'], 160),
    ),
    MultinomialNB(),
)
# pipeline1.get_params()

In [64]:
# create an alternative pipeline with a RandomForestClassifier and a different over-sampling strategy
pipeline2 = mp_imb(TfidfVectorizer(stop_words='english', ngram_range=(1, 1)),
                    RandomOverSampler(random_state=42, sampling_strategy={'Amy_Winehouse': 180,
                            'The_Kooks':180, 'Bob_Marley': 180, 'Madonna': 180}),
                    # random_state seeds the forest so that repeated fits give the same model
                    RandomForestClassifier(n_estimators=100, max_depth=15, class_weight='balanced',
                                           random_state=42))
# pipeline2.get_params()


In [65]:
# Hyperparameter optimization for the Multinomial Naive Bayes Classifier
# The grid search is skipped unless use_code is set to True.
use_code = False
if use_code:
    # grid over the n-gram range, the smoothing parameter alpha and two over-sampling strategies
    params1 = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
                'multinomialnb__alpha': [0.3, 0.4, 0.5, 0.6],
                'randomoversampler__sampling_strategy': [{'Amy_Winehouse': 165, 'The_Kooks': 165, 'Bob_Marley': 165, 'Madonna': 165},
                                                        {'Amy_Winehouse': 130, 'The_Kooks': 130, 'Bob_Marley': 160, 'Madonna': 160}]
                }
    gs = GridSearchCV(pipeline1, params1, cv=6)
    gs.fit(Xtrain, ytrain)
    print(gs.best_params_)
    print(gs.best_score_)
    # a bare expression inside an if-block is not rendered by the notebook,
    # so the results table has to be displayed explicitly
    display(pd.DataFrame(gs.cv_results_))



In [66]:
# Hyperparameter optimization for the Random Forest Classifier
# The grid search is skipped unless use_code is set to True.
use_code = False
if use_code:
    # grid over tree depth, number of trees, class weighting and two over-sampling strategies
    params2 = { #'tfidfvectorizer__ngram_range': [(1, 1)],
                'randomforestclassifier__max_depth': [24, 26, 28],
                'randomforestclassifier__n_estimators': [120, 122, 124],
                'randomforestclassifier__class_weight': [None, 'balanced'],
                'randomoversampler__sampling_strategy': [{'Amy_Winehouse': 165, 'The_Kooks': 165, 'Bob_Marley': 165, 'Madonna': 165},
                                                        {'Amy_Winehouse': 130, 'The_Kooks': 130, 'Bob_Marley': 160, 'Madonna': 160}]
                }
    gs2 = GridSearchCV(pipeline2, params2, cv=6)
    gs2.fit(Xtrain, ytrain)
    print(gs2.best_params_)
    print(gs2.best_score_)
    # a bare expression inside an if-block is not rendered by the notebook,
    # so the results table has to be displayed explicitly
    display(pd.DataFrame(gs2.cv_results_))


In [67]:
# RF-Classifier pipeline with the best results from the prior hyperparameter optimization
pipeline3 = mp_imb(TfidfVectorizer(stop_words='english', ngram_range=(1, 1)),
                            RandomOverSampler(random_state=42, sampling_strategy={'Amy_Winehouse': 200,
                            'The_Kooks':200, 'Bob_Marley': 200, 'Madonna': 200}),
                            # random_state seeds the forest; without it the scores below
                            # (and the model that is saved later) change on every run
                            RandomForestClassifier(max_depth=24, n_estimators=120, class_weight='balanced',
                                                   random_state=42))
# fit the pipeline and calculate the overall train and validation accuracy
pipeline3.fit(Xtrain,ytrain)
print(round(pipeline3.score(Xtrain, ytrain), 4), round(pipeline3.score(Xval, yval), 4))
# get the necessary lists to calculate TP, FP, FN for each artist (which are needed for the metrics)
yval_pred = pipeline3.predict(Xval)
artist_pred_dict = get_prediction_for_artist(list(yval), yval_pred, artist_list)
# evaluate the model for each artist with precision, recall and F1 score of the validation data
evaluate_model(yval, yval_pred, 'RFC', artist_pred_dict, artist_list)



0.9832 0.6916
RFC: 
accuracy: 0.692
Eminem: precision: 0.923, recall: 0.857, F1 Score: 0.889
The_Kooks: precision: 0.556, recall: 0.625, F1 Score: 0.588
Madonna: precision: 1.0, recall: 0.522, F1 Score: 0.686
Bob_Marley: precision: 0.867, recall: 0.812, F1 Score: 0.839
Amy_Winehouse: precision: 1.0, recall: 0.833, F1 Score: 0.909
Johnny_Cash: precision: 0.647, recall: 0.524, F1 Score: 0.579
Frank_Sinatra: precision: 0.444, recall: 0.842, F1 Score: 0.582


In [68]:
# Multinomial Naive Bayes pipeline set up with the best results from the prior
# hyperparameter optimization
pipeline4 = mp_imb(
    TfidfVectorizer(stop_words='english', ngram_range=(1, 1)),
    RandomOverSampler(
        random_state=42,
        # the four listed artists are each over-sampled to 180 songs
        sampling_strategy=dict.fromkeys(
            ['Amy_Winehouse', 'The_Kooks', 'Bob_Marley', 'Madonna'], 180),
    ),
    MultinomialNB(alpha=0.6),
)
# train on the training split, then print train and validation accuracy side by side
pipeline4.fit(Xtrain, ytrain)
print(*(round(pipeline4.score(features, labels), 4)
        for features, labels in ((Xtrain, ytrain), (Xval, yval))))
# per-artist 1/0 lists from which TP, FP and FN (needed for the metrics) are derived
yval_pred = pipeline4.predict(Xval)
artist_pred_dict = get_prediction_for_artist(list(yval), yval_pred, artist_list)
# precision, recall and F1 score of every artist on the validation data
evaluate_model(yval, yval_pred, 'MNB', artist_pred_dict, artist_list)

0.9706 0.7196
MNB: 
accuracy: 0.72
Eminem: precision: 0.778, recall: 1.0, F1 Score: 0.875
The_Kooks: precision: 0.5, recall: 0.5, F1 Score: 0.5
Madonna: precision: 0.824, recall: 0.609, F1 Score: 0.7
Bob_Marley: precision: 0.923, recall: 0.75, F1 Score: 0.828
Amy_Winehouse: precision: 0.417, recall: 0.833, F1 Score: 0.556
Johnny_Cash: precision: 0.765, recall: 0.619, F1 Score: 0.684
Frank_Sinatra: precision: 0.682, recall: 0.789, F1 Score: 0.732


## 4. Save both models (pipelines) with pickle dump

In [69]:
# refit both pipelines on the full data before they are saved
for fitted_pipeline in (pipeline3, pipeline4):
    fitted_pipeline.fit(df_train['lyrics_X'], df_train['artist_y'])
# pickle each pipeline into its own file: first the RandomForestClassifier,
# then the NaiveBayesClassifier
for pickle_path, fitted_pipeline in (("model_RFC.pickle", pipeline3),
                                     ("model_MNB.pickle", pipeline4)):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(fitted_pipeline, pickle_file)
