In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
from datetime import datetime

def log(message):
    print(datetime.now().strftime("%H:%M:%S -"), message)
    
def printnow():
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

In [3]:
# Create the dictionary of dataframes for training and testing
section_headers = ['Intro','Verse','Refrain','Pre-Chorus','Chorus','Post-Chorus','Hooks','Riffs/Basslines','Scratches','Sampling','Bridge','Interlude','Skit','Collision','Instrumental','Solo','Ad-lib','Segue','Outro']
header_strip_list = '|'.join(['\[' + header + '\]' for header in section_headers])

def header_to_filename(train, header):
    if train: return 'section_train_test/train_' + header.replace('/', '_').lower() + '.zip'
    else: return 'section_train_test/test_' + header.replace('/', '_').lower() + '.zip'

log('Reading data from files')
train_dfs = {header:pd.read_pickle(header_to_filename(1, header)) for header in section_headers}
test_dfs = {header:pd.read_pickle(header_to_filename(0, header)) for header in section_headers}
log('Done\n')

# For dataframes without samples of each genre, add the empty string as lyrics for all genres
dummy_df = pd.DataFrame(data={'lyrics' : 8*[''], 'genre' : 2*['country', 'hiphop', 'pop', 'rock']})
for header in section_headers:
    if len(train_dfs[header]) < 5: train_dfs[header] = pd.concat([train_dfs[header], dummy_df])
    if len(test_dfs[header]) < 5: test_dfs[header] = pd.concat([test_dfs[header], dummy_df])
    train_dfs[header] = train_dfs[header].reset_index(drop=True)
    test_dfs[header] = test_dfs[header].reset_index(drop=True)


23:28:33 - Reading data from files
23:28:33 - Done



In [4]:
log('Finding the counts of each section type in the corpus')
total_length = 0
for i,header in enumerate(section_headers):
    length = len(train_dfs[header]) + len(test_dfs[header])
    print(i + 1, header + ':', length)
    total_length += len(train_dfs[header]) + len(test_dfs[header])
log(f'Done: total length is {total_length}\n')

log('Finding the percent frequencies of each section type in the corpus')
corpus_weights = {}
for i,header in enumerate(section_headers):
    frequency = (len(train_dfs[header]) + len(test_dfs[header])) / total_length
    print(i + 1, header + ':', frequency)
    corpus_weights[header] = frequency
log('Done\n')


23:28:36 - Finding the counts of each section type in the corpus
1 Intro: 1056
2 Verse: 11248
3 Refrain: 438
4 Pre-Chorus: 4128
5 Chorus: 13602
6 Post-Chorus: 1029
7 Hooks: 16
8 Riffs/Basslines: 16
9 Scratches: 16
10 Sampling: 16
11 Bridge: 3417
12 Interlude: 259
13 Skit: 17
14 Collision: 18
15 Instrumental: 190
16 Solo: 177
17 Ad-lib: 19
18 Segue: 16
19 Outro: 2332
23:28:36 - Done: total length is 38010

23:28:36 - Finding the percent frequencies of each section type in the corpus
1 Intro: 0.027782162588792424
2 Verse: 0.2959221257563799
3 Refrain: 0.011523283346487766
4 Pre-Chorus: 0.10860299921073402
5 Chorus: 0.3578531965272297
6 Post-Chorus: 0.02707182320441989
7 Hooks: 0.0004209418574059458
8 Riffs/Basslines: 0.0004209418574059458
9 Scratches: 0.0004209418574059458
10 Sampling: 0.0004209418574059458
11 Bridge: 0.0898973954222573
12 Interlude: 0.006813996316758748
13 Skit: 0.00044725072349381743
14 Collision: 0.00047355958958168905
15 Instrumental: 0.004998684556695606
16 Solo: 0.

In [49]:
#get one array of sections 
train_arr = [lyric for df in train_dfs.values() for lyric in df['lyrics']]

vectorizer_OH = CountVectorizer(min_df = 3, stop_words = 'english')
vectorizer_OH.fit(train_arr)
vocab_size = len(vectorizer_OH.get_feature_names())

In [79]:
# Create TFIDF up to trigram encoding for nonredundant data
def encode_OH(df): 
    #vocab_length = len(vectorizer_OH.get_feature_names())
    encoding_OH = vectorizer_OH.transform(df['lyrics'].values)
    df_OH = pd.DataFrame(encoding_OH.toarray())
    df_OH['y'] = df['genre']
    return df_OH

# Create the training and testing encoded dataframes
log('Encoding the training data')
train_OH = {header:encode_OH(train_dfs[header]) for header in section_headers}
log('Done\n')

log('Encoding the testing data')
test_OH = {header:encode_OH(test_dfs[header]) for header in section_headers}
log('Done\n')


01:47:02 - Encoding the training data
01:47:04 - Done

01:47:04 - Encoding the testing data
01:47:04 - Done



In [66]:
# Load the string lyrics data to make predictions
log('Loading the raw string data')
string_train = pd.read_pickle(r'train_test_data/train.zip')
string_test = pd.read_pickle(r'train_test_data/test.zip')
log('Done\n')

# splits the given lyrics by section 
def split_by_section(lyrics):
    headers = [word[1:-1] for word in lyrics.split() if word[0] == '[' and word[-1] == ']' and word[1:-1] in section_headers]
    split_sections = re.split(header_strip_list, lyrics)
    ret_sections = []
    for section in split_sections:
        mod_section = section.replace('[END]','').replace('[START]','').strip()
        if not(mod_section in ['', ' ','\n']): ret_sections.append(mod_section)
    return list(zip(headers, ret_sections))

# Turn the raw string data into tuples of section strings and the lyrics of that section
log('Splitting the raw string data')
split_string_train = string_train
split_string_train['lyrics'] = string_train['lyrics'].map(split_by_section)
split_string_test = string_test
split_string_test['lyrics'] = string_test['lyrics'].map(split_by_section)
log('Done\n')


01:36:26 - Loading the raw string data
01:36:26 - Done

01:36:26 - Splitting the raw string data
01:36:26 - Done



In [69]:
# Encode the split string data with TFIDF (up to trigrams) and SVD 
def split_lyrics_encode(split_lyrics):
    sections_array = [section for section,lyrics in split_lyrics]
    lyrics_array = [lyrics + ' ' for section,lyrics in split_lyrics]
    try:
        vec_OH = vectorizer_OH.transform(lyrics_array).toarray()
        assert len(vec_OH) == len(sections_array)
        assert len(vec_OH[0]) == vocab_size
        return list(zip(sections_array, vec_OH))
    except IndexError:
        log('ValueError' + str(len(sections_array)) + '\t' + str([len(lyrics) for lyrics in lyrics_array]))
        return list(zip(sections_array, np.array([[0]*vocab_size]*len(split_lyrics))))

# Turn the split string data into tuples of section strings and TFIDF encoded lyrics of that section
log('Splitting the encoding string lyrics of each section')
log('\tTesting set')
split_encoded_test = split_string_test
split_encoded_test['lyrics'] = split_string_test['lyrics'].map(split_lyrics_encode)
log('\tTraining set')
split_encoded_train = split_string_train
split_encoded_train['lyrics'] = split_string_train['lyrics'].map(split_lyrics_encode)
log('Done\n')

01:36:34 - Splitting the encoding string lyrics of each section
01:36:34 - 	Testing set
01:36:35 - 	Training set
01:36:36 - ValueError0	[]
01:36:36 - ValueError0	[]
01:36:38 - ValueError0	[]
01:36:38 - ValueError0	[]
01:36:38 - Done



In [70]:
log('Saving split encoded lyrics as pickles')
split_encoded_train.to_pickle('section_train_test/OH_encoded_train.zip')
split_encoded_test.to_pickle('section_train_test/OH_encoded_test.zip')
log('Done\n')


01:37:18 - Saving split encoded lyrics as pickles
01:38:07 - Done



In [85]:
log('Reading split encoded lyrics pickles into dataframes')
split_encoded_train = pd.read_pickle('section_train_test/OH_encoded_train.zip')
split_encoded_test = pd.read_pickle('section_train_test/OH_encoded_test.zip')
log('Done\n')


02:50:59 - Reading split encoded lyrics pickles into dataframes


MemoryError: 

In [72]:
class SectionSplitClassifier:
    def __init__(self, section_classifiers, weights):
        if section_classifiers:
            try: self.section_classifiers = {key:value for key,value in section_classifiers.items()}
            except AttributeError: raise ValueError('section_classifiers was not a dictionary')
        else: raise ValueError('section_classifiers was None')
        if weights:
            if np.round(np.sum([weight for weight in weights.values()])) == 1:
                try: self.weights = {key:value for key,value in weights.items()}
                except AttributeError: raise ValueError('weights was not a dictionary')
            else: raise ValueError('weights did not sum to 1')
        else: raise ValueError('weights was None')


    def set_section_classifiers(self, section_classifiers):
        if section_classifiers:
            try: self.section_classifiers = {key:value for key,value in section_classifiers.items()}
            except AttributeError: raise ValueError('section_classifiers was not a dictionary')
        else: raise ValueError('section_classifiers was None')


    def set_weights(self, weights):
        if weights:
            if np.round(np.sum([weight for weight in weights.values()])) == 1:
                try: self.weights = {key:value for key,value in weights.items()}
                except AttributeError: raise ValueError('weights was not a dictionary')
            else: raise ValueError('weights did not sum to 1')
        else: raise ValueError('weights was None')


    def fit(self, X, y, section, verbose=0):
        if verbose: print(f'Training {section}...')
        self.section_classifiers[section] = self.section_classifiers[section].fit(X,y)
        if verbose: print(f'Done training {section}')


    def predict(self, X):
        def predict_section(self, X_section, section):
            def str_to_array(string, weight):
                return float(weight) * np.array([
                    int(string == 'country'), 
                    int(string == 'hiphop'),
                    int(string == 'pop'), 
                    int(string == 'rock')])
            return [str_to_array(pred, self.weights[section]) for pred in self.section_classifiers[section].predict([X_section])]

        def predict_song(self, split_encoded):
            def array_to_str(array):
                return ['country','hiphop','pop','rock'][np.argmax(array)]
            pred = np.array([0,0,0,0])
            for section,encoding in split_encoded:
                pred = np.sum([pred, predict_section(self, encoding, section)], axis=0)
            return array_to_str(pred)

        preds = [predict_song(self, lyrics) for lyrics in X]
        assert len(preds) == len(X)
        return(preds)



In [73]:
# create the different kinds of section classifiers and their respective song classifiers
section_rf_classifiers = {
    header:RandomForestClassifier(criterion='entropy', ccp_alpha=0.0175) for header in section_headers}
section_rf_classifier = SectionSplitClassifier(section_rf_classifiers, corpus_weights)

section_ada_classifiers = {
    header:OneVsRestClassifier(AdaBoostClassifier(), n_jobs=-1) for header in section_headers}
section_ada_classifier = SectionSplitClassifier(section_ada_classifiers, corpus_weights)

section_svm_classifiers = {
    header:OneVsRestClassifier(SVC(kernel="linear", C=0.025), n_jobs=-1) for header in section_headers}
section_svm_classifier = SectionSplitClassifier(section_svm_classifiers, corpus_weights)

section_knn_classifiers = {
    header:OneVsOneClassifier(KNeighborsClassifier(3)) for header in section_headers}
section_knn_classifier = SectionSplitClassifier(section_knn_classifiers, corpus_weights)


In [81]:
# train each kind of song classifier
log('Training Random Forest classifier with all sections')
for header in section_headers:
    section_rf_classifier.fit(train_OH[header].drop(columns=['y']), train_OH[header]['y'], header, verbose=0)
log('Done\n')

log('Training ADA Boost classifier with all sections')
for header in section_headers:
    section_ada_classifier.fit(train_OH[header].drop(columns=['y']), train_OH[header]['y'], header, verbose=0)
log('Done\n')

log('Training Linear SVM classifier with all sections')
for header in section_headers:
    section_svm_classifier.fit(train_OH[header].drop(columns=['y']), train_OH[header]['y'], header, verbose=0)
log('Done\n')

log('Training k-Nearest Neighbors classifier with all sections')
for header in section_headers:
    section_knn_classifier.fit(train_OH[header].drop(columns=['y']), train_OH[header]['y'], header, verbose=0)
log('Done\n')


01:47:21 - Training Random Forest classifier with all sections
01:52:49 - Done

01:52:49 - Training ADA Boost classifier with all sections
02:02:49 - Done

02:02:49 - Training Linear SVM classifier with all sections
02:41:38 - Done

02:41:38 - Training k-Nearest Neighbors classifier with all sections
02:45:09 - Done



In [82]:
# predict the genres of both training and testing sets
log('start')
y_pred_train_rf = section_rf_classifier.predict(split_encoded_train['lyrics'])
y_pred_test_rf = section_rf_classifier.predict(split_encoded_test['lyrics'])
log('rf prediction done')
y_pred_train_ada = section_ada_classifier.predict(split_encoded_train['lyrics'])
y_pred_test_ada = section_ada_classifier.predict(split_encoded_test['lyrics'])
log('ada prediction done')
y_pred_train_svm = section_svm_classifier.predict(split_encoded_train['lyrics'])
y_pred_test_svm = section_svm_classifier.predict(split_encoded_test['lyrics'])
log('svm prediction done')
y_pred_train_knn = section_knn_classifier.predict(split_encoded_train['lyrics'])
y_pred_test_knn = section_knn_classifier.predict(split_encoded_test['lyrics'])
log('knn prediction done')

02:45:49 - start


ValueError: Number of features of the model must match the input. Model n_features is 8642 and input n_features is 500 

In [45]:
def show_results(y_pred_train, y_pred_test):
    print(f'Training Set Accuracy: {accuracy_score(split_encoded_train["genre"], y_pred_train)}')
    print(f'Classification Report Training Set:\n{classification_report(split_encoded_train["genre"], y_pred_train)}')
    print('-' * 20)
    print(f'Confusion Matrix Training Set:\n{confusion_matrix(split_encoded_train["genre"], y_pred_train)}\n')
    
    print(f'Testing Set Accuracy: {accuracy_score(split_encoded_test["genre"], y_pred_test)}')
    print(f'Classification Report Testing Set:\n{classification_report(split_encoded_test["genre"], y_pred_test)}')
    print('-' * 20)
    print(f'Confusion Matrix Testing Set:\n{confusion_matrix(split_encoded_test["genre"], y_pred_test)}\n')
y_pred_trains = [y_pred_train_rf, y_pred_train_ada, y_pred_train_svm, y_pred_train_knn]
y_pred_tests = [y_pred_test_rf, y_pred_test_ada, y_pred_test_svm, y_pred_test_knn]
for train, test in zip(y_pred_trains, y_pred_tests):
    show_results(train, test)

Training Set Accuracy: 0.5111788617886179
Classification Report Training Set:
              precision    recall  f1-score   support

     country       0.72      0.29      0.41       738
      hiphop       0.48      0.85      0.61      1131
         pop       0.51      0.86      0.64       973
        rock       0.64      0.01      0.02      1094

    accuracy                           0.51      3936
   macro avg       0.59      0.50      0.42      3936
weighted avg       0.58      0.51      0.42      3936

--------------------
Confusion Matrix Training Set:
[[211  41 486   0]
 [ 12 958 157   4]
 [ 59  79 834   1]
 [ 11 919 155   9]]

Testing Set Accuracy: 0.46294416243654823
Classification Report Testing Set:
              precision    recall  f1-score   support

     country       0.71      0.24      0.36       199
      hiphop       0.42      0.83      0.56       255
         pop       0.48      0.82      0.60       239
        rock       0.00      0.00      0.00       292

    accu