In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
from datetime import datetime

def log(message):
    print(datetime.now().strftime("%H:%M:%S -"), message)
    
def printnow():
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

In [3]:
# Create the dictionary of dataframes for training and testing
section_headers = ['Intro','Verse','Refrain','Pre-Chorus','Chorus','Post-Chorus','Hooks','Riffs/Basslines','Scratches','Sampling','Bridge','Interlude','Skit','Collision','Instrumental','Solo','Ad-lib','Segue','Outro']
header_strip_list = '|'.join(['\[' + header + '\]' for header in section_headers])

def header_to_filename(train, header):
    if train: return 'section_train_test/train_' + header.replace('/', '_').lower() + '.zip'
    else: return 'section_train_test/test_' + header.replace('/', '_').lower() + '.zip'

log('Reading data from files')
train_dfs = {header:pd.read_pickle(header_to_filename(1, header)) for header in section_headers}
test_dfs = {header:pd.read_pickle(header_to_filename(0, header)) for header in section_headers}
log('Done\n')

# For dataframes without samples of each genre, add the empty string as lyrics for all genres
dummy_df = pd.DataFrame(data={'lyrics' : 16*[''], 'genre' : 4*['country', 'hiphop', 'pop', 'rock']})
for header in section_headers:
    if len(train_dfs[header]) < 5: train_dfs[header] = pd.concat([train_dfs[header], dummy_df])
    if len(test_dfs[header]) < 5: test_dfs[header] = pd.concat([test_dfs[header], dummy_df])
    train_dfs[header] = train_dfs[header].reset_index(drop=True)
    test_dfs[header] = test_dfs[header].reset_index(drop=True)


15:20:32 - Reading data from files
15:20:32 - Done



In [4]:
log('Finding the counts of each section type in the corpus')
total_length = 0
for i,header in enumerate(section_headers):
    length = len(train_dfs[header]) + len(test_dfs[header])
    print(i + 1, header + ':', length)
    total_length += len(train_dfs[header]) + len(test_dfs[header])
log(f'Done: total length is {total_length}\n')

log('Finding the percent frequencies of each section type in the corpus')
corpus_weights = {}
for i,header in enumerate(section_headers):
    frequency = (len(train_dfs[header]) + len(test_dfs[header])) / total_length
    print(i + 1, header + ':', frequency)
    corpus_weights[header] = frequency
log('Done\n')


15:20:32 - Finding the counts of each section type in the corpus
1 Intro: 1056
2 Verse: 11248
3 Refrain: 438
4 Pre-Chorus: 4128
5 Chorus: 13602
6 Post-Chorus: 1029
7 Hooks: 32
8 Riffs/Basslines: 32
9 Scratches: 32
10 Sampling: 32
11 Bridge: 3417
12 Interlude: 259
13 Skit: 33
14 Collision: 34
15 Instrumental: 190
16 Solo: 177
17 Ad-lib: 35
18 Segue: 32
19 Outro: 2332
15:20:32 - Done: total length is 38138

15:20:32 - Finding the percent frequencies of each section type in the corpus
1 Intro: 0.027688919188211234
2 Verse: 0.29492894226231053
3 Refrain: 0.011484608526928522
4 Pre-Chorus: 0.10823850228118936
5 Chorus: 0.3566521579526981
6 Post-Chorus: 0.026980963868058105
7 Hooks: 0.0008390581572185222
8 Riffs/Basslines: 0.0008390581572185222
9 Scratches: 0.0008390581572185222
10 Sampling: 0.0008390581572185222
11 Bridge: 0.08959567885049033
12 Interlude: 0.006791126959987414
13 Skit: 0.0008652787246316011
14 Collision: 0.0008914992920446798
15 Instrumental: 0.004981907808484976
16 Solo: 0

In [5]:
# Load nonredundant data
country_df = pd.read_pickle(r'train_test_data/country_data.zip')
country_df['genre'] = 'country'
hiphop_df = pd.read_pickle(r'train_test_data/hiphop_data.zip')
hiphop_df['genre'] = 'hiphop'
pop_df = pd.read_pickle(r'train_test_data/pop_data.zip')
pop_df['genre'] = 'pop'
rock_df = pd.read_pickle(r'train_test_data/rock_data.zip')
rock_df['genre'] = 'rock'
full_df = pd.concat([country_df,hiphop_df, pop_df,rock_df])
full_df.reset_index(inplace=True, drop=True)
full_df

# Combine lyrics into one list to be input for tf-idf vectorizer WITH UNIGRAMS UP TO TRIGRAMS
full_lyrics = [lyrics.lower() for lyrics in full_df['lyrics']]
log(f'Training TF-IDF Vectorizer on all {len(full_lyrics)} lyrics')
tfidf_ngram_vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1,3)).fit(full_lyrics)
tfidf_ngram_features = tfidf_ngram_vectorizer.get_feature_names()
tfidf_ngram_data = tfidf_ngram_vectorizer.transform(full_lyrics)
log('Fitting SVD on all lyrics')
svd_ngram = TruncatedSVD(n_components=500).fit(tfidf_ngram_data)
log('Done\n')


15:20:33 - Training TF-IDF Vectorizer on all 4921 lyrics
15:20:47 - Fitting SVD on all lyrics
15:22:41 - Done



In [6]:
# Create TFIDF up to trigram encoding for nonredundant data
def encode_tfidf_ngram_svd(df): 
    tfidf_ngram_vec = tfidf_ngram_vectorizer.transform(df['lyrics'].values)
    svd_ngram_vec = svd_ngram.transform(tfidf_ngram_vec)
    tfidf_ngram_df = pd.DataFrame(svd_ngram_vec)
    tfidf_ngram_df['y'] = df['genre']
    return tfidf_ngram_df

# Create the training and testing encoded dataframes
log('Encoding the training data')
tfidf_train = {header:encode_tfidf_ngram_svd(train_dfs[header]) for header in section_headers}
log('Done\n')

log('Encoding the testing data')
tfidf_test = {header:encode_tfidf_ngram_svd(test_dfs[header]) for header in section_headers}
log('Done\n')


15:22:41 - Encoding the training data
15:23:07 - Done

15:23:07 - Encoding the testing data
15:23:29 - Done



In [7]:
# Load the string lyrics data to make predictions
log('Loading the raw string data')
string_train = pd.read_pickle(r'train_test_data/train.zip')
string_test = pd.read_pickle(r'train_test_data/test.zip')
log('Done\n')

# splits the given lyrics by section 
def split_by_section(lyrics):
    headers = [word[1:-1] for word in lyrics.split() if word[0] == '[' and word[-1] == ']' and word[1:-1] in section_headers]
    split_sections = re.split(header_strip_list, lyrics)
    ret_sections = []
    for section in split_sections:
        mod_section = section.replace('[END]','').replace('[START]','').strip()
        if not(mod_section in ['', ' ','\n']): ret_sections.append(mod_section)
    return list(zip(headers, ret_sections))

# Turn the raw string data into tuples of section strings and the lyrics of that section
log('Splitting the raw string data')
split_string_train = string_train
split_string_train['lyrics'] = string_train['lyrics'].map(split_by_section)
split_string_test = string_test
split_string_test['lyrics'] = string_test['lyrics'].map(split_by_section)
log('Done\n')


15:23:29 - Loading the raw string data
15:23:29 - Done

15:23:29 - Splitting the raw string data
15:23:30 - Done



In [15]:
# Encode the split string data with TFIDF (up to trigrams) and SVD 
def split_lyrics_encode(split_lyrics):
    sections_array = [section for section,lyrics in split_lyrics]
    lyrics_array = [lyrics + ' ' for section,lyrics in split_lyrics]
    try:
        tfidf_ngram_vec = tfidf_ngram_vectorizer.transform(lyrics_array)
        svd_ngram_vec = svd_ngram.transform(tfidf_ngram_vec)
        assert len(svd_ngram_vec) == len(sections_array)
        assert len(svd_ngram_vec[0]) == 500
        return list(zip(sections_array, svd_ngram_vec))
    except ValueError:
        log('ValueError' + str(len(sections_array)) + '\t' + str([len(lyrics) for lyrics in lyrics_array]))
        return list(zip(sections_array, np.array([[0]*500]*len(split_lyrics))))

# Turn the split string data into tuples of section strings and TFIDF encoded lyrics of that section
log('Splitting the encoding string lyrics of each section')
log('\tTesting set')
split_encoded_test = split_string_test
split_encoded_test['lyrics'] = split_string_test['lyrics'].map(split_lyrics_encode)
log('\tTraining set')
split_encoded_train = split_string_train
split_encoded_train['lyrics'] = split_string_train['lyrics'].map(split_lyrics_encode)
log('Done\n')


15:30:46 - Splitting the encoding string lyrics of each section
15:30:46 - 	Training set
15:38:29 - ValueError0	[]
15:49:33 - ValueError0	[]


KeyboardInterrupt: 

In [None]:
log('Saving split encoded lyrics as pickles')
split_encoded_train.to_pickle('section_train_test/split_encoded_train.zip')
split_encoded_test.to_pickle('section_train_test/split_encoded_test.zip')
log('Done\n')


In [133]:
class SectionSplitClassifier:
    def __init__(self, section_classifiers, weights):
        if section_classifiers:
            try: self.section_classifiers = {key:value for key,value in section_classifiers.items()}
            except AttributeError: raise ValueError('section_classifiers was not a dictionary')
        else: raise ValueError('section_classifiers was None')
        if weights:
            if np.round(np.sum([weight for weight in weights.values()])) == 1:
                try: self.weights = {key:value for key,value in weights.items()}
                except AttributeError: raise ValueError('weights was not a dictionary')
            else: raise ValueError('weights did not sum to 1')
        else: raise ValueError('weights was None')


    def set_section_classifiers(self, section_classifiers):
        if section_classifiers:
            try: self.section_classifiers = {key:value for key,value in section_classifiers.items()}
            except AttributeError: raise ValueError('section_classifiers was not a dictionary')
        else: raise ValueError('section_classifiers was None')


    def set_weights(self, weights):
        if weights:
            if np.round(np.sum([weight for weight in weights.values()])) == 1:
                try: self.weights = {key:value for key,value in weights.items()}
                except AttributeError: raise ValueError('weights was not a dictionary')
            else: raise ValueError('weights did not sum to 1')
        else: raise ValueError('weights was None')


    def fit(self, X, y, section, verbose=0):
        if verbose: print(f'Training {section}...')
        self.section_classifiers[section] = self.section_classifiers[section].fit(X,y)
        if verbose: print(f'Done training {section}')


    # def predict(self, X_array, verbose=1):
    def predict(self, X, verbose=1):
        def predict_section(self, X_section, section):
            def str_to_array(string, weight):
                return float(weight) * np.array([
                    int(string == 'country'), 
                    int(string == 'hiphop'),
                    int(string == 'pop'), 
                    int(string == 'rock')])
            return [str_to_array(pred, self.weights[section]) for pred in self.section_classifiers[section].predict([X_section])]

        def predict_song(self, split_encoded):
            def array_to_str(array):
                return ['country','hiphop','pop','rock'][array.index(max(array))]
            pred = np.array([0,0,0,0])
            for section,encoding in split_encoded:
                pred = np.sum([pred, predict_section(encoding, section)], axis=0)
            return array_to_str(pred)

        preds = [predict_song(self, lyrics) for lyrics in X]
        assert len(preds) == len(X)
        return(preds)

        # weighted_preds = [predict_section(self, X, section) for X,section in zip(X_array,self.section_classifiers.keys())]
        # print(len(weighted_preds), len(weighted_preds[0]), len(weighted_preds[0][0]))
        # summed_preds = np.sum(weighted_preds, axis=0)
        # print(len(self.section_classifiers), len(summed_preds), len(summed_preds[0]))
        # # assert len(weighted_preds) == len(self.section_classifiers)
        # # assert len(weighted_preds[0]) == len(summed_preds)
        # # assert len(weighted_preds[0][0]) == len(summed_preds[0])
        # return weighted_preds, summed_preds



In [96]:
# Instantiate a SectionSplitClassifier and train it on all train_dfs
# param_grid = {'ccp_alpha':[0.01725, 0.0175, 0.01775]}
# section_rf_classifiers = {
#     header:GridSearchCV(RandomForestClassifier(criterion='entropy'), param_grid, cv=5) for header in section_headers
# }
section_rf_classifiers = {
    header:RandomForestClassifier(criterion='entropy', ccp_alpha=0.0175) for header in section_headers
}

section_split_classifier = SectionSplitClassifier(section_rf_classifiers, corpus_weights)

# fit the sub-classifiers in the SectionSplitClassifier
log('Training all section classifiers')
for header in section_headers:
    section_split_classifier.fit(tfidf_train[header].drop(columns=['y']), tfidf_train[header]['y'], header, verbose=1)
log('Done\n')


Training Intro...
Done training Intro
Training Verse...
Done training Verse
Training Refrain...
Done training Refrain
Training Pre-Chorus...
Done training Pre-Chorus
Training Chorus...
Done training Chorus
Training Post-Chorus...
Done training Post-Chorus
Training Hooks...
Done training Hooks
Training Riffs/Basslines...
Done training Riffs/Basslines
Training Scratches...
Done training Scratches
Training Sampling...
Done training Sampling
Training Bridge...
Done training Bridge
Training Interlude...
Done training Interlude
Training Skit...
Done training Skit
Training Collision...
Done training Collision
Training Instrumental...
Done training Instrumental
Training Solo...
Done training Solo
Training Ad-lib...
Done training Ad-lib
Training Segue...
Done training Segue
Training Outro...
Done training Outro


In [99]:
trained_classifiers = section_split_classifier.section_classifiers

In [134]:
section_split_classifier2 = SectionSplitClassifier(trained_classifiers, corpus_weights)
X_train_array = [tfidf_test[header].drop(columns=['y']) for header in section_headers]
string_predictions = section_split_classifier2.predict(X_train_array, verbose=1)

AttributeError: 'DataFrame' object has no attribute 'split'

In [124]:
np.sum([np.array(weighted_predictions[i]).shape[0] for i in range(len(weighted_predictions))])

7661

In [125]:
summed_predictions

   , 0.        , 0.29592213, 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.        , 0.29592213, 0.        ]),
 array([0.        , 0.        , 0.29592213, 0.        ]),
 array([0.        , 0.        , 0.29592213, 0.        ]),
 array([0.        , 0.        , 0.29592213, 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.29592213, 0.        , 0.        ]),
 array([0.        , 0.2959221

In [93]:
train_dfs['Hooks']

Unnamed: 0,lyrics,genre
0,,country
1,,hiphop
2,,pop
3,,rock
4,,country
5,,hiphop
6,,pop
7,,rock


In [94]:
train_dfs['Chorus']

Unnamed: 0,lyrics,genre
0,"\nI'ma break you off, let me be your motivatio...",pop
1,"\nI'ma break you off, let me be your motivatio...",pop
2,"\nI'ma break you off, let me be your motivatio...",pop
3,\nIf everything you see is what you're believi...,hiphop
4,\nIf everything you see is what you're believi...,hiphop
...,...,...
10876,\nI am flesh and I am bone\nI'll rise ting tin...,hiphop
10877,\nI am flesh and I am bone\nI'll rise ting tin...,hiphop
10878,"\nTo the right, to the left\nWe will fight to ...",rock
10879,"\nTo the right, to the left\nWe will fight to ...",rock


In [103]:
arr = [[[0, 1, 2, 3], [4, 5, 6, 7]],
       [[8, 9, 10, 11], [12, 13, 14, 15]],
       [[16, 17, 18, 19], [20, 21, 22, 23]]]

summed = np.sum(arr, axis=0)

print(f'len(arr) = {len(arr)}')
print(f'len(arr)[0] = {len(arr[0])}')
print(f'len(arr)[0][0] = {len(arr[0][0])}')
print(f'len(summed) = {len(summed)}')
print(f'len(summed[0] = {len(summed[0])}')


len(arr) = 3
len(arr)[0] = 2
len(arr)[0][0] = 4
len(summed) = 2
len(summed[0] = 4


In [131]:
np.sum([[0,1,0,0], [0,0,0,0]], axis=0)

array([0, 1, 0, 0])