In [43]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from keras import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

In [44]:
from datetime import datetime

def log(message):
    print(datetime.now().strftime("%H:%M:%S -"), message)
    
def printnow():
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

In [45]:
class SectionSplitClassifier:
    def __init__(self, section_classifiers, weights):
        if section_classifiers:
            try: self.section_classifiers = {key:value for key,value in section_classifiers.items()}
            except AttributeError: raise ValueError('section_classifiers was not a dictionary')
        else: raise ValueError('section_classifiers was None')
        if weights:
            if np.sum([weight for weight in weights]) == 1:
                try: self.weights = {key:value for key,value in weights.items()}
                except AttributeError: raise ValueError('weights was not a dictionary')
            else: raise ValueError('weights did not sum to 1')
        else: raise ValueError('weights was None')


    def set_section_classifiers(self, section_classifiers):
        if section_classifiers:
            try: self.section_classifiers = {key:value for key,value in section_classifiers.items()}
            except AttributeError: raise ValueError('section_classifiers was not a dictionary')
        else: raise ValueError('section_classifiers was None')


    def set_weights(self, weights):
        if weights:
            if np.sum([weight for weight in weights]) == 1:
                try: self.weights = {key:value for key,value in weights.items()}
                except AttributeError: raise ValueError('weights was not a dictionary')
            else: raise ValueError('weights did not sum to 1')
        else: raise ValueError('weights was None')


    def fit(self, X, y, section, verbose=0):
        if verbose: print(f'Training {section}...')
        self.section_classifiers[section] = section_classifiers[section].fit(X,y)
        if verbose: print(f'Done training {section}')


    def predict(self, X, verbose=1):
        def str_to_array(string, weight):
            return np.array([
                int(string == 'country'),
                int(string == 'hiphop'),
                int(string == 'pop'),
                int(string == 'rock')]) * weight

        def array_to_str(array):
            i = array.index(max(array))
            return ['country','hiphop','pop','rock'][i]

        def predict_section(self, X, section):
            return [str_to_array(pred, self.weights[section]) for pred in self.section_classifiers[section].predict(X)]

        weighted_preds = [predict_section(self, X, section) for section in section_classifiers.keys()]
        summed_preds = np.sum(weighted_preds, axis=0)
        assert len(weighted_preds) == len(section_classifiers)
        assert len(weighted_preds[0]) == len(summed_preds)
        assert len(weighted_preds[0][0]) == len(summed_preds[0])
        return summed_preds



In [42]:
arr = [[[0, 1, 2, 3], [4, 5, 6, 7]],
       [[8, 9, 10, 11], [12, 13, 14, 15]],
       [[16, 17, 18, 19], [20, 21, 22, 23]]]

summed = np.sum(arr, axis=0)

print(f'len(arr) = {len(arr)}')
print(f'len(arr)[0] = {len(arr[0])}')
print(f'len(arr)[0][0] = {len(arr[0][0])}')
print(f'len(summed) = {len(summed)}')
print(f'len(summed[0] = {len(summed[0])}')


len(arr) = 3
len(arr)[0] = 2
len(arr)[0][0] = 4
len(summed) = 2
len(summed[0] = 4


In [16]:
tuples = [x for x in zip(['country', 'rock', 'hiphop'], ['pop', 'country', 'hiphop'], ['pop', 'country', 'hiphop'])]
print(tuples)
print([x for x in list([1,2,1,3,4,5]).count])

[('country', 'pop', 'pop'), ('rock', 'country', 'country'), ('hiphop', 'hiphop', 'hiphop')]


TypeError: 'builtin_function_or_method' object is not iterable

In [20]:
param_grid = {
    'ccp_alpha':[0.0045, 0.005, 0.0055]
}
classifier = GridSearchCV(DecisionTreeClassifier(criterion='entropy'), param_grid, cv=5)
printnow()
classifier.fit(tfidf_ngram_train.drop(columns=["y"]), tfidf_ngram_train["y"])
printnow()

print(f'Best parameters: {classifier.best_params_}')
print(f'Training set score: {classifier.score(tfidf_ngram_train.drop(columns=["y"]), tfidf_ngram_train["y"])}')
print(f'Test set score: {classifier.score(tfidf_ngram_test.drop(columns=["y"]), tfidf_ngram_test["y"])}')
print(classification_report(tfidf_ngram_test["y"], classifier.predict(tfidf_ngram_test.drop(columns=["y"]))))
print('-' * 20)
print(confusion_matrix(tfidf_ngram_test["y"], classifier.predict(tfidf_ngram_test.drop(columns=["y"]))))

Current Time = 16:30:11
Current Time = 16:31:52
Best parameters: {'ccp_alpha': 0.005}
Training set score: 0.5645325203252033
Test set score: 0.4558375634517767
              precision    recall  f1-score   support

     country       0.64      0.67      0.66       199
      hiphop       0.37      0.64      0.47       255
         pop       0.53      0.51      0.52       239
        rock       0.28      0.10      0.15       292

    accuracy                           0.46       985
   macro avg       0.46      0.48      0.45       985
weighted avg       0.44      0.46      0.43       985

--------------------
[[134  23  38   4]
 [  7 163  29  56]
 [ 64  36 122  17]
 [  3 217  42  30]]


In [21]:
param_grid = {
    'ccp_alpha':[0.0046, 0.00475, 0.0049]
}
classifier = GridSearchCV(ExtraTreesClassifier(criterion='entropy'), param_grid, cv=5)
printnow()
classifier.fit(tfidf_ngram_train.drop(columns=["y"]), tfidf_ngram_train["y"])
printnow()

print(f'Best parameters: {classifier.best_params_}')
print(f'Training set score: {classifier.score(tfidf_ngram_train.drop(columns=["y"]), tfidf_ngram_train["y"])}')
print(f'Test set score: {classifier.score(tfidf_ngram_test.drop(columns=["y"]), tfidf_ngram_test["y"])}')
print(classification_report(tfidf_ngram_test["y"], classifier.predict(tfidf_ngram_test.drop(columns=["y"]))))
print('-' * 20)
print(confusion_matrix(tfidf_ngram_test["y"], classifier.predict(tfidf_ngram_test.drop(columns=["y"]))))

Current Time = 16:31:52
Current Time = 16:33:39
Best parameters: {'ccp_alpha': 0.00475}
Training set score: 0.5965447154471545
Test set score: 0.49746192893401014
              precision    recall  f1-score   support

     country       0.70      0.76      0.73       199
      hiphop       0.38      0.75      0.50       255
         pop       0.67      0.62      0.64       239
        rock       0.00      0.00      0.00       292

    accuracy                           0.50       985
   macro avg       0.44      0.53      0.47       985
weighted avg       0.40      0.50      0.43       985

--------------------
[[152  10  37   0]
 [  5 190  18  42]
 [ 55  36 148   0]
 [  4 269  19   0]]
