# MLPredictorNotebook

### Dependencies

In [None]:
import pandas as pd
from modules.one_hot import one_hot
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors

# Let pandas display up to 100 columns so wide feature dataframes are not truncated.
pd.set_option('display.max_columns', 100)

### Load Raw Genres Data

In [None]:
# CSV file of every genre, in the order the dataframes are unpacked below
genre_csv_paths = [
    'GenresDatasetGenerator/Genres/HipHopGenre_AllMods.csv',
    'GenresDatasetGenerator/Genres/RockGenre_AllMods.csv',
    'GenresDatasetGenerator/Genres/ClassicalGenre_AllMods.csv',
    'GenresDatasetGenerator/Genres/ElectronicGenre_AllMods.csv',
    'GenresDatasetGenerator/Genres/AlternativeGenre_AllMods.csv',
    'GenresDatasetGenerator/Genres/PopGenre_AllMods.csv',
]

# Genres Dataframes (one per CSV file)
hiphop, rock, classical, electronic, alternative, pop = (
    pd.read_csv(csv_path) for csv_path in genre_csv_paths
)

# List of all genre dataframes, passed to prepare_data()
genres = [hiphop, rock, classical, electronic, alternative, pop]

def prepare_data(genres):
    """
    prepare_data() builds the model inputs from the per-genre dataframes.
    
    Args:
    - genres: a list of dataframes with features for every genre; once
      concatenated they must contain a 'Track URI' column, a 'genres'
      column and every feature column listed below
    
    Returns:
    - samples: an array with input samples to a model
    - target: an array with the integer-encoded genre of every sample
    - nn_dataframe: input/output dataframe (feature columns plus 'genres')
    - target_dict: genre label -> encoded value dictionary, sorted by label
    """
    
    # Columns passed to the model as they are.
    # The two *_ratio columns are presumably only present in the "CF" variant
    # of the dataset; drop them from this list when they are not available.
    passthrough_cols = ['danceability', 'energy', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'danceability_energy_ratio', 'danceability_valence_ratio']
    
    # Columns that are min-max scaled before being passed to the model
    scaled_cols = ['loudness', 'tempo', 'duration_ms',
                   'key', 'mode', 'time_signature']
    
    # Stack all genres; moving 'Track URI' into the index takes it out of the columns
    combined = pd.concat(genres, ignore_index=True).set_index('Track URI')
    
    # Scaled features, rebuilt as a dataframe with a positional index
    scaled_values = MinMaxScaler().fit_transform(combined[scaled_cols].values)
    scaled_part = pd.DataFrame(scaled_values, columns=scaled_cols)
    
    # Positional index here as well, so both parts line up row by row
    passthrough_part = combined[passthrough_cols].reset_index(drop=True)
    
    # Input to a model
    samples_df = pd.concat([passthrough_part, scaled_part], axis=1)
    samples = samples_df.values
    
    # Target output from a model: genre labels encoded as integers
    label_encoder = LabelEncoder()
    target = label_encoder.fit_transform(combined['genres'].values)
    target_df = pd.DataFrame(target, columns=['genres'])
    
    # Target dictionary: decode every target back to its label, pair the two
    # and keep one entry per label, ordered by label
    genre_names = label_encoder.inverse_transform(target)
    target_dict = dict(sorted(dict(zip(genre_names, target)).items()))
    
    # Dataframe representation of inputs and target side by side
    nn_dataframe = pd.concat([samples_df, target_df], axis=1)
    
    return samples, target, nn_dataframe, target_dict

# Call the function and assign the returned values:
# feature matrix, encoded targets, combined dataframe and label -> code dictionary
samples, target, nn_dataframe, target_dict = prepare_data(genres)

# ML Algorithms


In [None]:
#Split dataset: 80% of the samples for training, 20% held out for testing.
#A fixed seed makes the split identical on every Restart & Run All, so the
#scores printed by the classifiers below can be compared between runs.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(samples, target,
                                                    test_size = 0.2,
                                                    random_state = SPLIT_SEED)

## (1) Decision Trees

### GridSearchCV

In [None]:
def DecisionTreeGridSearch(X_train, y_train):
    """
    DecisionTreeGridSearch() runs a cross-validated grid search (cv = 5) over
    decision tree hyper-parameters and prints the best combination found.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    
    Returns:
    - None, the best parameters are only printed
    """
    #Candidate values for every tuned hyper-parameter
    param_grid = dict(criterion = ['gini', 'entropy'],
                      max_depth = [None, 2, 4, 6, 8],
                      max_features = [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8],
                      splitter = ['best', 'random'])
    #Exhaustive search, evaluated with up to 5 parallel jobs
    search = GridSearchCV(tree.DecisionTreeClassifier(),
                          param_grid,
                          cv = 5,
                          n_jobs = 5,
                          verbose = 1)
    search.fit(X_train, y_train)
    print(search.best_params_)

#Search decision tree hyper-parameters on the training split (prints the best parameters)
DecisionTreeGridSearch(X_train, y_train)

### Decision Tree Classifier

In [None]:
def DecisionTreeClassifier(X_train, y_train, X_test, y_test, random_state = None):
    """
    DecisionTreeClassifier() fits a decision tree with fixed hyper-parameters,
    prints its accuracy on the test data and plots the confusion matrix.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    - X_test: test samples
    - y_test: test targets
    - random_state: optional seed forwarded to the estimator; the default
      (None) keeps the estimator's unseeded behaviour
    
    Returns:
    - None, the accuracy is printed and the confusion matrix is plotted
    """
    #DecisionTreeClassifier
    clf = tree.DecisionTreeClassifier(criterion = 'gini', 
                                      max_depth = 8, 
                                      max_features = 0.6, 
                                      splitter = 'best',
                                      random_state = random_state)
    clf = clf.fit(X_train, y_train)
    #Prediction
    predictions = clf.predict(X_test)
    #Prediction accuracy
    print(accuracy_score(y_test, predictions))
    #Confusion matrix
    #labels is pinned to clf.classes_ so the matrix always has one row/column
    #per display label, even if a class is missing from y_test and predictions
    confusion_matrix_result = confusion_matrix(y_test, predictions,
                                               labels = clf.classes_)
    cm_vis = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_result, 
                                    display_labels = clf.classes_)
    cm_vis.plot()

In [None]:
#Train and evaluate the decision tree (prints accuracy, plots the confusion matrix)
DecisionTreeClassifier(X_train, y_train, X_test, y_test)

## (2) Random Forests

### GridSearchCV

In [None]:
def RandomForestGridSearch(X_train, y_train):
    """
    RandomForestGridSearch() runs a cross-validated grid search (cv = 5) over
    random forest hyper-parameters and prints the best combination found.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    
    Returns:
    - None, the best parameters are only printed
    """
    #Candidate values for every tuned hyper-parameter
    param_grid = dict(max_depth = [None, 2, 4, 6, 8],
                      max_features = [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8])
    #Exhaustive search, evaluated with up to 5 parallel jobs
    search = GridSearchCV(ensemble.RandomForestClassifier(),
                          param_grid,
                          cv = 5,
                          n_jobs = 5,
                          verbose = 1)
    search.fit(X_train, y_train)
    print(search.best_params_)

#Search random forest hyper-parameters on the training split (prints the best parameters)
RandomForestGridSearch(X_train, y_train)

### Random Forest Classifier

In [None]:
def RandomForestClassifier(X_train, y_train, X_test, y_test, random_state = None):
    """
    RandomForestClassifier() fits a random forest with fixed hyper-parameters,
    prints its accuracy on the test data and plots the confusion matrix.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    - X_test: test samples
    - y_test: test targets
    - random_state: optional seed forwarded to the estimator; the default
      (None) keeps the estimator's unseeded behaviour
    
    Returns:
    - None, the accuracy is printed and the confusion matrix is plotted
    """
    #RandomForestClassifier
    clf = ensemble.RandomForestClassifier(max_depth = 8, 
                                          max_features = 'sqrt',
                                          random_state = random_state)
    clf = clf.fit(X_train, y_train)
    #Prediction
    predictions = clf.predict(X_test)
    #Prediction accuracy
    print(accuracy_score(y_test, predictions))
    #Confusion matrix
    #labels is pinned to clf.classes_ so the matrix always has one row/column
    #per display label, even if a class is missing from y_test and predictions
    confusion_matrix_result = confusion_matrix(y_test, predictions,
                                               labels = clf.classes_)
    cm_vis = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_result, 
                                    display_labels = clf.classes_)
    cm_vis.plot()

In [None]:
#Train and evaluate the random forest (prints accuracy, plots the confusion matrix)
RandomForestClassifier(X_train, y_train, X_test, y_test)

## (3) Stochastic Gradient Descent

### GridSearchCV

In [None]:
def SGDGridSearch(X_train, y_train):
    """
    SGDGridSearch() runs a cross-validated grid search (cv = 5) over
    SGD classifier hyper-parameters and prints the best combination found.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    
    Returns:
    - None, the best parameters are only printed
    """
    #Candidate values for every tuned hyper-parameter
    param_grid = dict(loss = ['hinge', 'log_loss'],
                      penalty = ['l2', 'l1', 'elasticnet', None],
                      max_iter = [100, 1000, 5000])
    #Exhaustive search, evaluated with up to 5 parallel jobs
    search = GridSearchCV(linear_model.SGDClassifier(),
                          param_grid,
                          cv = 5,
                          n_jobs = 5,
                          verbose = 1)
    search.fit(X_train, y_train)
    print(search.best_params_)

#Search SGD classifier hyper-parameters on the training split (prints the best parameters)
SGDGridSearch(X_train, y_train)

### Stochastic Gradient Descent Classifier

In [None]:
def SGDClassifier(X_train, y_train, X_test, y_test, random_state = None):
    """
    SGDClassifier() fits an SGD classifier with fixed hyper-parameters,
    prints its accuracy on the test data and plots the confusion matrix.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    - X_test: test samples
    - y_test: test targets
    - random_state: optional seed forwarded to the estimator; the default
      (None) keeps the estimator's unseeded behaviour
    
    Returns:
    - None, the accuracy is printed and the confusion matrix is plotted
    """
    #SGDClassifier
    clf = linear_model.SGDClassifier(loss = 'log_loss',
                                     max_iter = 1000,
                                     penalty = 'l1',
                                     random_state = random_state)
    clf = clf.fit(X_train, y_train)
    #Prediction
    predictions = clf.predict(X_test)
    #Prediction accuracy
    print(accuracy_score(y_test, predictions))
    #Confusion matrix
    #labels is pinned to clf.classes_ so the matrix always has one row/column
    #per display label, even if a class is missing from y_test and predictions
    confusion_matrix_result = confusion_matrix(y_test, predictions,
                                               labels = clf.classes_)
    cm_vis = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_result, 
                                    display_labels = clf.classes_)
    cm_vis.plot()

In [None]:
#Train and evaluate the SGD classifier (prints accuracy, plots the confusion matrix)
SGDClassifier(X_train, y_train, X_test, y_test)

## (4) K-Nearest Neighbors

### GridSearchCV

In [None]:
def KNNGridSearch(X_train, y_train):
    """
    KNNGridSearch() runs a cross-validated grid search (cv = 5) over
    k-nearest neighbors hyper-parameters and prints the best combination found.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    
    Returns:
    - None, the best parameters are only printed
    """
    #Parameters for GridSearchCV
    #weights = None is not searched: KNeighborsClassifier treats it exactly
    #like 'uniform', so it only repeated every 'uniform' candidate
    parameters = {'n_neighbors': [2, 5, 10, 20],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
    #GridSearchCV
    clf = GridSearchCV(estimator = neighbors.KNeighborsClassifier(),
                       param_grid = parameters,
                       cv = 5,
                       n_jobs = 5,
                       verbose = 1)
    clf = clf.fit(X_train, y_train)
    print(clf.best_params_)

#Search k-nearest neighbors hyper-parameters on the training split (prints the best parameters)
KNNGridSearch(X_train, y_train)

### K-Nearest Neighbors Classifier

In [None]:
def KNNClassifier(X_train, y_train, X_test, y_test):
    """
    KNNClassifier() fits a k-nearest neighbors classifier with fixed
    hyper-parameters, prints its accuracy on the test data and plots the
    confusion matrix.
    
    Args:
    - X_train: training samples
    - y_train: training targets
    - X_test: test samples
    - y_test: test targets
    
    Returns:
    - None, the accuracy is printed and the confusion matrix is plotted
    """
    #KNeighborsClassifier
    clf = neighbors.KNeighborsClassifier(n_neighbors = 20,
                                         algorithm = 'ball_tree',
                                         weights = 'uniform')
    clf = clf.fit(X_train, y_train)
    #Prediction
    predictions = clf.predict(X_test)
    #Prediction accuracy
    print(accuracy_score(y_test, predictions))
    #Confusion matrix
    #labels is pinned to clf.classes_ so the matrix always has one row/column
    #per display label, even if a class is missing from y_test and predictions
    confusion_matrix_result = confusion_matrix(y_test, predictions,
                                               labels = clf.classes_)
    cm_vis = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_result, 
                                    display_labels = clf.classes_)
    cm_vis.plot()

In [None]:
#Train and evaluate the k-nearest neighbors classifier (prints accuracy, plots the confusion matrix)
KNNClassifier(X_train, y_train, X_test, y_test)

# Prediction Results

In [33]:
# Hard-coded scores as fractions: one row per model, one column per dataset variant.
# NOTE(review): presumably the accuracies printed by the classifier cells above
# for each dataset variant - confirm, they are not recomputed here.
data = [
    [0.5728682170542636, 0.7710843373493976, 0.586046511627907, 0.6031007751937985, 0.7335526315789473],
    [0.6426356589147287, 0.8056224899598393, 0.662015503875969, 0.6573643410852713, 0.7796052631578947],
    [0.6116279069767442, 0.7542168674698795, 0.6209302325581395, 0.5906976744186047, 0.7006578947368421],
    [0.5736434108527132, 0.742570281124498, 0.5891472868217055, 0.610077519379845, 0.7072368421052632],
]
index = ['Decision Trees', 'Random Forests', 'SGD', 'K-nearest Neighbors']
columns = ['Raw Data', 'Raw Data + PMN', 'Raw Data + CF', 'Raw Data + Trim', 'Raw Data + All Mods']

# Scale the fractions to percentages for display
results = pd.DataFrame(data, index, columns).mul(100)
results

Unnamed: 0,Raw Data,Raw Data + PMN,Raw Data + CF,Raw Data + Trim,Raw Data + All Mods
Decision Trees,57.286822,77.108434,58.604651,60.310078,73.355263
Random Forests,64.263566,80.562249,66.20155,65.736434,77.960526
SGD,61.162791,75.421687,62.093023,59.069767,70.065789
K-nearest Neighbors,57.364341,74.257028,58.914729,61.007752,70.723684
