# Load in extracted features

In [4]:
# open the every file in a directory and print the first line of each file
import os
import sys
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from scipy.stats import randint


In [5]:
# load in the features
features_train = np.array([])

for file in os.listdir('features_train'):
    with open('features_train/' + file) as f:
        df = pd.read_csv(f)
        features_train = np.column_stack((features_train, df.values)) if features_train.size else df.values

# load the labels
train = pd.read_csv('train.csv')
y_train = train['Genre']
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
# normalize the data
features_train = preprocessing.scale(features_train)
features_used = 65
# take some of the features (used for testing purposes)
features_train = features_train[:, :features_used]
print(features_train.shape)


(800, 65)


# Load in test Data


In [6]:
features_test = np.array([])
# load in the test features
for file in os.listdir('features_test'):
    with open('features_test/' + file) as f:
        df = pd.read_csv(f)
        features_test = np.column_stack((features_test, df.values)) if features_test.size else df.values

# scale the test data
features_test = preprocessing.scale(features_test)

# 
features_test = features_test[:, :features_used]


# Logistic Regression

In [7]:
logreg = LogisticRegression(max_iter=300)
logreg.fit(features_train, y_train)
predictions = logreg.predict(features_train)
# Calculate the accuracy
accuracy = accuracy_score(y_train, predictions)
print("Logistic Regression accuracy:", accuracy)

Logistic Regression accuracy: 0.885


# Predict with Logistic Regression and save to CSV

In [3]:

# run the model on the test data
predictions = logreg.predict(features_test)
predictions = label_encoder.inverse_transform(predictions)
# save the predictions to a csv file
df = pd.DataFrame({'ID': os.listdir('test'), 'genre': predictions})
df.to_csv('predictionsLR.csv', index=False)

NameError: name 'logreg' is not defined

# Cross-Validation for Logistic Regression

In [None]:
# Cross-Validation using the logreg
logreg = LogisticRegression(max_iter=300, penalty='l2', C = 1)
cv_results = cross_validate(logreg, features_train, y_train, cv=10) 
print(cv_results['test_score'], np.mean(cv_results['test_score']))

NameError: name 'LogisticRegression' is not defined

# Neural Network

In [None]:
model = Sequential()
model.add(Dense(80, activation='relu', input_shape = (features_used,), kernel_regularizer='l2', bias_regularizer='l2', activity_regularizer='l2'))

model.add(Dense(256, activation='relu'))

model.add(Dense(128, activation='relu'))

model.add(Dense(60, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(features_train, y_train, epochs=  100, batch_size=28, validation_split=0.2, verbose=2, shuffle=True, steps_per_epoch=10, validation_steps=10, validation_batch_size=28, validation_freq=1)

loss, accuracy = model.evaluate(features_train, y_train)
print('Neural Network accuracy:', accuracy)

Epoch 1/100
10/10 - 1s - loss: 4.6206 - accuracy: 0.1607 - val_loss: 3.8211 - val_accuracy: 0.3500 - 625ms/epoch - 63ms/step
Epoch 2/100
10/10 - 0s - loss: 3.4798 - accuracy: 0.3179 - 31ms/epoch - 3ms/step
Epoch 3/100
10/10 - 0s - loss: 2.5695 - accuracy: 0.4783 - 32ms/epoch - 3ms/step
Epoch 4/100
10/10 - 0s - loss: 2.3956 - accuracy: 0.4857 - 31ms/epoch - 3ms/step
Epoch 5/100
10/10 - 0s - loss: 2.1019 - accuracy: 0.6051 - 30ms/epoch - 3ms/step
Epoch 6/100
10/10 - 0s - loss: 1.9869 - accuracy: 0.5929 - 31ms/epoch - 3ms/step
Epoch 7/100
10/10 - 0s - loss: 1.8549 - accuracy: 0.6196 - 33ms/epoch - 3ms/step
Epoch 8/100
10/10 - 0s - loss: 1.6663 - accuracy: 0.6964 - 36ms/epoch - 4ms/step
Epoch 9/100
10/10 - 0s - loss: 1.6500 - accuracy: 0.6929 - 32ms/epoch - 3ms/step
Epoch 10/100
10/10 - 0s - loss: 1.5225 - accuracy: 0.7210 - 33ms/epoch - 3ms/step
Epoch 11/100
10/10 - 0s - loss: 1.4217 - accuracy: 0.7571 - 31ms/epoch - 3ms/step
Epoch 12/100
10/10 - 0s - loss: 1.3498 - accuracy: 0.7717 - 29m

# Predict with Neural Network and save to CSV

In [None]:
predictions = model.predict(features_test)  
predictions = np.argmax(predictions, axis=1)
predictions = label_encoder.inverse_transform(predictions)
# print(predictions)


df = pd.DataFrame({'ID': os.listdir('test'), 'genre': predictions})
df.to_csv('predictionsNN.csv', index=False)



# Cross-Validation for Neural-Network

In [None]:
kfold = KFold(n_splits = 10)
validation = []
for train, test in kfold.split(features_train, y_train):
    model = Sequential()
    model.add(Dense(80, activation='relu', input_shape = (features_used,), kernel_regularizer='l2', bias_regularizer='l2', activity_regularizer='l2'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(60, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(features_train[train], y_train[train], epochs=25, batch_size=28, validation_split=0.2, verbose=0, shuffle=True, steps_per_epoch=10, validation_steps=10, validation_batch_size=28, validation_freq=1)
    score = model.evaluate(features_train[test], y_train[test], verbose=0)
    validation.append(score[1])
    print(score[1])
print(np.mean(validation))


0.675000011920929
0.7124999761581421
0.6625000238418579
0.737500011920929
0.675000011920929
0.75
0.625
0.7250000238418579
0.7124999761581421
0.7124999761581421
0.6987500011920929


# Random Forest

In [None]:
# use RandomForestClassifier and RandomizedSearchCV

# initialize the parameters which we want to optimize
params = {'n_estimators': randint(50, 200), 'max_depth': randint(1, 20)}
# print(randint(50, 200))
# Initialize Random Forest Classifier
rf = RandomForestClassifier()
# Initialize randomized search CV, which finds the best values for the given inputs
param_search = RandomizedSearchCV(rf, params)
param_search.fit(features_train, y_train)
best = param_search.best_estimator_
best_params = param_search.best_params_
# print(best_params)
# print(param_search.cv_results_)
# Get the best parameter values
max_depth = best_params['max_depth']
n_estimators = best_params['n_estimators']
# Create a random forest using these inputs
rf_best = RandomForestClassifier(n_estimators, max_depth=max_depth)
rf_best.fit(features_train, y_train)
# Predict using this classifier and then check our accuracy
predictions = rf_best.predict(features_train)
# Calculate the accuracy
accuracy = accuracy_score(y_train, predictions)
print(accuracy)

0.99875


# Random Forest Cross-Validation

In [None]:
# Random Forest Cross-Validation
rf = RandomForestClassifier(n_estimators, max_depth=max_depth)
cv_results = cross_validate(rf, features_train, y_train, cv=10) 
print(cv_results['test_score'], np.mean(cv_results['test_score']))


[0.6875 0.5625 0.7375 0.75   0.75   0.7375 0.7    0.675  0.725  0.75  ] 0.7074999999999999


In [None]:
predictions = rf_best.predict(features_test)
predictions = label_encoder.inverse_transform(predictions)
# save the predictions to a csv file
df = pd.DataFrame({'ID': os.listdir('test'), 'genre': predictions})
df.to_csv('predictionsRF.csv', index=False)