In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import warnings
warnings.simplefilter("ignore")
import numpy as np
import pandas as pd

In [2]:
X = np.load('X_train_many_features.npy')
y = np.load('y_train_many_features.npy')

In [3]:
# Load and split the data
#iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [4]:
# Construct some pipelines
pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LogisticRegression(random_state=42))])

pipe_lr_pca = Pipeline([('scl', StandardScaler()),
('pca', PCA(n_components=2)),
('clf', LogisticRegression(random_state=42))])

pipe_rf = Pipeline([('scl', StandardScaler()),
('clf', RandomForestClassifier(random_state=42))])

pipe_rf_pca = Pipeline([('scl', StandardScaler()),
('pca', PCA(n_components=2)),
('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('scl', StandardScaler()),
('clf', svm.SVC(random_state=42))])

pipe_svm_pca = Pipeline([('scl', StandardScaler()),
('pca', PCA(n_components=2)),
('clf', svm.SVC(random_state=42))])

In [5]:
# Set grid search params
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]

In [6]:
grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
		'clf__C': param_range_fl,
		'clf__solver': ['liblinear']}] 

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
		'clf__min_samples_leaf': param_range,
		'clf__max_depth': param_range,
		'clf__min_samples_split': param_range[1:]}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
		'clf__C': param_range}]

# Construct grid searches
jobs = -1

gs_lr = GridSearchCV(estimator=pipe_lr,
			param_grid=grid_params_lr,
			scoring='accuracy',
			cv=10) 
			
gs_lr_pca = GridSearchCV(estimator=pipe_lr_pca,
			param_grid=grid_params_lr,
			scoring='accuracy',
			cv=10)
			
gs_rf = GridSearchCV(estimator=pipe_rf,
			param_grid=grid_params_rf,
			scoring='accuracy',
			cv=10, 
			n_jobs=jobs)

gs_rf_pca = GridSearchCV(estimator=pipe_rf_pca,
			param_grid=grid_params_rf,
			scoring='accuracy',
			cv=10, 
			n_jobs=jobs)

gs_svm = GridSearchCV(estimator=pipe_svm,
			param_grid=grid_params_svm,
			scoring='accuracy',
			cv=10,
			n_jobs=jobs)

gs_svm_pca = GridSearchCV(estimator=pipe_svm_pca,
			param_grid=grid_params_svm,
			scoring='accuracy',
			cv=10,
			n_jobs=jobs)

In [7]:
# List of pipelines for ease of iteration
grids = [gs_lr, gs_lr_pca, gs_rf, gs_rf_pca, gs_svm, gs_svm_pca]

In [8]:
# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic Regression', 1: 'Logistic Regression w/PCA', 
		2: 'Random Forest', 3: 'Random Forest w/PCA', 
		4: 'Support Vector Machine', 5: 'Support Vector Machine w/PCA'}

In [10]:
from sklearn.metrics import classification_report
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])	
    # Fit grid search	
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        print(classification_report(y_test, y_pred))
        best_gs = gs
        best_clf = idx
    
print('\nClassifier with best test set accuracy: %s' % (grid_dict[best_clf]))
Metric_df = pd.DataFrame(grid_dict[best_clf])

Performing model optimizations...

Estimator: Logistic Regression
Best params: {'clf__penalty': 'l1', 'clf__C': 1.0, 'clf__solver': 'liblinear'}
Best training accuracy: 0.689
Test set accuracy score for best params: 0.688 
              precision    recall  f1-score   support

       blues       0.57      0.63      0.60       139
   classical       0.90      0.96      0.93       142
     country       0.73      0.65      0.68       156
       disco       0.53      0.51      0.52       137
      hiphop       0.65      0.73      0.69       137
        jazz       0.72      0.72      0.72       126
       metal       0.76      0.88      0.81       126
         pop       0.81      0.83      0.82       142
      reggae       0.65      0.56      0.60       153
        rock       0.52      0.44      0.48       144

   micro avg       0.69      0.69      0.69      1402
   macro avg       0.68      0.69      0.69      1402
weighted avg       0.68      0.69      0.68      1402


Estimator: Logist

ValueError: DataFrame constructor not properly called!

In [None]:
# Save best grid search pipeline to file
dump_file = 'best_gs_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

In [None]:
from sklearn.externals import joblib
loaded_model = joblib.load('best_gs_pipeline.pkl')
result = loaded_model.score(X_test,y_test)
from yellowbrick.classifier import ClassificationReport
viz = ClassificationReport(loaded_model)

In [None]:
from yellowbrick.classifier import ConfusionMatrix

In [None]:
viz = ConfusionMatrix(loaded_model, classes=['blues','classical','country','disco','hiphop','jazz','metal','pop','reggae','rock'])

In [None]:
viz.fit(X_train, y_train,cmp='YlGnBu')

In [None]:
viz.score(X_test, y_test)

In [None]:
g = viz.poof() 

In [None]:
!pip install --user yellowbrick


In [None]:
from yellowbrick.classifier import ClassificationReport

In [None]:
import pickle

In [None]:
loaded_model = pickle.load(open('/home/mahidharv/Music/best_gs_pipeline.pkl','rb'))