In [31]:
## Welcome to Week 5 ##
# Today we are going to try some validation

##Imports##
__author__ = 'bdyetton'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [32]:
#Load and prepare data
all_charts = pd.read_csv('BillboardLyricData.txt', sep='\t', encoding='utf-8')
all_charts = all_charts.dropna() ## Remove missing data

class_mapping = {label:idx for idx,label in enumerate(np.unique(all_charts.chart))}
y_raw = all_charts.chart.map(class_mapping)

vectorizer = CountVectorizer(max_df=0.95, min_df=2,max_features=1000,stop_words='english')                              
vectorizer.fit(all_charts.lyrics) #Creates the dictionary to convert lyrics to counts
word_vector = vectorizer.transform(all_charts.lyrics) #Does actual conversion
X_raw = word_vector.todense()

In [33]:
#Split of test set, and set aside (dont touch until after we have found the best hyperparameters)
X_train_and_valid, X_test, y_train_and_valid, y_test = train_test_split(X_raw, y_raw, test_size=0.3)

In [34]:
#Split of validation and train set from the remaining
X_train, X_valid, y_train, y_valid = train_test_split(X_train_and_valid, y_train_and_valid, test_size=0.3)

In [35]:
# As a teaching excercise, we will optimize each hyperparameter in turn (this is not guarenteed to return the best hyperparams).
# To keep things simple we will just optimize over 2 hyperparameters, n_iter, and L1 regularization

# YOUR TASK: K-fold cross validate (w. 5 folds) this whole block to find the best hyperparameters 

#Training and Test wrt. n_iterations:
n_iters = np.logspace(1,3,num=3)
train_acc = []
test_acc = []
for n_iter in n_iters:
    # sklearn implementation
    model_2 = SGDClassifier(loss='log', n_iter=n_iter, penalty='none')
    model_2.fit(X_train, y_train)
    train_acc.append(model_2.score(X_train, y_train))
    test_acc.append(model_2.score(X_test, y_test))
    
best_n_iters = n_iters[test_acc.index(max(test_acc))]
print('best n_iters:',best_n_iters)

#Training and Test wrt. Learning Rate:
#Alpha is how much l1/l2 regularization penalty we want to apply
alphas = np.logspace(-5, -3 , num=3)
train_acc = []
test_acc = []
for alpha in alphas:
    # sklearn implementation
    model_2 = SGDClassifier(loss='log', n_iter=best_n_iters, penalty='l1',alpha=alpha)
    model_2.fit(X_train, y_train)
    train_acc.append(model_2.score(X_train, y_train))
    test_acc.append(model_2.score(X_test, y_test))
    
best_alpha = alphas[test_acc.index(max(test_acc))]
print('best alpha:',best_alpha)

best n_iters: 1000.0
best alpha: 0.01


In [36]:
# Now that we have our best hyperparameters, lets train those on the train+validation dataset, and get an accuracy on the test...
model_2 = SGDClassifier(loss='log', n_iter=best_n_iters, penalty='l1',alpha=best_alpha)
model_2.fit(X_train_and_valid, y_train_and_valid)
print('Final Train acc that you would quote in a paper:',model_2.score(X_train_and_valid, y_train_and_valid))
print('Final Test acc that you would quote in a paper:',model_2.score(X_test, y_test))

Final Train acc that you would quote in a paper: 0.473162675475
Final Test acc that you would quote in a paper: 0.394990366089


In [None]:
# Ok, so the above is there to teach you the basics behind tuning hyper parameters, 
# however, these parameters are not independent, and there is some distribution over them that gives the best performance
# We need to optimize over all the possible combinations of hyper parameters. 
# Because the parameter space is relativly small (3 n_iters, 3 alphas, 3*3=9 possible combinations), 
# we can use a brute force method to search over all call gridsearch (this still takes a long long time!).

param_grid = [{'alpha': alphas, 'n_iter': n_iters,}] #Define the parameters to search over, all possible combinations of params will be tested
log_regression_classifier = SGDClassifier(loss='log', penalty='l1') #Use a log regression classifier with l1 regularization
hyper_param_grid_searcher = GridSearchCV(log_regression_classifier, param_grid,cv=3) #define grid search object (w. 3 cross folds to speed things up)
hyper_param_grid_searcher.fit(X_train_and_valid,y_train_and_valid) #perform search over parameters
#print('Best Fitting Params:',hyper_param_grid_searcher.get_params())
print('Final Train acc that you would quote in a paper:',hyper_param_grid_searcher.score(X_train_and_valid, y_train_and_valid))
print('Final Test acc that you would quote in a paper:',hyper_param_grid_searcher.score(X_test, y_test))
