# Loads features and finds the best hyperparameters for an XGBoost classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline
import xgboost as xgb
import time
from sklearn.externals import joblib
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Keeps the Jupyter kernel from dying while the models are trained.
# NOTE(review): presumably this silences the OpenMP "duplicate runtime
# library" abort that can occur when several packages each load their own
# OpenMP copy -- confirm it is still needed in the target environment.
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"


# Experiment configuration.
# `tcontext` is used to build the feature-file name and the output folders;
# `k_features` lists the numbers of best features tried by the search.
tcontext = 120
k_features = [5, 10, 20, 40, 80, 84]

# Best combination found so far (overwritten during the search below).
best_accuracy = 0
best_lr = 0.01
best_k = k_features[0]

# Init plot: a single axes that receives one accuracy curve per value of k.
fig = plt.figure()
ax = fig.add_subplot(111)

# Load features and labels
# The .npz archive stores one 2-D array under the key 'a'. Its last column
# is used as the class label; every other column is a feature.
# The archive is opened in a `with` block so the underlying file is closed.
with np.load('features_tcontext_' + str(tcontext) + '_frameSize_1024.npz') as archive:
    data = archive['a']

# Drop the two constant columns (original indices 60 and 61).
data = np.delete(data, [60, 61], axis=1)

# Read the labels BEFORE perturbing the features: adding eps to the label
# column and then truncating with astype(int) would turn e.g. -1 into 0.
labels = data[:, -1].astype(int)

# Copy so the in-place offset below does not write into `data`.
features = data[:, :-1].copy()
features += np.finfo(np.float32).eps

# features_length.txt: one "<feature name>: <number of values>" entry per line.
with open("features_length.txt", 'r') as f:
    features_length = [line.rstrip('\n') for line in f]

# Expand every entry into one column name per value: name_0, name_1, ...
features_names = []
for entry in features_length:
    if not entry.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
    f_name, separator, count = entry.partition(':')
    if not separator:
        raise ValueError("Malformed line in features_length.txt: " + repr(entry))
    # int() ignores surrounding whitespace, so "name: 3" and "name:3" both work.
    features_names.extend(f_name + '_' + str(j) for j in range(int(count)))
del features_names[60:62] # deletes the feature names which are constant

# Fail early, with a clear message, if the names and the matrix disagree
# (building the DataFrames later would fail on the same mismatch).
if len(features_names) != features.shape[1]:
    raise ValueError('Got ' + str(len(features_names)) + ' feature names for '
                     + str(features.shape[1]) + ' feature columns')


# Prepare data
# Encode the class labels as consecutive integers 0..n_classes-1.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(labels)

# Split data (80% train / 20% validation, fixed seed for reproducibility).
# The split is done BEFORE imputation so that the imputer statistics are
# computed from training rows only; fitting it on the full matrix would leak
# information from the validation set into the training data.
features_train, features_val, labels_train, labels_val = train_test_split(
    features, labels, test_size=0.2, random_state=42)

# Fill missing values with the imputer's default strategy, learned on train.
# NOTE: `features` itself is left un-imputed; only the two splits are filled.
imputer = SimpleImputer()
features_train = imputer.fit_transform(features_train)
features_val = imputer.transform(features_val)

# Labelled copies of the splits, used for the named feature selection below.
features_train_df = pd.DataFrame(data=features_train, columns=features_names)
features_val_df = pd.DataFrame(data=features_val, columns=features_names)
    
# Grid search: one pipeline (scaling -> k best features -> XGBoost) is trained
# for every (k, learning rate) pair. Each model and its validation report are
# written to disk, and one accuracy curve per k is added to the plot.
learning_rates = [0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 1]

# Create the output folders up front so a long run cannot fail on a missing
# directory after the first model has already been trained.
models_dir = './tcontext_' + str(tcontext) + '/models/xgb'
results_dir = './tcontext_' + str(tcontext) + '/results/xgb'
os.makedirs(models_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

for k in k_features:

    # Select best k features.
    # This stand-alone selector is only used to recover the NAMES of the
    # selected columns for the report; the pipeline below fits its own filter.
    # NOTE(review): it is fitted on unscaled data while the pipeline's filter
    # sees standardized data -- the ranking is presumably the same; confirm.
    selector = SelectKBest(f_classif, k=k)
    selector.fit(features_train_df, labels_train)
    # Get columns to keep
    cols = [features_names[i] for i in selector.get_support(indices=True)]

    accuracies = []
    for lr in learning_rates:
        # The timer covers training, validation prediction and model saving.
        start = time.time()
        # Parameters
        estimators = [("scale", preprocessing.StandardScaler()),
                      ('anova_filter', SelectKBest(f_classif, k=k)),
                      ('xgb', xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=lr))]

        clf_xgb = Pipeline(estimators)
        # Training
        clf_xgb.fit(features_train, labels_train)

        # Validation
        pred_xgb = clf_xgb.predict(features_val)

        # Save model (the file handle is closed as soon as the dump is done)
        model_filename = models_dir + '/xgb_learningRate_' + str(lr) + '_kBestFeatures_' + str(k) + '.sav'
        with open(model_filename, 'wb') as model_file:
            joblib.dump(clf_xgb, model_file)

        end = time.time()
        elapsed_time = end - start

        # Compute the validation accuracy once and reuse it everywhere below.
        accuracy = accuracy_score(labels_val, pred_xgb)

        # Save results in a .txt file
        results_filename = results_dir + '/results_learningRate_' + str(lr) + '_kBestFeatures_' + str(k) + '.txt'
        report = ''.join([
            'Model trained in: ' + str(int(elapsed_time/60)) + ' minutes and ' + str(round(elapsed_time%60, 2)) + ' seconds \n\n',
            'K Best Features using f_classif: ' + str(k) + ' and a learning rate of: ' + str(lr) + '\n\n',
            'Best features: ' + str(cols) + '\n\n',
            'Confusion matrix: \n\n',
            str(confusion_matrix(labels_val, pred_xgb)),
            '\n\n',
            'Classification report: \n\n',
            str(classification_report(labels_val, pred_xgb)),
            '\n\n',
            'Accuracy: ' + str(accuracy),
        ])
        with open(results_filename, 'w') as results_file:
            results_file.write(report)

        # Update best_accuracy if improved
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
            best_lr = lr
        accuracies.append(accuracy)
        print('learning rate = ' + str(lr) + ' and ' + str(k) + '_best_features: accuracy =  ' + str(accuracy))

    # One curve per k; drawn on `ax` explicitly instead of the implicit
    # "current axes" so the curve always lands on the intended plot.
    ax.plot(learning_rates, accuracies, label=str(k)+ '-best features')

# Plot results: label the axes, add grid and legend, then write the figure
# next to the per-model result files.
ax.set_xlabel('learning rate')
ax.set_ylabel('accuracy')
ax.grid(True)
# Anchor the legend at the upper-right corner of the axes.
legend = ax.legend(bbox_to_anchor=(1, 1))
accuracies_png = './tcontext_'+ str(tcontext) +'/results/xgb/accuracies.png'
fig.savefig(accuracies_png, bbox_inches='tight')



In [None]:
# Save best results
# Build the summary once so the console output and the file cannot drift
# apart (this also fixes the "learnintg" typo in the printed message).
best_summary = ('learning rate = ' + str(best_lr) + ' and ' + str(best_k)
                + '_best_features presented the best accuracy: ' + str(best_accuracy))
print('\n' + best_summary)

# The `with` block closes the file even if the write fails.
with open('./tcontext_'+ str(tcontext) +'/results/xgb/best_xgb_model.txt', 'w') as F_best:
    F_best.write(best_summary)