In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

# Load the combined feature/label matrix.
# NOTE(review): np.load returns a plain ndarray only when the file holds a
# single array in .npy format; a real .npz archive would yield an NpzFile and
# the slicing below would fail. The code relies on the former despite the
# ".npz" extension -- confirm how the file was written.
features = np.load('features_tcontext_120_frameSize_1024.npz')

# Only the first 10000 rows are used.
features = features[:10000, :]

# The last column carries the class label. Extract it BEFORE applying the
# epsilon offset: astype(int) truncates toward zero, so a perturbed negative
# label (e.g. -1 + eps) would otherwise silently become 0.
labels = features[:, -1].astype(int)

# Remaining columns are the features; shift them by float32 machine epsilon.
# (Presumably this avoids exact zeros downstream -- TODO confirm the intent.)
features = features[:, :-1] + np.finfo(np.float32).eps

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
features_train, features_val, labels_train, labels_val = train_test_split(
    features, labels, test_size=0.2, random_state=42)


# Read the feature layout description. Each non-empty line is expected to
# have the form "<feature name>: <number of columns>".
with open("features_length.txt", 'r') as f:
    features_length = [line.rstrip('\n') for line in f]

# Expand every "<name>: <count>" entry into per-column names
# "<name>_0" ... "<name>_<count-1>", in file order.
features_names = []
for entry in features_length:
    if not entry.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    # Split on the first colon; int() ignores surrounding whitespace, so both
    # "name: 3" and "name:3" are accepted instead of relying on a fixed offset.
    f_name, separator, count_text = entry.partition(':')
    if not separator:
        raise ValueError(
            "Malformed line in features_length.txt (missing ':'): " + repr(entry))
    try:
        values = int(count_text)
    except ValueError:
        raise ValueError(
            "Malformed column count in features_length.txt: " + repr(entry))
    features_names.extend(f_name + '_' + str(j) for j in range(values))

# Fail early with a readable message if the layout file does not describe the
# feature matrix (pandas would otherwise raise a less specific shape error).
if len(features_names) != features_train.shape[1]:
    raise ValueError(
        'features_length.txt describes ' + str(len(features_names)) +
        ' columns but the feature matrix has ' + str(features_train.shape[1]))

# Wrap the train/validation matrices in DataFrames with named columns.
features_train_df = pd.DataFrame(data=features_train, columns=features_names)
features_val_df = pd.DataFrame(data=features_val, columns=features_names)
# Numbers of top-ranked features to evaluate.
# NOTE(review): SelectKBest raises if k exceeds the number of columns; 86 is
# presumably the total feature count -- confirm against features_length.txt.
k_best = [1, 3, 5, 10, 20, 40, 80, 86]
for k in k_best:
    # ANOVA F-test feature selection followed by an SVC. The pipeline is the
    # single place where selection happens: the filter is fitted on the
    # training data only and the same columns are applied at predict time.
    # (Selecting manually first and then again inside the pipeline, on the
    # already-reduced frame, was a redundant no-op.)
    anova_filter = SelectKBest(f_classif, k=k)
    clf = svm.SVC(gamma='scale')
    anova_svm = make_pipeline(anova_filter, clf)
    anova_svm.fit(features_train_df, labels_train)

    # Names of the columns kept by the fitted filter, for inspection.
    fitted_filter = anova_svm.named_steps['selectkbest']
    cols = [features_names[i] for i in fitted_filter.get_support(indices=True)]

    # NOTE(review): SVC is sensitive to feature scale and no scaling step is
    # applied here; consider adding one if accuracy degrades as k grows.
    pred = anova_svm.predict(features_val_df)
    accuracy = accuracy_score(labels_val, pred)

    # Write the per-k evaluation report. The context manager guarantees the
    # file is closed even if one of the metric computations raises.
    report_path = './tcontext_1024/test/test_results_kBestFeatures_' + str(k) + '.txt'
    with open(report_path, 'w') as F:
        F.write('K Best Features using f_classif: ' + str(k) + '\n\n')
        F.write('Confusion matrix: \n\n')
        F.write(str(confusion_matrix(labels_val, pred)))
        F.write('\n\n')
        F.write('Classification report: \n\n')
        F.write(str(classification_report(labels_val, pred)))
        F.write('\n\n')
        F.write('Accuracy: ' + str(accuracy))

    print('Clf with ' + str(k) + ' best features trained. Accuracy: ' + str(accuracy))




Clf with 1 best features trained. Accuracy: 0.4845
Clf with 3 best features trained. Accuracy: 0.7
Clf with 5 best features trained. Accuracy: 0.835
Clf with 10 best features trained. Accuracy: 0.907
Clf with 20 best features trained. Accuracy: 0.572
Clf with 40 best features trained. Accuracy: 0.5735
Clf with 80 best features trained. Accuracy: 0.574
Clf with 86 best features trained. Accuracy: 0.574
