In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline
import time
from sklearn.externals import joblib
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

tcontext = 120

# Load features (only the first 10000 rows are kept in this pass).
features = np.load('features_tcontext_' + str(tcontext) + '_frameSize_1024.npz')
features = features['a']
features = features[:10000, :]

# The last column holds the class label. Split it off *before* applying the
# epsilon offset so the offset touches feature values only: adding it to the
# label column ahead of the int cast would truncate negative label values
# toward zero (e.g. -1 + eps -> 0).
labels = features[:, -1].astype(int)
features = features[:, :-1]
features += np.finfo(np.float32).eps
# Drop the two constant columns (original indices 60 and 61) in one call.
features = np.delete(features, [60, 61], axis=1)

# Each line of features_length.txt reads "<name>: <count>"; expand it into
# the per-column names "<name>_0" ... "<name>_<count-1>".
with open("features_length.txt", 'r') as f:
    features_length = [line.rstrip('\n') for line in f]

features_names = list()
for entry in features_length:
    if not entry.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    # partition() + int() accept any amount of whitespace after the colon,
    # instead of assuming exactly one separator character.
    f_name, _, count = entry.partition(':')
    values = int(count)
    features_names.extend(f_name + '_' + str(j) for j in range(values))
del features_names[60:62]  # deletes the feature names which are constant

# Prepare data
# NOTE(review): the imputer is fitted on all rows before the train/validation
# split below, so validation rows contribute to the imputation statistics.
imputer = SimpleImputer()
features = imputer.fit_transform(features)
le = preprocessing.LabelEncoder()
labels = le.fit_transform(labels)


# Split data (80% train / 20% validation, fixed seed for reproducibility)
features_train, features_val, labels_train, labels_val = train_test_split(
    features, labels, test_size=0.2, random_state=42)
features_train_df = pd.DataFrame(data=features_train, columns=features_names)
features_val_df = pd.DataFrame(data=features_val, columns=features_names)


def grid_search(clf, X, y):
    """Run an exhaustive cross-validated grid search over `clf` and report it.

    The grid tunes the ``k`` of the step named ``anova_filter`` and the
    ``C`` / ``gamma`` of the step named ``svm`` (RBF kernel), so `clf` must
    be a pipeline exposing steps with those two names.

    Parameters
    ----------
    clf : estimator
        Pipeline to tune; it is passed unchanged to ``GridSearchCV``.
    X : array-like
        Training feature matrix.
    y : array-like
        Training labels.

    Returns
    -------
    GridSearchCV
        The fitted search object (best estimator, parameters and score are
        also printed to stdout).
    """
    # `svm__degree` is intentionally not part of the grid: SVC only uses
    # `degree` for the 'poly' kernel, so sweeping it with the 'rbf' kernel
    # would repeat every fit four times with identical results.
    params = {
        'anova_filter__k': [5, 10, 20, 40, 80, 84],
        'svm__kernel': ['rbf'],
        'svm__C': [0.01, 0.1, 1, 3],
        'svm__gamma': [0.01, 0.1, 1, 3],
    }
    gs = GridSearchCV(clf, param_grid=params, cv=10, verbose=2)
    gs.fit(X, y)

    print("Best estimator:")
    print(gs.best_estimator_)
    print("Best parameters:")
    print(gs.best_params_)
    print("Best score:")
    print(gs.best_score_)

    return gs

# Pipeline: standardise -> keep the k best features by ANOVA F-score -> SVM.
# The k given here is only the starting value; grid_search() tunes it.
estimators = [
    ("scale", preprocessing.StandardScaler()),
    ('anova_filter', SelectKBest(score_func=f_classif, k=80)),
    ('svm', svm.SVC(decision_function_shape='ovo')),
]
clf = Pipeline(steps=estimators)
gs = grid_search(clf, X=features_train, y=labels_train)


# Load features (all rows this time; the earlier pass kept only the first 10000).
features = np.load('features_tcontext_' + str(tcontext) + '_frameSize_1024.npz')
features = features['a']

# The last column holds the class label. Split it off *before* applying the
# epsilon offset so the offset touches feature values only: adding it to the
# label column ahead of the int cast would truncate negative label values
# toward zero (e.g. -1 + eps -> 0).
labels = features[:, -1].astype(int)
features = features[:, :-1]
features += np.finfo(np.float32).eps
# Drop the two constant columns (original indices 60 and 61) in one call.
features = np.delete(features, [60, 61], axis=1)

# Each line of features_length.txt reads "<name>: <count>"; expand it into
# the per-column names "<name>_0" ... "<name>_<count-1>".
with open("features_length.txt", 'r') as f:
    features_length = [line.rstrip('\n') for line in f]

features_names = list()
for entry in features_length:
    if not entry.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    # partition() + int() accept any amount of whitespace after the colon,
    # instead of assuming exactly one separator character.
    f_name, _, count = entry.partition(':')
    values = int(count)
    features_names.extend(f_name + '_' + str(j) for j in range(values))
del features_names[60:62]  # deletes the feature names which are constant

# Prepare data
# NOTE(review): the imputer is fitted on all rows before the train/validation
# split below, so validation rows contribute to the imputation statistics.
imputer = SimpleImputer()
features = imputer.fit_transform(features)
le = preprocessing.LabelEncoder()
labels = le.fit_transform(labels)


# Split data (80% train / 20% validation, fixed seed for reproducibility)
features_train, features_val, labels_train, labels_val = train_test_split(
    features, labels, test_size=0.2, random_state=42)
features_train_df = pd.DataFrame(data=features_train, columns=features_names)
features_val_df = pd.DataFrame(data=features_val, columns=features_names)

# Select best k features
selector = SelectKBest(f_classif, k=40)
selector.fit(features_train_df, labels_train)
# Translate the selected column indices into their feature names
cols = selector.get_support(indices=True)
cols = [features_names[i] for i in cols]

# Make sure the output directory exists *before* the (potentially long) fit,
# so a missing folder cannot discard a finished training run.
import os
model_filename = './tcontext_' + str(tcontext) + '/models/svm/svm_best_model.sav'
model_dir = os.path.dirname(model_filename)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# Refit the best pipeline found by the grid search on the full training split.
clf_svm = gs.best_estimator_
clf_svm.fit(features_train, labels_train)

# Persist the model; the context manager guarantees the handle is flushed
# and closed even if serialisation fails.
with open(model_filename, 'wb') as model_file:
    joblib.dump(clf_svm, model_file)