In [81]:
import pickle
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from scipy import stats

# Carga de datos

In [2]:
def read_file(filename):
    """Load one pickled object from ``filename`` and return it.

    The pickle is decoded with ``encoding='latin1'`` so that byte strings
    written by Python 2 can be read under Python 3.

    Parameters
    ----------
    filename : path of the pickle file to read.

    Returns
    -------
    The unpickled object.

    Raises
    ------
    OSError if the file cannot be opened; pickle.UnpicklingError (or a
    related exception) if the content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the handle is closed even when loading fails.
    with open(filename, 'rb') as fh:
        return pickle.load(fh, encoding='latin1')

In [7]:
# Two-character, zero-padded ids for the entries 16..19.
files = [str(number).zfill(2) for number in range(16, 20)]
print(files)

['16', '17', '18', '19']


# Separación en data y labels

In [111]:
# Read every file listed in `files` and collect its 'labels' and 'data' entries.
labels = []
data = []

for i in files:
    filename = "s" + i + ".dat"
    trial = read_file(filename)
    labels.append(trial['labels'])
    data.append(trial['data'])

# Merge the per-file arrays along the trial axis. Using -1 lets NumPy infer the
# total number of trials, so this keeps working when the number of files
# changes; reshaping directly also avoids the extra full copy that flatten() makes.
labels = np.array(labels).reshape(-1, 4)
data = np.array(data).reshape(-1, 40, 8064)

print("Labels: ", labels.shape)  # trial x label
print("Data: ", data.shape)  # trial x channel x data

Labels:  (160, 4)
Data:  (160, 40, 8064)


# Extracción de valencia y arousal

In [112]:
# Keep the first two label columns, named Valence and Arousal.
df_label_ratings = pd.DataFrame(
    {name: labels[:, column] for column, name in enumerate(['Valence', 'Arousal'])}
)
print(df_label_ratings.describe())
df_label_ratings

          Valence     Arousal
count  160.000000  160.000000
mean     5.056625    5.209687
std      1.641227    1.603203
min      1.000000    1.000000
25%      3.867500    3.895000
50%      5.040000    5.615000
75%      6.352500    6.265000
max      7.790000    8.620000


Unnamed: 0,Valence,Arousal
0,2.51,5.65
1,6.45,5.03
2,6.74,5.83
3,7.09,5.67
4,5.51,7.90
...,...,...
155,3.71,5.95
156,3.15,5.53
157,1.62,7.58
158,5.04,5.05


# Binarización de etiquetas (umbral en la mediana)

In [113]:
def valencia_positiva(trial):
    """Return 1 when the trial's valence reaches the median valence, else 0.

    Reads column 0 of the module-level ``labels`` array; ``trial`` is the row
    index of the trial to classify.
    """
    valence_threshold = np.median(labels[:, 0])
    if labels[trial, 0] >= valence_threshold:
        return 1
    return 0
def arousal_alto(trial):
    """Return 1 when the trial's arousal reaches the median arousal, else 0.

    Reads column 1 of the module-level ``labels`` array; ``trial`` is the row
    index of the trial to classify.
    """
    arousal_threshold = np.median(labels[:, 1])
    if labels[trial, 1] >= arousal_threshold:
        return 1
    return 0

In [114]:
# Encode every trial as [valence >= median, arousal >= median] using the two
# helper functions. The number of rows is inferred (-1) rather than hard-coded,
# so the cell keeps working when the number of loaded trials changes.
labels_encoded = np.array(
    [[valencia_positiva(i), arousal_alto(i)] for i in range(len(labels))]
).reshape(-1, 2)
df_labels = pd.DataFrame(data=labels_encoded, columns=["Valencia Positiva", "Arousal Alto"])
print(df_labels.describe())
df_labels

       Valencia Positiva  Arousal Alto
count         160.000000     160.00000
mean            0.506250       0.50000
std             0.501531       0.50157
min             0.000000       0.00000
25%             0.000000       0.00000
50%             1.000000       0.50000
75%             1.000000       1.00000
max             1.000000       1.00000


Unnamed: 0,Valencia Positiva,Arousal Alto
0,0,1
1,1,0
2,1,1
3,1,1
4,1,1
...,...,...
155,0,1
156,0,0
157,0,1
158,1,0


In [115]:
# One Series per binary target: the valence column and the arousal column.
df_valencia, df_arousal = (
    df_labels[column] for column in ('Valencia Positiva', 'Arousal Alto')
)

# 32 Canales EEG

In [116]:
# Names of the 32 EEG channels, in the order they are taken from `data`.
eeg_canales = np.array([
    "Fp1", "AF3", "F3", "F7", "FC5", "FC1", "C3", "T7",
    "CP5", "CP1", "P3", "P7", "PO3", "O1", "Oz", "Pz",
    "Fp2", "AF4", "Fz", "F4", "F8", "FC6", "FC2", "Cz",
    "C4", "T8", "CP6", "CP2", "P4", "P8", "PO4", "O2",
])
# Keep the first len(eeg_canales) channels of every trial. One slice replaces
# the element-by-element Python loop; .copy() keeps eeg_data independent of
# `data`, as the list-based version was.
eeg_data = data[:, :len(eeg_canales), :].copy()
print(eeg_data.shape)

(160, 32, 8064)


# Extracción de características

In [117]:
# Four summary statistics per EEG channel (mean, median, variance, kurtosis),
# computed along the sample axis for all trials and channels at once.
eeg_signals = data[:, :len(eeg_canales), :]
eeg_caracteristicas = np.stack(
    [
        np.mean(eeg_signals, axis=-1),
        np.median(eeg_signals, axis=-1),
        np.var(eeg_signals, axis=-1),
        stats.kurtosis(eeg_signals, axis=-1),
    ],
    axis=-1,
# Stacking on the last axis gives (trial, channel, statistic); flattening the
# last two axes yields one row per trial laid out as
# [ch0_mean, ch0_median, ch0_var, ch0_kurtosis, ch1_mean, ...].
).reshape(len(data), 4 * len(eeg_canales))

# Separación en training y test, aplicación de modelos

In [118]:
def split_train_test(x, y, test_size=0.30, random_state=42):
    """Split features and targets into training and test subsets.

    Parameters
    ----------
    x : feature matrix.
    y : targets aligned with ``x``.
    test_size : forwarded to ``train_test_split``; defaults to 0.30.
    random_state : forwarded to ``train_test_split``; defaults to 42 so the
        split is repeatable.

    Returns
    -------
    tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [119]:
def feature_scaling(train, test):
    """Standardise ``train`` and ``test`` with a scaler fitted on ``train`` only.

    Bringing the inputs onto a common scale keeps features with larger
    magnitudes from dominating the model just because of their size.

    Returns the pair ``(scaled_train, scaled_test)``.
    """
    scaler = StandardScaler()
    # Tuple elements are evaluated left to right, so the scaler is fitted on
    # `train` before it is applied to `test`.
    return scaler.fit_transform(train), scaler.transform(test)

In [120]:
# Linear-kernel support vector classifier with a fixed seed.
clf_svm = SVC(
    kernel='linear',
    random_state=42,
    probability=True,
)
# k-nearest-neighbours classifier: 5 neighbours, distance-weighted votes.
clf_knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='auto',
)

In [121]:
# (display name, estimator) pairs evaluated by the cross-validation helper.
models = [
    ('SVM', clf_svm),
    ('k-NN', clf_knn),
]

In [122]:
import time
def cross_validate_clf(df_x, df_y, scoring):
    """Cross-validate every classifier in the module-level ``models`` list.

    The data is split with ``split_train_test`` and scaled with
    ``feature_scaling``; each model is then scored with 5-fold
    cross-validation on the training portion only.

    Parameters
    ----------
    df_x : feature matrix, one row per trial.
    df_y : targets aligned with ``df_x``.
    scoring : scoring identifier forwarded to ``cross_val_score``.

    Returns
    -------
    tuple ``(names, means, stds, times)`` of four parallel lists: model name,
    mean fold score, standard deviation of the fold scores, and elapsed
    seconds for that model's cross-validation.
    """
    # Train-test split; the held-out part is only needed to call the scaler.
    x_train, x_test, y_train, _ = split_train_test(df_x, df_y)
    # NOTE(review): the scaler is fitted on the whole training portion before
    # the folds are formed, so each validation fold influences the scaling
    # statistics. Wrapping scaler + model in a Pipeline would avoid that, but
    # it would change the reported scores, so it is left as is here.
    x_train, _ = feature_scaling(x_train, x_test)

    # The fold definition does not depend on the model, so build it once.
    kfold = model_selection.KFold(n_splits=5)

    names = []
    means = []
    stds = []
    times = []

    for name, model in models:
        # perf_counter is a monotonic, high-resolution clock meant for durations.
        start_time = time.perf_counter()
        cv_results = model_selection.cross_val_score(
            model, x_train, y_train, cv=kfold, scoring=scoring
        )
        times.append(time.perf_counter() - start_time)
        means.append(cv_results.mean())
        stds.append(cv_results.std())
        names.append(name)
    return names, means, stds, times

In [123]:
# Evaluate both models on the binary arousal target. The result holds, in order:
#   1. model names
#   2. mean accuracy over the 5 folds
#   3. standard deviation of the fold accuracies
#   4. time taken by the cross-validation, in seconds
cross_validate_clf(df_x=eeg_caracteristicas, df_y=df_arousal, scoring='accuracy')

(['SVM', 'k-NN'],
 [0.5098814229249011, 0.5711462450592885],
 [0.06652773934362366, 0.07381892509045852],
 [0.0675654411315918, 0.007006645202636719])

In [124]:
# Evaluate both models on the binary valence target, scored by accuracy.
cross_validate_clf(df_x=eeg_caracteristicas, df_y=df_valencia, scoring='accuracy')

(['SVM', 'k-NN'],
 [0.5438735177865612, 0.6146245059288538],
 [0.12042953531936247, 0.13566009724688152],
 [0.053049325942993164, 0.0050051212310791016])