In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [298]:
TEST_SIZE = 0.2
IMPOSTOR_SUBJECT = 'other'

In [299]:
cmu = pd.read_csv('datasets/cmu/DSL-StrongPasswordData.csv')
drop_columns = ['subject', 'sessionIndex', 'rep']
user_keys = cmu["subject"].drop_duplicates().tolist()

In [300]:
# Divisão dos dados de treino e teste para cada usuário (80:20)
user = 's002'

X1_training, X1_test, y1_training, y1_test = {}, {}, {}, {}

for uk in user_keys:
    X = cmu[cmu['subject'] == uk].drop(columns=drop_columns)
    y = [uk] * len(X)
    X1_training[uk], X1_test[uk], y1_training[uk], y1_test[uk] = train_test_split(X, y, test_size=TEST_SIZE, shuffle=False)

In [301]:
one_class_estimators_map = {}

for uk in user_keys:
    one_class_estimators_map[uk] = RandomForestClassifier().fit(X1_training[uk], y1_training[uk])

In [302]:
user_model_acc_on_genuine_samples_map = {}

for uk in user_keys:
    predictions = one_class_estimators_map[uk].predict(X1_test[uk])
    user_model_acc_on_genuine_samples_map[uk] = accuracy_score(y1_test[uk], predictions)

average_acc = np.average(list(user_model_acc_on_genuine_samples_map.values())) * 100

print(f"Acurácia média dos modelos One-Vs-One: {average_acc}%")

Acurácia média dos modelos One-Vs-One: 100.0%


In [303]:
# Criação de um modelo One-vs-Rest para cada usuário
# Divisão dos dados: (80:20), sendo que em cada conjunto 50% dos dados são do 
# próprio usuário e 50% são registros aleatórios de outros usuários 

X_user_training, X_other_training = {}, {}
y_user_training, y_other_training = {}, {}
X_user_test, X_other_test = {}, {},
y_user_test, y_other_test = {}, {}

for uk in user_keys:
    X_user = cmu[cmu['subject'] == uk].drop(columns=drop_columns)    
    y_user = [uk] * len(X_user)
    X_other = cmu[cmu['subject'] != uk].sample(n=len(X_user))
    y_other = [IMPOSTOR_SUBJECT] * len(X_other)
    X_user_training[uk], X_user_test[uk] = train_test_split(X_user, test_size=TEST_SIZE)
    y_user_training[uk], y_user_test[uk] = train_test_split(y_user, test_size=TEST_SIZE)
    X_other_training[uk], X_other_test[uk] = train_test_split(X_other.drop(columns=drop_columns), test_size=TEST_SIZE)
    y_other_training[uk], y_other_test[uk] = train_test_split(y_other, test_size=TEST_SIZE)

In [304]:
two_class_estimators_map = {}
two_class_acc_map = {}

for uk in user_keys:
    X = X_user_training[uk] + X_other_training[uk]
    y = y_user_training[uk] + y_other_training[uk]
    two_class_estimators_map[uk] = RandomForestClassifier().fit(X, y)

for uk in user_keys:
    X_test = X_user_test[uk] + X_other_test[uk]
    predictions = two_class_estimators_map[uk].predict(X_test)
    y_true = y_user_test[uk] + y_other_test[uk]
    two_class_acc_map[uk] = accuracy_score(y_true, predictions)

print(f"Acurácia média dos modelos One-Vs-Rest: {np.average(list(two_class_acc_map.values()))}")

Acurácia média dos modelos One-Vs-Rest: 0.5
