In [135]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [136]:
TEST_SIZE = 0.2
IMPOSTOR_SUBJECT = 'other'

In [137]:
cmu = pd.read_csv('datasets/cmu/DSL-StrongPasswordData.csv')
drop_columns = ['subject', 'sessionIndex', 'rep']
user_keys = set(cmu["subject"].unique())

In [None]:
classifier = RandomForestClassifier()

In [None]:
# Divisão dos dados de treino e teste para cada usuário (80:20)

user_training_vetors_map = {}
user_test_vectors_map = {}

for uk in user_keys:
    user_samples = cmu[cmu['subject'] == uk].drop(columns=drop_columns)
    user_training_vetors_map[uk], user_test_vectors_map[uk] = train_test_split(user_samples, test_size=TEST_SIZE)

In [None]:
one_class_estimators_map = {}

for uk in user_keys:
    training_vecs = user_training_vetors_map[uk]
    target_subjects = [uk] * len(training_vecs)
    one_class_estimators_map[uk] = classifier.fit(training_vecs, target_subjects)

In [140]:
user_model_acc_on_genuine_samples_map = {}

for uk in user_keys:
    test_vecs = user_test_vectors_map[uk]
    target_subjects = [uk] * len(test_vecs)
    predicted_subjects = one_class_estimators_map[uk].predict(test_vecs)
    user_model_acc_on_genuine_samples_map[uk] = accuracy_score(target_subjects, predicted_subjects)

average_acc = np.average(list(user_model_acc_on_genuine_samples_map.values()))

print(f"Acurácia média dos modelos One-Vs-One: {average_acc * 100}%")

Acurácia média dos modelos individuais, para exemplos dos próprios usuários: 100.0%


In [None]:
# Criação de um modelo One-vs-Rest para cada usuário
# Divisão dos dados: (80:20), sendo que em cada conjunto 50% dos dados são do 
# próprio usuário e 50% são registros aleatórios de outros usuários 

training_vectors_map = {}
training_samples_target_subs_map = {}
test_vectors_map = {}
test_samples_target_subs_map = {}

for uk in user_keys:
    user_samples = cmu[cmu['subject'] == uk].drop(columns=drop_columns)
    other_samples = cmu[cmu['subject'] != uk].drop(columns=drop_columns).sample(n=len(user_samples))
    user_training_samples, user_test_samples = train_test_split(user_samples, test_size=TEST_SIZE)
    others_training_samples, others_test_samples = train_test_split(other_samples, test_size=TEST_SIZE)
    training_vectors_map[uk] = user_training_samples + others_training_samples
    training_samples_target_subs_map[uk] = [uk] * len(user_training_samples) + [IMPOSTOR_SUBJECT] * len(others_training_samples)
    test_vectors_map[uk] = user_test_samples + others_test_samples
    test_samples_target_subs_map[uk] = [uk] * len(user_test_samples) + [IMPOSTOR_SUBJECT] * len(others_test_samples)

In [None]:
two_class_estimators_map = {}
two_class_acc_map = {}

for uk in user_keys:
    two_class_estimators_map[uk] = classifier.fit(training_vectors_map[uk], training_samples_target_subs_map[uk])

for uk in user_keys:
    test_samples, test_target_subjects = test_vectors_map[uk], test_samples_target_subs_map[uk]
    predicted_subjects = two_class_estimators_map[uk].predict(test_samples)
    two_class_acc_map[uk] = balanced_accuracy_score(test_target_subjects, predicted_subjects)

print(f"Acurácia média dos modelos One-Vs-Rest: {np.average(list(two_class_acc_map.values()))}")

Acurácia média dos modelos One-Vs-Rest para cada usuário: 0.5
