In [161]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [162]:
# Fraction of the rows handed to train_test_split as the held-out test set.
TEST_SIZE: float = 0.2
# Label given to rows that do not belong to the user a model is built for.
IMPOSTOR_SUBJECT: str = "other"

In [163]:
# Read the whole dataset into memory; every later cell slices this frame.
cmu: pd.DataFrame = pd.read_csv('datasets/cmu/DSL-StrongPasswordData.csv')

# Columns removed before a row is used as a feature vector.
drop_columns: list[str] = ['subject', 'sessionIndex', 'rep']

# Distinct subject identifiers, in order of first appearance in the file.
user_keys: list[str] = cmu["subject"].unique().tolist()

In [164]:
# Per-user train/test split (80:20).
# shuffle=False keeps the original row order, so the leading rows of each user
# go to training and the trailing rows to testing.

X1_training: dict[str, pd.DataFrame] = {}
X1_test: dict[str, pd.DataFrame] = {}
y1_training: dict[str, list[str]] = {}
y1_test: dict[str, list[str]] = {}

for uk in user_keys:
    # Features of this user only; the label is the user's own key on every row.
    X = cmu.loc[cmu['subject'] == uk].drop(columns=drop_columns)
    y = [uk] * len(X)
    split = train_test_split(X, y, test_size=TEST_SIZE, shuffle=False)
    X1_training[uk], X1_test[uk], y1_training[uk], y1_test[uk] = split

In [165]:
# Fit one random forest per user, using only that user's own training rows.
# NOTE(review): every label in y1_training[uk] is uk itself, so each forest is
# trained on a single class and can only ever predict that class; confirm this
# is the intended setup before relying on its accuracy.
one_class_estimators_map: dict[str, RandomForestClassifier] = {
    uk: RandomForestClassifier().fit(X1_training[uk], y1_training[uk])
    for uk in user_keys
}

In [166]:
# Accuracy of each per-user model on that user's own held-out rows.
# NOTE(review): only genuine samples are scored here; no impostor rows are
# presented to these models, so this figure says nothing about rejection.
user_model_acc_on_genuine_samples_map: dict[str, float] = {}

for uk in user_keys:
    estimator = one_class_estimators_map[uk]
    predictions = estimator.predict(X1_test[uk])
    user_model_acc_on_genuine_samples_map[uk] = accuracy_score(y1_test[uk], predictions)

# Mean of the per-user accuracies, expressed as a percentage.
per_user_accuracies = list(user_model_acc_on_genuine_samples_map.values())
average_acc = np.average(per_user_accuracies) * 100

print(f"Acurácia média dos modelos One-Vs-One: {average_acc}%")

Acurácia média dos modelos One-Vs-One: 100.0%


In [167]:
# Data preparation for one One-vs-Rest model per user.
# Split (80:20): in both the training and the test set, half of the rows belong
# to the user and the other half are rows drawn at random from the other users.

# Seed for the impostor sampling, so that re-running this cell (or the whole
# notebook) draws exactly the same impostor rows and the reported accuracy can
# be reproduced.
IMPOSTOR_SAMPLING_SEED = 42
impostor_rng = np.random.RandomState(IMPOSTOR_SAMPLING_SEED)

X_user_training: dict[str, pd.DataFrame] = {}
X_user_test: dict[str, pd.DataFrame] = {}
y_user_training: dict[str, list[str]] = {}
y_user_test: dict[str, list[str]] = {}
X_other_training: dict[str, pd.DataFrame] = {}
X_other_test: dict[str, pd.DataFrame] = {}
y_other_training: dict[str, list[str]] = {}
y_other_test: dict[str, list[str]] = {}

for uk in user_keys:
    # Genuine rows: everything recorded for this user, labelled with the user's key.
    user_rows = cmu[cmu['subject'] == uk]
    X_user = user_rows.drop(columns=drop_columns)
    y_user = [uk] * X_user.shape[0]

    # Impostor rows: as many rows as the user has, sampled from all other users.
    # The generator is shared across iterations, so each user gets a different
    # (but reproducible) draw.
    other_rows = cmu[cmu['subject'] != uk].sample(n=X_user.shape[0], random_state=impostor_rng)
    X_other = other_rows.drop(columns=drop_columns)
    y_other = [IMPOSTOR_SUBJECT] * X_other.shape[0]

    # Features and labels are split in a single call so they cannot get out of
    # step; shuffle=False keeps the row order, so the trailing rows form the test set.
    X_user_training[uk], X_user_test[uk], y_user_training[uk], y_user_test[uk] = train_test_split(
        X_user, y_user, test_size=TEST_SIZE, shuffle=False
    )
    X_other_training[uk], X_other_test[uk], y_other_training[uk], y_other_test[uk] = train_test_split(
        X_other, y_other, test_size=TEST_SIZE, shuffle=False
    )

In [168]:
# Seed for the forests: bootstrap and feature sampling inside a random forest
# are random, so without it the reported accuracy changes on every run.
FOREST_RANDOM_STATE = 42

two_class_estimators_map: dict[str, RandomForestClassifier] = {}
two_class_acc_map: dict[str, float] = {}

# Train one binary model per user: the user's own rows vs. impostor rows.
for uk in user_keys:
    X = pd.concat([X_user_training[uk], X_other_training[uk]])
    # Labels are plain lists, so `+` concatenates them in the same order as X.
    y = y_user_training[uk] + y_other_training[uk]
    two_class_estimators_map[uk] = RandomForestClassifier(random_state=FOREST_RANDOM_STATE).fit(X, y)

# Score every model on its own held-out genuine + impostor rows.
for uk in user_keys:
    X_test = pd.concat([X_user_test[uk], X_other_test[uk]])
    predictions = two_class_estimators_map[uk].predict(X_test)
    y_true = y_user_test[uk] + y_other_test[uk]
    two_class_acc_map[uk] = accuracy_score(y_true, predictions)

# Mean of the per-user accuracies, expressed as a percentage.
average_acc = np.average(list(two_class_acc_map.values())) * 100

print(f"Acurácia média dos modelos One-Vs-Rest: {average_acc}%")

Acurácia média dos modelos One-Vs-Rest: 95.56372549019608%
