In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
TEST_SIZE = 0.2
IMPOSTOR_SUBJECT = 'other'

In [3]:
cmu: pd.DataFrame = pd.read_csv('datasets/cmu/DSL-StrongPasswordData.csv')
cmu_training_df = cmu[cmu['sessionIndex'] == 1]
cmu_test_df = cmu[cmu['sessionIndex'] != 1]
drop_columns = ['subject', 'sessionIndex', 'rep']
user_keys: set[str] = set(cmu["subject"].drop_duplicates().tolist())
session_indexes: list[int] = cmu['sessionIndex'].drop_duplicates().tolist()

In [10]:
def svm_acc(labels: list[int]) -> float:
    positives, negatives = 0, 0
    for l in labels:
        if l > 0:
            positives += 1
        else:
            negatives += 1
    return positives / negatives

In [4]:
# Divisão dos dados de treino e teste para cada usuário (80:20)

X_training: dict[str, pd.DataFrame] = {}
X_test: dict[str, pd.DataFrame] = {}
y_training: dict[str, list[str]] = {}
y_test: dict[str, list[str]] = {}

for uk in user_keys:
    X_training[uk] = cmu_training_df[cmu_training_df['subject'] == uk].drop(columns=drop_columns)
    y_training[uk] = [uk] * X_training[uk].shape[0]
    X_test[uk] = cmu_test_df[cmu_test_df['subject'] == uk].drop(columns=drop_columns)
    y_test[uk] = [uk] * X_test[uk].shape[0]

In [8]:
one_class_estimators_map: dict[str, OneClassSVM] = {}

for uk in user_keys:
    one_class_estimators_map[uk] = OneClassSVM().fit(X_training[uk], y_training[uk])

In [11]:
user_model_acc_on_genuine_samples_map: dict[str, float] = {}

for uk in user_keys:
    predictions = one_class_estimators_map[uk].predict(X_test[uk]).flatten().tolist()
    user_model_acc_on_genuine_samples_map[uk] = svm_acc(predictions)

average_acc = np.average(list(user_model_acc_on_genuine_samples_map.values()))

print(f"Acurácia dos modelos One-Vs-One: {average_acc}")

Acurácia dos modelos One-Vs-One: 0.1601536355159654


In [32]:
# Criação de um modelo One-vs-Rest para cada usuário
# Divisão dos dados: (80:20), sendo que em cada conjunto 50% dos dados são do 
# próprio usuário e 50% são registros aleatórios de outros usuários 

X_user_training: dict[str, pd.DataFrame] = {}
X_user_test: dict[str, pd.DataFrame] = {}
y_user_training: dict[str, list[str]] = {}
y_user_test: dict[str, list[str]] = {}
X_other_training: dict[str, pd.DataFrame] = {}
X_other_test: dict[str, pd.DataFrame] = {}
y_other_training: dict[str, list[str]] = {}
y_other_test: dict[str, list[str]] = {}

for uk in user_keys:
    other_keys = user_keys - {uk}
    X_other_training[uk] = pd.DataFrame()
    for k in other_keys:
        X_other_training[uk].add(cmu_training_df[cmu_training_df['subject'] == k][0:6])
    X_user_training[uk] = cmu_training_df[cmu_training_df['subject'] == uk]
    y_other_training[uk] = [IMPOSTOR_SUBJECT] * X_other_training[uk].shape[0]
    y_user_training[uk] = [uk] * X_user_training[uk].shape[0]
    X_user_test[uk] =  cmu_test_df[cmu_test_df['subject'] == uk]
    X_other_test[uk] = cmu_test_df[cmu_test_df['subject'] != uk]
    y_user_test[uk] = [uk] * X_user_test[uk].shape[0]
    y_other_test[uk] = [IMPOSTOR_SUBJECT] * X_other_test[uk].shape[0]

In [33]:
two_class_estimators_map: dict[str, RandomForestClassifier] = {}
two_class_acc_map: dict[str, float] = {}

for uk in user_keys:
    print(X_other_training[uk])
    X = pd.concat([X_user_training[uk], X_other_training[uk]])
    y = y_user_training[uk] + y_other_training[uk]
    print(y)
    two_class_estimators_map[uk] = RandomForestClassifier().fit(X, y)

for uk in user_keys:
    X_test = pd.concat([X_user_test[uk], X_other_test[uk]])
    predictions = two_class_estimators_map[uk].predict(X_test)
    y_true = y_user_test[uk] + y_other_test[uk]
    two_class_acc_map[uk] = accuracy_score(y_true, predictions)

average_acc = np.average(list(two_class_acc_map.values()))

print(f"Acurácia média dos modelos One-Vs-Rest: {average_acc}")

Empty DataFrame
Columns: []
Index: []
['s040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040', 's040']


ValueError: could not convert string to float: 's040'