## Classificação

### Importar dados de treino

In [5]:
import gzip
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix

def load_texts_from_file(file_path, label):
    """Read a UTF-8 text file and pair each of its lines with ``label``.

    Args:
        file_path: Path of the file to read.
        label: Value repeated once per line read.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. Each text is one
        line of the file with leading and trailing whitespace removed.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        texts = [raw_line.strip() for raw_line in handle]
    labels = [label] * len(texts)
    return texts, labels

# Training transcripts, one file per class.
ad_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\train_ad.txt"
cn_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\train_cn.txt"

# Lines of the AD file get label 1, lines of the CN file get label 0.
x_ad, y_ad = load_texts_from_file(ad_file, 1)
x_cn, y_cn = load_texts_from_file(cn_file, 0)

# Full training set: all AD samples first, followed by all CN samples.
x_train = [*x_ad, *x_cn]
y_train = [*y_ad, *y_cn]

def ncd(x, x2):
    """Return the Normalized Compression Distance between two strings (gzip).

    Computes ``(C(x x2) - min(C(x), C(x2))) / max(C(x), C(x2))`` where ``C``
    is the length in bytes of the gzip-compressed, encoded text and ``x x2``
    is the two strings joined by a single space.
    """
    size_x = len(gzip.compress(x.encode()))
    size_x2 = len(gzip.compress(x2.encode()))
    size_joint = len(gzip.compress((x + " " + x2).encode()))
    smaller, larger = sorted((size_x, size_x2))
    return (size_joint - smaller) / larger

# Pairwise NCD matrix over the training texts. Row i (the distances from text i
# to every training text) is what the classifier receives as the feature
# vector of text i.
# NOTE(review): every row keeps the column of the held-out sample as a feature,
# and KNeighborsClassifier is used with its default metric on these rows rather
# than on the NCD values directly -- confirm this is the intended setup.
train_ncd = np.array([[ncd(a, b) for b in x_train] for a in x_train])

# Labels converted once, instead of twice per fold.
y_train_arr = np.array(y_train)

loo = LeaveOneOut()

k_values = [1, 3, 5]
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    accuracies = []
    tp_total, fp_total, fn_total, tn_total = 0, 0, 0, 0

    for train_index, test_index in loo.split(train_ncd):
        X_train, X_test = train_ncd[train_index], train_ncd[test_index]
        y_train_fold, y_test_fold = y_train_arr[train_index], y_train_arr[test_index]

        knn.fit(X_train, y_train_fold)
        y_pred = knn.predict(X_test)

        # Each fold holds out exactly one sample, so its accuracy is 1.0 or
        # 0.0. Derived from y_pred instead of predicting again via score().
        accuracy = float(y_pred[0] == y_test_fold[0])
        accuracies.append(accuracy)

        # labels=[0, 1] pins the matrix to 2x2 even when only one class shows
        # up in the fold, so no shape-dependent fallback is needed.
        cm = confusion_matrix(y_test_fold, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        tp_total += tp
        fp_total += fp
        fn_total += fn
        tn_total += tn

        print(f'k={k}, Test sample {test_index[0]} - Accuracy: {accuracy}')
        print(f'TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}')

    mean_accuracy = np.mean(accuracies)

    # Per-class precision over the pooled fold counts; 0 when the class was
    # never predicted.
    precision_alzheimer = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
    precision_control = tn_total / (tn_total + fn_total) if (tn_total + fn_total) > 0 else 0

    print(f'Mean Leave-One-Out Cross-Validation Accuracy for k={k}: {mean_accuracy}')
    print(f'Precision for Alzheimer (Class 1) for k={k}: {precision_alzheimer}')
    print(f'Precision for Control (Class 0) for k={k}: {precision_control}')
    print(f'Total TP: {tp_total}, FP: {fp_total}, FN: {fn_total}, TN: {tn_total}')
    print('-'*50)

k=1, Test sample 0 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 1 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 2 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 3 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 4 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 5 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 6 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 7 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 8 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 9 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 10 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 11 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 12 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 13 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 14 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 15 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=

### loso_treino

In [4]:
import brotli
import lz4.frame
import lzma
import bz2
import pysmaz
import fpzip
import snappy
from pybcl import huffman_compress
from pybcl import lz_compress_fast
from pybcl import rice_compress, RiceFormat
from pybcl import rle_compress
from pybcl import sf_compress
import zstandard as zstd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix

def text_to_float_array(text):
    """Return the Unicode code points of ``text`` as a 1-D float32 array."""
    code_points = list(map(ord, text))
    return np.array(code_points, dtype=np.float32)

def compress_data(data, algorithm):
    """Return the size in bytes of ``data`` compressed with ``algorithm``.

    Args:
        data: Text to compress. It is encoded with ``str.encode()`` for every
            algorithm except ``'fpzip'``, which receives the float32 array
            built by ``text_to_float_array``.
        algorithm: One of ``'brotli'``, ``'lz4'``, ``'lzma'``, ``'bz2'``,
            ``'pysmaz'``, ``'fpzip'``, ``'snappy'``, ``'huffman'``, ``'lz'``,
            ``'rice'``, ``'rle'``, ``'sf'`` or ``'zstd'``.

    Returns:
        Length of the compressed output.

    Raises:
        ValueError: If ``algorithm`` is not one of the names above.
    """
    # fpzip is the only codec that does not work on the encoded bytes.
    if algorithm == 'fpzip':
        return len(fpzip.compress(text_to_float_array(data)))

    # Lambdas keep the lookup lazy: only the selected codec is touched.
    byte_codecs = {
        'brotli': lambda raw: brotli.compress(raw),
        'lz4': lambda raw: lz4.frame.compress(raw),
        'lzma': lambda raw: lzma.compress(raw),
        'bz2': lambda raw: bz2.compress(raw),
        'pysmaz': lambda raw: pysmaz.compress(raw),
        'snappy': lambda raw: snappy.compress(raw),
        'huffman': lambda raw: huffman_compress(raw),
        'lz': lambda raw: lz_compress_fast(raw),
        'rice': lambda raw: rice_compress(raw, RiceFormat.UINT8),
        'rle': lambda raw: rle_compress(raw),
        'sf': lambda raw: sf_compress(raw),
        'zstd': lambda raw: zstd.ZstdCompressor().compress(raw),
    }
    codec = byte_codecs.get(algorithm)
    if codec is None:
        raise ValueError(f'Algorithm {algorithm} não supported.')
    return len(codec(data.encode()))

def ncd(x, x2, algorithm):
    """Return the Normalized Compression Distance between two strings.

    Computes ``(C(x x2) - min(C(x), C(x2))) / max(C(x), C(x2))`` where ``C``
    is ``compress_data`` with the given ``algorithm`` and ``x x2`` is the two
    strings joined by a single space.
    """
    size_x = compress_data(x, algorithm)
    size_x2 = compress_data(x2, algorithm)
    size_joint = compress_data(x + " " + x2, algorithm)
    smaller, larger = sorted((size_x, size_x2))
    return (size_joint - smaller) / larger

algorithms = ['brotli', 'lz4', 'lzma', 'bz2', 'pysmaz', 'fpzip', 'snappy', "huffman", "lz", 'rice', "rle", 'sf', 'zstd']

loo = LeaveOneOut()

k_values = [1, 3, 5]

# Labels converted once, instead of twice per fold.
y_train_arr = np.array(y_train)

for algorithm in algorithms:
    print(f'Using: {algorithm}')
    # Pairwise NCD matrix over the training texts for this compressor. Row i
    # is used as the feature vector of text i.
    # NOTE(review): every row keeps the column of the held-out sample as a
    # feature -- confirm this is the intended setup.
    train_ncd = np.array([[ncd(a, b, algorithm) for b in x_train] for a in x_train])

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        accuracies = []
        tp_total, fp_total, fn_total, tn_total = 0, 0, 0, 0

        for train_index, test_index in loo.split(train_ncd):
            X_train, X_test = train_ncd[train_index], train_ncd[test_index]
            y_train_fold, y_test_fold = y_train_arr[train_index], y_train_arr[test_index]

            knn.fit(X_train, y_train_fold)
            y_pred = knn.predict(X_test)

            # Each fold holds out exactly one sample, so its accuracy is 1.0
            # or 0.0. Derived from y_pred instead of predicting again via
            # score().
            accuracy = float(y_pred[0] == y_test_fold[0])
            accuracies.append(accuracy)

            # labels=[0, 1] pins the matrix to 2x2 even when only one class
            # shows up in the fold, so no shape-dependent fallback is needed.
            cm = confusion_matrix(y_test_fold, y_pred, labels=[0, 1])
            tn, fp, fn, tp = cm.ravel()

            tp_total += tp
            fp_total += fp
            fn_total += fn
            tn_total += tn

            print(f'k={k}, Test sample {test_index[0]} - Accuracy: {accuracy}')
            print(f'TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}')

        mean_accuracy = np.mean(accuracies)

        # Per-class precision over the pooled fold counts; 0 when the class
        # was never predicted.
        precision_alzheimer = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        precision_control = tn_total / (tn_total + fn_total) if (tn_total + fn_total) > 0 else 0

        print(f'Mean Leave-One-Out Cross-Validation Accuracy for k={k} using {algorithm}: {mean_accuracy}')
        print(f'Precision for Alzheimer (Class 1) for k={k} using {algorithm}: {precision_alzheimer}')
        print(f'Precision for Control (Class 0) for k={k} using {algorithm}: {precision_control}')
        print(f'Total TP: {tp_total}, FP: {fp_total}, FN: {fn_total}, TN: {tn_total}')
        print('-'*50)

Using: brotli
k=1, Test sample 0 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 1 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 2 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 3 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 4 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 5 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 6 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 7 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 8 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 9 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 10 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 11 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 12 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 13 - Accuracy: 1.0
TP: 1, FP: 0, FN: 0, TN: 0
k=1, Test sample 14 - Accuracy: 0.0
TP: 0, FP: 0, FN: 1, TN: 0
k=1, Test sample 15 - Accuracy: 1.0
TP: 1, FP: 0, F

### holdout_teste

In [23]:
import gzip
import brotli
import lz4.frame
import lzma
import bz2
import pysmaz
import fpzip
import snappy
from pybcl import huffman_compress
from pybcl import lz_compress_fast
from pybcl import rice_compress, RiceFormat
from pybcl import rle_compress
from pybcl import sf_compress
import zstandard as zstd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

def text_to_float_array(text):
    """Return the Unicode code points of ``text`` as a 1-D float32 array."""
    code_points = list(map(ord, text))
    return np.array(code_points, dtype=np.float32)

def compress_data(data, algorithm):
    """Return the size in bytes of ``data`` compressed with ``algorithm``.

    Args:
        data: Text to compress. It is encoded with ``str.encode()`` for every
            algorithm except ``'fpzip'``, which receives the float32 array
            built by ``text_to_float_array``.
        algorithm: One of ``'gzip'``, ``'brotli'``, ``'lz4'``, ``'lzma'``,
            ``'bz2'``, ``'pysmaz'``, ``'fpzip'``, ``'snappy'``, ``'huffman'``,
            ``'lz'``, ``'rice'``, ``'rle'``, ``'sf'`` or ``'zstd'``.

    Returns:
        Length of the compressed output.

    Raises:
        ValueError: If ``algorithm`` is not one of the names above.
    """
    # fpzip is the only codec that does not work on the encoded bytes.
    if algorithm == 'fpzip':
        return len(fpzip.compress(text_to_float_array(data)))

    # Lambdas keep the lookup lazy: only the selected codec is touched.
    byte_codecs = {
        'gzip': lambda raw: gzip.compress(raw),
        'brotli': lambda raw: brotli.compress(raw),
        'lz4': lambda raw: lz4.frame.compress(raw),
        'lzma': lambda raw: lzma.compress(raw),
        'bz2': lambda raw: bz2.compress(raw),
        'pysmaz': lambda raw: pysmaz.compress(raw),
        'snappy': lambda raw: snappy.compress(raw),
        'huffman': lambda raw: huffman_compress(raw),
        'lz': lambda raw: lz_compress_fast(raw),
        'rice': lambda raw: rice_compress(raw, RiceFormat.UINT8),
        'rle': lambda raw: rle_compress(raw),
        'sf': lambda raw: sf_compress(raw),
        'zstd': lambda raw: zstd.ZstdCompressor().compress(raw),
    }
    codec = byte_codecs.get(algorithm)
    if codec is None:
        raise ValueError(f'Algorithm {algorithm} não supported.')
    return len(codec(data.encode()))

def ncd(x, x2, algorithm):
    """Return the Normalized Compression Distance between two strings.

    Computes ``(C(x x2) - min(C(x), C(x2))) / max(C(x), C(x2))`` where ``C``
    is ``compress_data`` with the given ``algorithm`` and ``x x2`` is the two
    strings joined by a single space.
    """
    size_x = compress_data(x, algorithm)
    size_x2 = compress_data(x2, algorithm)
    size_joint = compress_data(x + " " + x2, algorithm)
    smaller, larger = sorted((size_x, size_x2))
    return (size_joint - smaller) / larger

def load_test_data(test_file, labels_file):
    """Load the test texts and their binary labels.

    Args:
        test_file: Text file with one sample per line.
        labels_file: CSV file with a header row followed by ``id,dx`` rows.

    Returns:
        ``(x_test, y_test)``: the lines of ``test_file`` and one label per
        data row of ``labels_file`` (1 when ``dx`` is ``'ProbableAD'``,
        0 otherwise).

    Raises:
        ValueError: If a non-blank data row does not have exactly two
            comma-separated fields.
    """
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(test_file, 'r', encoding='utf-8') as f:
        x_test = f.read().splitlines()

    y_test = []
    with open(labels_file, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header; the default avoids StopIteration on an empty file
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline) carries no label.
                continue
            id_, dx = line.split(',')
            print(f'Linha lida: id={id_}, dx={dx}')  # Echo the row that was read
            y_test.append(1 if dx == 'ProbableAD' else 0)

    print("y_test:", y_test)  # Show the parsed labels for inspection

    # The texts and labels are paired by position only: the line order of the
    # text file is assumed to match the row order of the CSV (not verified).
    return x_test, y_test

# Load the test data
test_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\test.txt"
labels_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\test_labels.csv"
x_test, y_test = load_test_data(test_file, labels_file)

algorithms = ['gzip', 'brotli', 'lz4', 'lzma', 'bz2', 'pysmaz', 'fpzip', 'snappy', "huffman", "lz", 'rice', "rle", 'sf', 'zstd']

k_values = [1, 3, 5]

for algorithm in algorithms:
    print(f'Using: {algorithm}')
    # train_ncd[i, j]: NCD between training texts i and j.
    train_ncd = np.array([[ncd(a, b, algorithm) for b in x_train] for a in x_train])
    # test_ncd[i, j]: NCD between training text i and test text j.
    test_ncd = np.array([[ncd(tr, te, algorithm) for te in x_test] for tr in x_train])

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(train_ncd, y_train)
        # Transposed so each row is one test text described by its distances
        # to the training texts, matching the layout of train_ncd.
        y_pred = knn.predict(test_ncd.T)

        # labels=[0, 1] pins the matrix to 2x2 even if only one class occurs,
        # so no shape-dependent fallback is needed.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        accuracy = (tp + tn) / len(y_test)
        # Per-class precision; 0 when the class was never predicted.
        precision_alzheimer = tp / (tp + fp) if (tp + fp) > 0 else 0
        precision_control = tn / (tn + fn) if (tn + fn) > 0 else 0

        print(f'k={k}, Test Accuracy using {algorithm}: {accuracy}')
        print(f'Precision for Alzheimer (Class 1) for k={k} using {algorithm}: {precision_alzheimer}')
        print(f'Precision for Control (Class 0) for k={k} using {algorithm}: {precision_control}')
        print(f'TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}')
        print('-'*50)


Linha lida: id=adrsdt1, dx=ProbableAD
Linha lida: id=adrsdt10, dx=Control
Linha lida: id=adrsdt11, dx=Control
Linha lida: id=adrsdt12, dx=Control
Linha lida: id=adrsdt13, dx=ProbableAD
Linha lida: id=adrsdt14, dx=ProbableAD
Linha lida: id=adrsdt15, dx=Control
Linha lida: id=adrsdt16, dx=ProbableAD
Linha lida: id=adrsdt17, dx=Control
Linha lida: id=adrsdt18, dx=ProbableAD
Linha lida: id=adrsdt19, dx=ProbableAD
Linha lida: id=adrsdt2, dx=ProbableAD
Linha lida: id=adrsdt20, dx=ProbableAD
Linha lida: id=adrsdt21, dx=Control
Linha lida: id=adrsdt22, dx=Control
Linha lida: id=adrsdt23, dx=ProbableAD
Linha lida: id=adrsdt24, dx=ProbableAD
Linha lida: id=adrsdt25, dx=ProbableAD
Linha lida: id=adrsdt26, dx=Control
Linha lida: id=adrsdt27, dx=Control
Linha lida: id=adrsdt28, dx=ProbableAD
Linha lida: id=adrsdt29, dx=ProbableAD
Linha lida: id=adrsdt3, dx=Control
Linha lida: id=adrsdt30, dx=Control
Linha lida: id=adrsdt31, dx=Control
Linha lida: id=adrsdt32, dx=ProbableAD
Linha lida: id=adrsdt33, 

### loso_treino_e_holdout

In [27]:
import gzip
import brotli
import lz4.frame
import lzma
import bz2
import pysmaz
import fpzip
import snappy
from pybcl import huffman_compress
from pybcl import lz_compress_fast
from pybcl import rice_compress, RiceFormat
from pybcl import rle_compress
from pybcl import sf_compress
import zstandard as zstd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut

def text_to_float_array(text):
    """Return the Unicode code points of ``text`` as a 1-D float32 array."""
    code_points = list(map(ord, text))
    return np.array(code_points, dtype=np.float32)

def compress_data(data, algorithm):
    """Return the size in bytes of ``data`` compressed with ``algorithm``.

    Args:
        data: Text to compress. It is encoded with ``str.encode()`` for every
            algorithm except ``'fpzip'``, which receives the float32 array
            built by ``text_to_float_array``.
        algorithm: One of ``'gzip'``, ``'brotli'``, ``'lz4'``, ``'lzma'``,
            ``'bz2'``, ``'pysmaz'``, ``'fpzip'``, ``'snappy'``, ``'huffman'``,
            ``'lz'``, ``'rice'``, ``'rle'``, ``'sf'`` or ``'zstd'``.

    Returns:
        Length of the compressed output.

    Raises:
        ValueError: If ``algorithm`` is not one of the names above.
    """
    # fpzip is the only codec that does not work on the encoded bytes.
    if algorithm == 'fpzip':
        return len(fpzip.compress(text_to_float_array(data)))

    # Lambdas keep the lookup lazy: only the selected codec is touched.
    byte_codecs = {
        'gzip': lambda raw: gzip.compress(raw),
        'brotli': lambda raw: brotli.compress(raw),
        'lz4': lambda raw: lz4.frame.compress(raw),
        'lzma': lambda raw: lzma.compress(raw),
        'bz2': lambda raw: bz2.compress(raw),
        'pysmaz': lambda raw: pysmaz.compress(raw),
        'snappy': lambda raw: snappy.compress(raw),
        'huffman': lambda raw: huffman_compress(raw),
        'lz': lambda raw: lz_compress_fast(raw),
        'rice': lambda raw: rice_compress(raw, RiceFormat.UINT8),
        'rle': lambda raw: rle_compress(raw),
        'sf': lambda raw: sf_compress(raw),
        'zstd': lambda raw: zstd.ZstdCompressor().compress(raw),
    }
    codec = byte_codecs.get(algorithm)
    if codec is None:
        raise ValueError(f'Algorithm {algorithm} não supported.')
    return len(codec(data.encode()))

def ncd(x, x2, algorithm):
    """Return the Normalized Compression Distance between two strings.

    Computes ``(C(x x2) - min(C(x), C(x2))) / max(C(x), C(x2))`` where ``C``
    is ``compress_data`` with the given ``algorithm`` and ``x x2`` is the two
    strings joined by a single space.
    """
    size_x = compress_data(x, algorithm)
    size_x2 = compress_data(x2, algorithm)
    size_joint = compress_data(x + " " + x2, algorithm)
    smaller, larger = sorted((size_x, size_x2))
    return (size_joint - smaller) / larger

def load_train_data(ad_file, cn_file):
    """Load the training texts of both classes with their labels.

    Args:
        ad_file: Text file whose lines are labelled 1.
        cn_file: Text file whose lines are labelled 0.

    Returns:
        ``(x_train, y_train)``: the lines of ``ad_file`` followed by the lines
        of ``cn_file``, and the matching list of 1/0 labels.
    """
    # Explicit UTF-8 so the result does not depend on the platform default
    # encoding (the same files are read as UTF-8 by load_texts_from_file).
    with open(ad_file, 'r', encoding='utf-8') as f_ad:
        x_train_ad = f_ad.read().splitlines()

    with open(cn_file, 'r', encoding='utf-8') as f_cn:
        x_train_cn = f_cn.read().splitlines()

    x_train = x_train_ad + x_train_cn
    y_train = [1] * len(x_train_ad) + [0] * len(x_train_cn)

    return x_train, y_train

def load_test_data(test_file, labels_file):
    """Load the test texts and their binary labels.

    Args:
        test_file: Text file with one sample per line.
        labels_file: CSV file with a header row followed by ``id,dx`` rows.

    Returns:
        ``(x_test, y_test)``: the lines of ``test_file`` and one label per
        data row of ``labels_file`` (1 when ``dx`` is ``'ProbableAD'``,
        0 otherwise). The two lists are paired by position only.

    Raises:
        ValueError: If a non-blank data row does not have exactly two
            comma-separated fields.
    """
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(test_file, 'r', encoding='utf-8') as f:
        x_test = f.read().splitlines()

    y_test = []
    with open(labels_file, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header; the default avoids StopIteration on an empty file
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline) carries no label.
                continue
            id_, dx = line.split(',')
            y_test.append(1 if dx == 'ProbableAD' else 0)

    return x_test, y_test

# Load the training and test data
train_ad_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\train_ad.txt"
train_cn_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\train_cn.txt"
x_train, y_train = load_train_data(train_ad_file, train_cn_file)

test_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\test.txt"
labels_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\test_labels.csv"
x_test, y_test = load_test_data(test_file, labels_file)

# Merge the training and test sets into one pool for leave-one-out evaluation
x_combined = x_train + x_test
y_combined = y_train + y_test
y_combined_arr = np.array(y_combined)

algorithms = ['gzip', 'brotli', 'lz4', 'lzma', 'bz2', 'pysmaz', 'fpzip', 'snappy', "huffman", "lz", 'rice', "rle", 'sf', 'zstd']

k_values = [1, 3, 5]

loo = LeaveOneOut()

for algorithm in algorithms:
    print(f'Using: {algorithm}')

    # The NCD of a pair of texts depends on neither the fold nor k, so the
    # full pairwise matrix is computed once per algorithm and sliced per fold
    # instead of being rebuilt inside every fold. This assumes the compressors
    # return the same size for the same input on every call.
    full_ncd = np.array([[ncd(a, b, algorithm) for b in x_combined] for a in x_combined])

    for k in k_values:
        y_true = []
        y_pred = []

        for train_index, test_index in loo.split(x_combined):
            held_out = test_index[0]

            # Distances among the fold's training texts (one row per text).
            train_ncd = full_ncd[np.ix_(train_index, train_index)]
            # Distances from each training text to the held-out text.
            test_ncd = full_ncd[train_index, held_out]

            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(train_ncd, y_combined_arr[train_index])
            y_pred_loo = knn.predict(test_ncd.reshape(1, -1))[0]

            y_true.append(y_combined[held_out])
            y_pred.append(y_pred_loo)

        # Confusion matrix and metrics over all folds. labels=[0, 1] pins the
        # matrix to 2x2, so no shape-dependent fallback is needed.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        accuracy = (tp + tn) / len(y_true)
        # Per-class precision; 0 when the class was never predicted.
        precision_alzheimer = tp / (tp + fp) if (tp + fp) > 0 else 0
        precision_control = tn / (tn + fn) if (tn + fn) > 0 else 0

        print(f'k={k}, Test Accuracy using {algorithm}: {accuracy}')
        print(f'Precision for Alzheimer (Class 1) for k={k} using {algorithm}: {precision_alzheimer}')
        print(f'Precision for Control (Class 0) for k={k} using {algorithm}: {precision_control}')
        print(f'TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}')
        print('-'*50)

Using: gzip
k=1, Test Accuracy using gzip: 0.70042194092827
Precision for Alzheimer (Class 1) for k=1 using gzip: 0.717948717948718
Precision for Control (Class 0) for k=1 using gzip: 0.6833333333333333
TP: 84, FP: 33, FN: 38, TN: 82
--------------------------------------------------
k=3, Test Accuracy using gzip: 0.7088607594936709
Precision for Alzheimer (Class 1) for k=3 using gzip: 0.7226890756302521
Precision for Control (Class 0) for k=3 using gzip: 0.6949152542372882
TP: 86, FP: 33, FN: 36, TN: 82
--------------------------------------------------
k=5, Test Accuracy using gzip: 0.7130801687763713
Precision for Alzheimer (Class 1) for k=5 using gzip: 0.7327586206896551
Precision for Control (Class 0) for k=5 using gzip: 0.6942148760330579
TP: 85, FP: 31, FN: 37, TN: 84
--------------------------------------------------
Using: brotli
k=1, Test Accuracy using brotli: 0.5907172995780591
Precision for Alzheimer (Class 1) for k=1 using brotli: 0.5899280575539568
Precision for Control 

: 

In [1]:
import gzip
import brotli
import lz4.frame
import lzma
import bz2
import pysmaz
import fpzip
import snappy
from pybcl import huffman_compress
from pybcl import lz_compress_fast
from pybcl import rice_compress, RiceFormat
from pybcl import rle_compress
from pybcl import sf_compress
import zstandard as zstd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut

def text_to_float_array(text):
    """Return the Unicode code points of ``text`` as a 1-D float32 array."""
    code_points = list(map(ord, text))
    return np.array(code_points, dtype=np.float32)

def compress_data(data, algorithm):
    """Return the size in bytes of ``data`` compressed with ``algorithm``.

    Args:
        data: Text to compress. It is encoded with ``str.encode()`` for every
            algorithm except ``'fpzip'``, which receives the float32 array
            built by ``text_to_float_array``.
        algorithm: One of ``'gzip'``, ``'brotli'``, ``'lz4'``, ``'lzma'``,
            ``'bz2'``, ``'pysmaz'``, ``'fpzip'``, ``'snappy'``, ``'huffman'``,
            ``'lz'``, ``'rice'``, ``'rle'``, ``'sf'`` or ``'zstd'``.

    Returns:
        Length of the compressed output.

    Raises:
        ValueError: If ``algorithm`` is not one of the names above.
    """
    # fpzip is the only codec that does not work on the encoded bytes.
    if algorithm == 'fpzip':
        return len(fpzip.compress(text_to_float_array(data)))

    # Lambdas keep the lookup lazy: only the selected codec is touched.
    byte_codecs = {
        'gzip': lambda raw: gzip.compress(raw),
        'brotli': lambda raw: brotli.compress(raw),
        'lz4': lambda raw: lz4.frame.compress(raw),
        'lzma': lambda raw: lzma.compress(raw),
        'bz2': lambda raw: bz2.compress(raw),
        'pysmaz': lambda raw: pysmaz.compress(raw),
        'snappy': lambda raw: snappy.compress(raw),
        'huffman': lambda raw: huffman_compress(raw),
        'lz': lambda raw: lz_compress_fast(raw),
        'rice': lambda raw: rice_compress(raw, RiceFormat.UINT8),
        'rle': lambda raw: rle_compress(raw),
        'sf': lambda raw: sf_compress(raw),
        'zstd': lambda raw: zstd.ZstdCompressor().compress(raw),
    }
    codec = byte_codecs.get(algorithm)
    if codec is None:
        raise ValueError(f'Algorithm {algorithm} não supported.')
    return len(codec(data.encode()))

def ncd(x, x2, algorithm):
    """Return the Normalized Compression Distance between two strings.

    Computes ``(C(x x2) - min(C(x), C(x2))) / max(C(x), C(x2))`` where ``C``
    is ``compress_data`` with the given ``algorithm`` and ``x x2`` is the two
    strings joined by a single space.
    """
    size_x = compress_data(x, algorithm)
    size_x2 = compress_data(x2, algorithm)
    size_joint = compress_data(x + " " + x2, algorithm)
    smaller, larger = sorted((size_x, size_x2))
    return (size_joint - smaller) / larger

def load_train_data(ad_file, cn_file):
    """Load the training texts of both classes with their labels.

    Args:
        ad_file: Text file whose lines are labelled 1.
        cn_file: Text file whose lines are labelled 0.

    Returns:
        ``(x_train, y_train)``: the lines of ``ad_file`` followed by the lines
        of ``cn_file``, and the matching list of 1/0 labels.
    """
    # Explicit UTF-8 so the result does not depend on the platform default
    # encoding (the same files are read as UTF-8 by load_texts_from_file).
    with open(ad_file, 'r', encoding='utf-8') as f_ad:
        x_train_ad = f_ad.read().splitlines()

    with open(cn_file, 'r', encoding='utf-8') as f_cn:
        x_train_cn = f_cn.read().splitlines()

    x_train = x_train_ad + x_train_cn
    y_train = [1] * len(x_train_ad) + [0] * len(x_train_cn)

    return x_train, y_train

def load_test_data(test_file, labels_file):
    """Load the test texts and their binary labels.

    Args:
        test_file: Text file with one sample per line.
        labels_file: CSV file with a header row followed by ``id,dx`` rows.

    Returns:
        ``(x_test, y_test)``: the lines of ``test_file`` and one label per
        data row of ``labels_file`` (1 when ``dx`` is ``'ProbableAD'``,
        0 otherwise). The two lists are paired by position only.

    Raises:
        ValueError: If a non-blank data row does not have exactly two
            comma-separated fields.
    """
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(test_file, 'r', encoding='utf-8') as f:
        x_test = f.read().splitlines()

    y_test = []
    with open(labels_file, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header; the default avoids StopIteration on an empty file
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline) carries no label.
                continue
            id_, dx = line.split(',')
            y_test.append(1 if dx == 'ProbableAD' else 0)

    return x_test, y_test

# Load the training and test data
train_ad_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\train_ad.txt"
train_cn_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\train_cn.txt"
x_train, y_train = load_train_data(train_ad_file, train_cn_file)

test_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\test.txt"
labels_file = r"C:\Users\Lenovo\Downloads\ADReSSo21\diagnosis\test_labels.csv"
x_test, y_test = load_test_data(test_file, labels_file)

# Merge the training and test sets into one pool for leave-one-out evaluation
x_combined = x_train + x_test
y_combined = y_train + y_test
y_combined_arr = np.array(y_combined)

algorithms = ['brotli', 'lz4', 'lzma', 'bz2', 'pysmaz', 'fpzip', 'snappy', "huffman", "lz", 'rice', "rle", 'sf', 'zstd']

k_values = [3, 5]

loo = LeaveOneOut()

for algorithm in algorithms:
    print(f'Using: {algorithm}')

    # The NCD of a pair of texts depends on neither the fold nor k, so the
    # full pairwise matrix is computed once per algorithm and sliced per fold
    # instead of being rebuilt inside every fold. This assumes the compressors
    # return the same size for the same input on every call.
    full_ncd = np.array([[ncd(a, b, algorithm) for b in x_combined] for a in x_combined])

    for k in k_values:
        y_true = []
        y_pred = []

        for train_index, test_index in loo.split(x_combined):
            held_out = test_index[0]

            # Distances among the fold's training texts (one row per text).
            train_ncd = full_ncd[np.ix_(train_index, train_index)]
            # Distances from each training text to the held-out text.
            test_ncd = full_ncd[train_index, held_out]

            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(train_ncd, y_combined_arr[train_index])
            y_pred_loo = knn.predict(test_ncd.reshape(1, -1))[0]

            y_true.append(y_combined[held_out])
            y_pred.append(y_pred_loo)

        # Confusion matrix and metrics over all folds. labels=[0, 1] pins the
        # matrix to 2x2, so no shape-dependent fallback is needed.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        accuracy = (tp + tn) / len(y_true)
        # Per-class precision; 0 when the class was never predicted.
        precision_alzheimer = tp / (tp + fp) if (tp + fp) > 0 else 0
        precision_control = tn / (tn + fn) if (tn + fn) > 0 else 0

        print(f'k={k}, Test Accuracy using {algorithm}: {accuracy}')
        print(f'Precision for Alzheimer (Class 1) for k={k} using {algorithm}: {precision_alzheimer}')
        print(f'Precision for Control (Class 0) for k={k} using {algorithm}: {precision_control}')
        print(f'TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}')
        print('-'*50)

Using: brotli


## Regressão