In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

In [2]:
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_auc_score, recall_score, f1_score
from sklearn.svm import OneClassSVM
from sklearn.feature_selection import chi2, SelectKBest

In [4]:
df = pd.read_excel('hcv.xlsx', "Recorte - Planilha (10SNP's)")

In [5]:
features = ['ptx3_0', 'ptx3_1', 'mbl', 'il_0', 'il_1', 'il_2', 'tnf', 'sod', 'mpo', 'il_3']

In [6]:
target = 'fibrose'

In [7]:
sorted(Counter(df[target]).items())

[('F0', 6), ('F1', 58), ('F2', 80), ('F3', 70), ('F4', 41), ('HCC', 42)]

In [8]:
df = df[(df[target] == 'HCC') | (df[target] == 'F4')]

In [9]:
non_cancer_rows = df[features][df[target] != 'HCC']
non_cancer_targets = df[target][df[target] != 'HCC']

In [10]:
cancer_targets = df[target][df[target] == 'HCC']
cancer_rows = df[features][df[target] == 'HCC']

In [11]:
upper_index = int(len(cancer_rows) * 0.8)

In [12]:
X_train = cancer_rows[0 : upper_index]
y_train = cancer_targets[0 : upper_index]

In [13]:
X_test = pd.concat([cancer_rows[upper_index : ], non_cancer_rows])
y_test = pd.concat([cancer_targets[upper_index : ], non_cancer_targets])

In [14]:
def get_alleles(col):
    alleles = set()
    for snp in col.unique():
        alleles.add(snp[0])
        alleles.add(snp[1])
    return list(alleles)

In [15]:
def rec_col_map(col):
    dataset = {}
    alleles = get_alleles(col)
    for a in alleles:
        col_name = f'{col.name}_{a}'
        col_data = [int(a in c) for c in col.values]
        dataset[col_name] = col_data
    return dataset

In [16]:
def rec_encoding(X):
    dataset = {}
    for col in X:
        dataset.update(rec_col_map(X[col]))
    return pd.DataFrame(dataset)

In [17]:
X_train = rec_encoding(X_train)
y_train = y_train.map(lambda x: 1 if x == 'HCC' else 0)

In [18]:
X_test = rec_encoding(X_test)
y_test = y_test.map(lambda x: 1 if x == 'HCC' else 0)

In [None]:
X_train = SelectKBest(chi2, k=3).fit_transform(X_train)
X_test = SelectKBest(chi2, k=3).fit_transform(X_test)

In [19]:
one_svm = OneClassSVM(gamma='scale').fit(X_train)

In [20]:
y_pred = one_svm.predict(X_test)

In [21]:
y_pred

array([-1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1, -1,  1,
       -1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1])

In [22]:
y_test.values

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])