In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from collections import Counter

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_auc_score, recall_score, f1_score
from sklearn.svm import SVC, OneClassSVM
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, SelectKBest

In [3]:
df = pd.read_excel('hcv.xlsx', "Recorte - Planilha (10SNP's)")

In [4]:
features = ['ptx3_0', 'ptx3_1', 'mbl', 'il_0', 'il_1', 'il_2', 'tnf', 'sod', 'mpo', 'il_3']

In [5]:
target = 'fibrose'

In [6]:
sorted(Counter(df[target]).items())

[('F0', 6), ('F1', 58), ('F2', 80), ('F3', 70), ('F4', 41), ('HCC', 42)]

In [7]:
df = df[(df[target] == 'HCC') | (df[target] == 'F4') | (df[target] == 'F3')]

In [8]:
X = df[features]
y = df[target]

In [9]:
sorted(Counter(y).items())

[('F3', 70), ('F4', 41), ('HCC', 42)]

In [198]:
def get_alleles(col):
    alleles = set()
    for snp in col.unique():
        alleles.add(snp[0])
        alleles.add(snp[1])
    return list(alleles)

In [199]:
def rec_col_map(col):
    dataset = {}
    alleles = get_alleles(col)
    for a in alleles:
        col_name = f'{col.name}_{a}'
        col_data = [int(a in c) for c in col.values]
        dataset[col_name] = col_data
    return dataset

In [200]:
def rec_encoding():
    dataset = {}
    for col in X:
        dataset.update(rec_col_map(X[col]))
    return pd.DataFrame(dataset)

In [201]:
X = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True).fit_transform(X)
X.head()

Unnamed: 0,ptx3_0_AG,ptx3_0_GG,ptx3_0_AA,ptx3_1_AG,ptx3_1_GG,ptx3_1_AA,mbl_YX,mbl_YY,mbl_XX,il_0_AA,...,tnf_AA,sod_AA,sod_GA,sod_GG,mpo_GA,mpo_GG,mpo_AA,il_3_TT,il_3_CT,il_3_CC
2,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
8,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
9,1,0,0,1,0,0,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
12,0,1,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
13,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0


In [202]:
# X = rec_encoding()
# X.head()

In [203]:
X = SelectKBest(chi2, k=2).fit_transform(X, y)

In [204]:
# X = PCA(n_components=3).fit_transform(X)
# X.shape

In [205]:
y = y.map(lambda x: 1 if x in ['HCC', 'F4'] else 0)
sorted(Counter(y).items())

[(0, 70), (1, 83)]

In [206]:
y_pred = []

In [207]:
for train_index, test_index in LeaveOneOut().split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    rfc = RandomForestClassifier(criterion='gini', max_depth=6, max_features='auto', n_estimators=100, class_weight='balanced')
    rfc = rfc.fit(X_train, y_train)
    y_pred.append(rfc.predict(X_test)[0])

In [208]:
precision_score(y, y_pred)

0.3157894736842105

In [209]:
accuracy_score(y, y_pred)

0.3660130718954248

In [210]:
roc_auc_score(y, y_pred)

0.3865748709122203

In [211]:
recall_score(y, y_pred)

0.14457831325301204

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.15, random_state=42)

In [19]:
rfc = RandomForestClassifier(random_state=42, class_weight='balanced')

In [20]:
svc = SVC(random_state=42, class_weight='balanced')

In [21]:
param_grid = { 
    'n_estimators': [100, 150, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [6, 7, 8],
    'criterion': ['gini', 'entropy']
}
cv_clf = GridSearchCV(cv=5, param_grid=param_grid, estimator=rfc, n_jobs=-1)

In [22]:
# param_grid = {
#     'C': [0.001, 0.01, 0.1, 1, 10],
#     'gamma': [0.001, 0.01, 0.1, 1],
#     'kernel': ['rbf', 'linear', 'poly']
# }
# cv_clf = GridSearchCV(cv=5, param_grid=param_grid, estimator=svc, n_jobs=-1)

In [23]:
cv_clf.fit(X_train, y_train)
cv_clf.best_params_

{'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'auto',
 'n_estimators': 100}

In [37]:
rfc = RandomForestClassifier(criterion='gini', max_depth=6, max_features='auto', n_estimators=100, class_weight='balanced')

In [38]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [39]:
rfc.feature_importances_

array([0.6308019, 0.3691981])

In [40]:
# svc = SVC(C=0.01, gamma=1, kernel='poly', class_weight='balanced')

In [41]:
# svc.fit(X_train, y_train)

In [29]:
y_pred_rf = rfc.predict(X_test)

In [30]:
# y_pred_sv = svc.predict(X_test)

In [31]:
confusion_matrix(y_test, y_pred_rf)

array([[5, 1],
       [2, 5]])

In [32]:
accuracy_score(y_test, y_pred_rf)

0.7692307692307693

In [33]:
roc_auc_score(y_test, y_pred_rf)

0.773809523809524

In [34]:
precision_score(y_test, y_pred_rf)

0.8333333333333334

In [35]:
y_pred_rf

array([0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1])

In [36]:
y_test.values

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1])