# Machine Learning for features

## Import packages

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import shap
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from tqdm import tqdm

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Load data

In [2]:
# Read the tab-separated parameter table.
df_all = pd.read_csv('../data/paras.txt', sep='\t')

# Split the data into two subsets by the 'Disease' label; rows labelled
# 'ASD_Cancer' are included in both subsets.
df_ASD_AC = df_all.loc[lambda frame: frame['Disease'].isin(['ASD', 'ASD_Cancer'])]
df_Cancer_AC = df_all.loc[lambda frame: frame['Disease'].isin(['Cancer', 'ASD_Cancer'])]

## Recursive feature elimination (RFE)

In [3]:
import warnings

# Suppress every warning category from this point on in the kernel session.
warnings.filterwarnings('ignore')

# Seed for the row shuffle below. StratifiedKFold(shuffle=True, random_state=...)
# picks fold members by row position, so an unseeded shuffle would make the
# cross-validation scores differ on every run of this cell.
SHUFFLE_SEED = 42

# Features and labels
X = df_Cancer_AC[['Co.evolution','Consurf_Score','Entropy','RASA','ddG',
             'Betweenness','Closeness','Degree','Eigenvector',
             'Clustering.coefficient','Effectiveness','Sensitivity',
             'MSF','DFI','Stiffness']]
y = df_Cancer_AC['Disease']

# Shuffle rows reproducibly. The index is reset first so positions run 0..n-1,
# and the same permutation is applied to X1 and y1 to keep them aligned.
shuffle_index = np.random.default_rng(SHUFFLE_SEED).permutation(len(X))
X1 = X.reset_index(drop=True).iloc[shuffle_index]
y1 = y.reset_index(drop=True).iloc[shuffle_index]

# Binary target: 'ASD_Cancer' -> 1, 'Cancer' -> 0.
y1_encode = y1.map({'ASD_Cancer': 1, 'Cancer': 0})

# Flat result list that alternates entries:
# [features_for_1, score_for_1, features_for_2, score_for_2, ...]
select_feature = []

# The fold splitter does not depend on the number of features kept,
# so it is built once instead of on every iteration.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# RFE: for every subset size from 1 up to the number of feature columns,
# select that many features and cross-validate a model restricted to them.
# NOTE(review): RFE is fitted on all rows before cross-validation, so the
# held-out folds have already influenced which features were chosen and the
# reported scores are optimistic. For an unbiased estimate, wrap RFE and the
# classifier in a sklearn Pipeline and cross-validate the pipeline instead.
for i in range(X1.shape[1]):
    model = LGBMClassifier(verbose = -1, n_estimators = 1000, max_depth = 5)
    rfe = RFE(model, n_features_to_select= i+1)
    rfe.fit(X1, y1_encode)
    # Boolean support mask -> names of the retained columns.
    selected_features = X1.columns[rfe.support_]
    scores = cross_val_score(model, X1[selected_features], y1_encode, cv=cv)
    print("----------------"+ str(i+1) + "---------------")
    print("selected features:", selected_features)
    print("cross validation score:", scores.mean())
    select_feature.append(selected_features)
    select_feature.append(scores.mean())

----------------1---------------
selected features: Index(['Closeness'], dtype='object')
cross validation score: 0.885413744740533
----------------2---------------
selected features: Index(['Closeness', 'Degree'], dtype='object')
cross validation score: 0.9610331930808791
----------------3---------------
selected features: Index(['Closeness', 'Degree', 'Effectiveness'], dtype='object')
cross validation score: 0.9783076203833566
----------------4---------------
selected features: Index(['Closeness', 'Degree', 'Eigenvector', 'Effectiveness'], dtype='object')
cross validation score: 0.9783076203833566
----------------5---------------
selected features: Index(['Betweenness', 'Closeness', 'Degree', 'Eigenvector', 'Effectiveness'], dtype='object')
cross validation score: 0.9761570827489481
----------------6---------------
selected features: Index(['Betweenness', 'Closeness', 'Degree', 'Eigenvector', 'Effectiveness',
       'Stiffness'],
      dtype='object')
cross validation score: 0.9761570

In [None]:
select_feature = 