## Build a random forest classifier (RFC) for each protein, using all the other proteins as features

In [1]:
import os
import pandas as pd
# List the entries of the data directory. Iterating over the path string
# itself (as a comprehension over '../data/sub1/') would only yield its
# individual characters, not file names.
obs_files = os.listdir('../data/sub1/')
# Tab-separated table; the cells below index it by its 'Gene_ID' column.
ground_truth_data = pd.read_csv('../data/sub1/data_true.txt', sep='\t')

In [2]:
# Identifiers of all genes/proteins, in file order.
genes = ground_truth_data['Gene_ID'].values
# Transpose so that every gene becomes a column and every remaining
# original column becomes a row.
by_genes = ground_truth_data.set_index('Gene_ID').T
# Set up the first gene as a worked example of the target/feature split.
gene = genes[0]
# `.values` is used instead of `.as_matrix()`: it returns the same ndarray
# and, unlike `as_matrix`, is still available in current pandas releases.
y = by_genes[gene].values
# Binary target: 1 when the measured value is strictly positive, else 0.
ycl = by_genes[gene].apply(lambda x: int(x > 0)).values
# Features: every gene column except the target gene.
X = by_genes.drop([gene], axis=1).values

# low (y = 0) vs. normal (y = 1) Classification with RFC + SMOTETomek

In [54]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_recall_fscore_support
from imblearn.combine import SMOTETomek, SMOTEENN
import pickle, gzip
import warnings
# Ignore every warning of category DeprecationWarning from here on.
# NOTE(review): presumably meant to hide deprecation notices raised by the
# libraries imported above -- confirm this is still wanted.
warnings.filterwarnings("ignore", category=DeprecationWarning)

def _resample_with_smote(X, ycl, k_candidates=(5, 2)):
    """Balance the classes with SMOTETomek, trying each ``k`` in order.

    Returns the ``(X_resampled, y_resampled)`` pair of the first attempt that
    succeeds. If every attempt fails, the inputs are returned unchanged so the
    caller can still train on the raw data.
    """
    for k in k_candidates:
        try:
            # NOTE(review): `k=` and `fit_sample` belong to the older
            # imbalanced-learn API this notebook was written against; newer
            # releases use `smote=SMOTE(k_neighbors=...)` and `fit_resample`.
            # Confirm against the installed version.
            X_smt, y_smt = SMOTETomek(k=k).fit_sample(X, ycl)
        except Exception as exc:
            # Best-effort: report the failure and move on to the next
            # (smaller) neighbourhood instead of hiding it silently.
            warnings.warn(
                "SMOTETomek(k={}) failed: {}".format(k, exc), RuntimeWarning)
        else:
            return X_smt, y_smt
    return X, ycl


def rfc_with_smote(X, ycl, n_estimators=32, min_samples_leaf=5, max_depth=3):
    """Train a random forest classifier on SMOTETomek-balanced data.

    Steps: resample (k=5, falling back to k=2, falling back to the raw data),
    split 80/20 with a fixed seed, select features with a first random
    forest, then fit a second random forest on the selected features and
    score it on the held-out part.

    Parameters
    ----------
    X : feature matrix.
    ycl : class labels; values > 0 are treated as class 1, others as class 0.
    n_estimators, min_samples_leaf, max_depth : forwarded to both
        ``RandomForestClassifier`` instances.

    Returns
    -------
    (rfc, precision, recall, f1, support) : the fitted classifier followed by
        the four values produced by ``evaluate_model``.
    """
    # Previously the k=2 attempt always ran and overwrote a successful k=5
    # result; the helper now only falls back when an attempt fails.
    X_smt, y_smt = _resample_with_smote(X, ycl)

    # NOTE(review): the split happens after resampling, so the held-out part
    # contains resampled rows too -- confirm this is the intended evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X_smt, y_smt, test_size=0.20, random_state=42)
    ycl_train = (y_train > 0).astype(int)
    ycl_test = (y_test > 0).astype(int)

    forest_params = dict(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
    )

    # First forest: used only to pick the informative features.
    sfm = SelectFromModel(RandomForestClassifier(**forest_params))
    sfm.fit(X_train, ycl_train)
    X_train = sfm.transform(X_train)
    X_test = sfm.transform(X_test)

    # Second forest: the actual model, trained on the reduced feature set.
    # NOTE(review): `sfm` is not returned, so the returned classifier expects
    # inputs that were already reduced by this selector.
    rfc = RandomForestClassifier(**forest_params)
    # `fit` is the training call; classifiers no longer expose `fit_transform`.
    rfc.fit(X_train, ycl_train)
    precision, recall, f1, support = evaluate_model(rfc, X_test, ycl_test, threshold=0.5)

    return rfc, precision, recall, f1, support

def save_model(model, model_name, model_output_path='../model/sub1/RFC/'):
    """Pickle ``model`` into ``<model_output_path>/<model_name>.pkl.gz``.

    The output directory (including missing parents) is created first when it
    does not exist yet. The pickle stream is gzip-compressed.
    """
    directory_exists = os.path.isdir(model_output_path)
    if not directory_exists:
        os.makedirs(model_output_path)

    archive_name = model_name + '.pkl.gz'
    archive_path = os.path.join(model_output_path, archive_name)
    with gzip.open(archive_path, 'wb') as archive:
        pickle.dump(model, archive)
        
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Score a probabilistic binary classifier on held-out data.

    A sample is predicted as class 1 ("normal") when the model's probability
    for class 1 is strictly greater than ``threshold``; otherwise it is
    predicted as class 0 ("low").

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : features to predict on.
    y_test : true 0/1 labels for ``X_test``.
    threshold : probability cut-off for predicting class 1.

    Returns
    -------
    (precision, recall, f1, support) as returned by
    ``precision_recall_fscore_support``.
    """
    proba = pd.DataFrame(model.predict_proba(X_test))
    # Find the class-1 column through `classes_` instead of assuming exactly
    # two columns ordered [0, 1]; a model fitted on a single class only
    # produces one probability column. Models without `classes_` fall back
    # to positional columns.
    classes = list(getattr(model, 'classes_', range(proba.shape[1])))
    if 1 in classes:
        prob_normal = proba.iloc[:, classes.index(1)]
        y_pred = (prob_normal > threshold).astype(int).values
    else:
        # The model never saw class 1, so it can never predict it.
        y_pred = [0] * len(proba)
    precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred)
    return precision, recall, f1, support

In [56]:
# Train one RFC model per gene: the gene's own column is the target and all
# other gene columns are the features. Metrics are collected per model and
# optionally written to a CSV summary.
from collections import namedtuple

results_cols = ['model_name', 'precision', 'recall', 'f1', 'support']
RFCResult = namedtuple('RFCResult', ' '.join(results_cols))
should_save_model = False   # pickle every trained model via save_model
should_save_results = True  # write the metrics table to disk
RFCResults = []

for gene in genes:
    model_name = 'RFC_{gene_name}'.format(gene_name=gene)
    print("Training RFC for gene: {}".format(model_name))
    # `.values` replaces `.as_matrix()`, which current pandas no longer has.
    y = by_genes[gene].values
    # Binary target: 1 when the value is strictly positive, else 0.
    ycl = by_genes[gene].apply(lambda x: int(x > 0)).values
    X = by_genes.drop([gene], axis=1).values

    model, precision, recall, f1, support = rfc_with_smote(X, ycl)
    RFCResults.append(RFCResult(
        model_name=model_name,
        precision=precision,
        recall=recall,
        f1=f1,
        support=support
    ))

    if should_save_model:
        save_model(model, model_name)

if should_save_results:
    result_output_path = '../data/sub1/RFC/summary/'
    if not os.path.isdir(result_output_path):
        os.makedirs(result_output_path)
    # Each metric cell holds a per-class array, so the CSV stores its string
    # representation.
    pd.DataFrame.from_records(RFCResults, columns=results_cols).to_csv(
        os.path.join(result_output_path, 'model_performances.csv'),
        index=False)

In [57]:
# Display the collected metrics as a table, one row per trained model.
pd.DataFrame.from_records(RFCResults, columns=results_cols)

Unnamed: 0,model_name,precision,recall,f1,support
0,RFC_Protein_1,"[1.0, 0.923076923077]","[0.9375, 1.0]","[0.967741935484, 0.96]","[16, 12]"
1,RFC_Protein_2,"[1.0, 1.0]","[1.0, 1.0]","[1.0, 1.0]","[13, 15]"
2,RFC_Protein_3,"[1.0, 0.875]","[0.857142857143, 1.0]","[0.923076923077, 0.933333333333]","[14, 14]"
3,RFC_Protein_4,"[1.0, 0.882352941176]","[0.857142857143, 1.0]","[0.923076923077, 0.9375]","[14, 15]"
4,RFC_Protein_5,"[1.0, 0.941176470588]","[0.923076923077, 1.0]","[0.96, 0.969696969697]","[13, 16]"
5,RFC_Protein_6,"[1.0, 1.0]","[1.0, 1.0]","[1.0, 1.0]","[16, 15]"
6,RFC_Protein_7,"[1.0, 0.857142857143]","[0.875, 1.0]","[0.933333333333, 0.923076923077]","[16, 12]"
7,RFC_Protein_8,"[1.0, 1.0]","[1.0, 1.0]","[1.0, 1.0]","[11, 17]"
8,RFC_Protein_9,"[1.0, 0.944444444444]","[0.916666666667, 1.0]","[0.95652173913, 0.971428571429]","[12, 17]"
9,RFC_Protein_10,"[1.0, 0.941176470588]","[0.916666666667, 1.0]","[0.95652173913, 0.969696969697]","[12, 16]"
