## Build a random forest classifier (RFC) for one protein from all the other proteins

In [42]:
import os
import pandas as pd
# List the entries of the data directory. Iterating over the path string
# itself would only yield its individual characters, so ask the OS for the
# directory contents instead (sorted for a stable order across platforms).
# NOTE(review): this lists every entry, including data_true.txt — filter here
# if only the observation files are wanted.
obs_files = sorted(os.listdir('../data/sub1/'))
# Tab-separated table; later cells index it by its 'Gene_ID' column.
ground_truth_data = pd.read_csv('../data/sub1/data_true.txt', sep='\t')

In [87]:
genes = ground_truth_data['Gene_ID'].values
# Transpose so that every gene becomes a column and every remaining column of
# the input table becomes a row (presumably one row per sample — confirm).
by_genes = ground_truth_data.set_index('Gene_ID').T
gene = genes[0]
# `.as_matrix()` was removed in pandas 1.0; `.values` returns the same ndarray.
y = by_genes[gene].values
# Binary label: 1 where the target value is strictly positive, else 0.
ycl = (by_genes[gene] > 0).astype(int).values
# Features are all the other genes' columns.
X = by_genes.drop([gene], axis=1).values

# Classifying low (y = 0) vs. normal (y = 1) with a balanced dataset

In [120]:
def rfc_with_smote(X, ycl, n_estimators=32, min_samples_leaf=5, max_depth=3):
    """Fit a random forest classifier on a SMOTE-Tomek resampled dataset.

    The data is resampled with ``SMOTETomek``, split 80/20 into train and
    test parts (``random_state=42``), and a ``RandomForestClassifier`` is fit
    on the training part.

    Args:
        X: Feature matrix with one row per sample.
        ycl: Class label per row of ``X``; values > 0 are treated as the
            positive class.
        n_estimators: Number of trees in the forest.
        min_samples_leaf: Minimum number of samples required at a leaf.
        max_depth: Maximum depth of each tree.

    Returns:
        A ``(rfc, test_prediction_df)`` tuple: the fitted classifier and a
        DataFrame over the held-out rows with the columns
        ``grount_truth_class`` (true label), ``Prob_low`` and ``Prob_normal``
        (the two columns of ``predict_proba``).
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from imblearn.combine import SMOTETomek

    # NOTE(review): resampling happens before the train/test split, so
    # synthetic samples interpolated from training rows can end up in the
    # test part and inflate the reported scores. Consider splitting first and
    # resampling only the training part.
    sm = SMOTETomek()
    # `fit_sample` was renamed to `fit_resample` in imbalanced-learn.
    X_smt, y_smt = sm.fit_resample(X, ycl)
    X_train, X_test, y_train, y_test = train_test_split(
        X_smt, y_smt, test_size=0.20, random_state=42)
    ycl_train = (y_train > 0).astype(int)
    ycl_test = (y_test > 0).astype(int)

    rfc = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
    )
    # Classifiers no longer provide `fit_transform`; the transformed output
    # was discarded anyway, so a plain `fit` is what is needed.
    rfc.fit(X_train, ycl_train)

    # The column labels assume the fitted classes are [0, 1] in that order.
    pred = pd.DataFrame(rfc.predict_proba(X_test),
                        columns=['Prob_low', 'Prob_normal'])
    # The misspelled 'grount_truth_class' is kept because callers may read it.
    truth = pd.DataFrame(ycl_test, columns=['grount_truth_class'])
    test_prediction_df = pd.concat([truth, pred], axis=1)

    return rfc, test_prediction_df

# Use the first gene as the prediction target and all other genes as features.
gene = genes[0]
# `.as_matrix()` was removed in pandas 1.0; `.values` returns the same ndarray.
y = by_genes[gene].values
# Binary label: 1 where the target value is strictly positive, else 0.
ycl = (by_genes[gene] > 0).astype(int).values
X = by_genes.drop([gene], axis=1).values

model, prediction = rfc_with_smote(X, ycl)



In [139]:
from sklearn.metrics import precision_recall_fscore_support
# Probability cut-off above which a row is predicted as class 1.
threshold = 0.5
# `ycl_test` only exists inside rfc_with_smote, so take the held-out true
# labels from the frame that function returns.
ycl_test = prediction['grount_truth_class'].values
# Cast to int so predictions and true labels share the same label type.
y_pred = (prediction['Prob_normal'] > threshold).astype(int)
precision, recall, f1, support = precision_recall_fscore_support(ycl_test, y_pred)

In [140]:
# Each array holds one entry per class label, in sorted label order (0, then 1).
metrics = (precision, recall, f1, support)
print(*metrics)

[ 1.          0.85714286] [ 0.875  1.   ] [ 0.93333333  0.92307692] [16 12]
