# After Using Previous Notebooks Use This Notebook to Make Predictions

In [1]:
# Imports --- All of this may not be vital


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss, f1_score, fbeta_score, recall_score, precision_score, confusion_matrix



from pprint import pprint
from sklearn.preprocessing import MinMaxScaler


from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import *

# Homemade functions required
from data_prep_functions import *
from interpro_scraping import interpro_scraping_pandas

### Import Data

In [2]:
### import data used to train classifiers ###

plasma_total_data_names = pd.read_excel("data/"+'gt15_plasma_features_names_biopy_gravy.xlsx', header=0, index_col=0)
# gt6_data = pd.read_excel("data/"+'gt6_plasma_features_names_biopy.xlsx', header=0, index_col=0)
csf_total_data_names = pd.read_excel("data/"+'gt15_csf_features_names_biopy_gravy.xlsx', header=0,index_col=0)


## sort into names and features
features_plasma = plasma_total_data_names.copy()
features_plasma = features_plasma.drop(['Corona'], axis=1)
names_plasma = plasma_total_data_names['Corona'].copy()

features_csf = csf_total_data_names.copy()
features_csf = features_csf.drop(['Corona'], axis=1) 
names_csf = csf_total_data_names['Corona'].copy()

### create a merged set
features_plasma_labeled = features_plasma.copy()
features_csf_labeled = features_csf.copy()

features_plasma_labeled['phase_plasma'] = 1
features_csf_labeled['phase_plasma'] = 0

# Modified by Firebird 10/16/2025 to be able to use with python 3.12 
#features_merged = features_plasma_labeled.append(features_csf_labeled, ignore_index=True)
#names_merged = names_plasma.append(names_csf, ignore_index=True)

features_merged = pd.concat([features_plasma_labeled, features_csf_labeled], ignore_index=True)
names_merged = pd.concat([names_plasma, names_csf], ignore_index=True)


# set with no phase labeling names are identical to names merged
features_merged_naive = features_merged.drop(['phase_plasma'], axis=1)

# print(plasma_total_data_names.shape, csf_total_data_names.shape, features_test.shape) ## in case you need to see shapes


## there is a known error here, sometimes there is an Unnamed column just drop it code is available in a 
#lower cell (scaling cell), its a holdover from two merged set

# tf_data = features_merged_naive.copy()
# tf_data['names'] = names_merged.copy()
# tf_data.to_excel('data_for_tensorflow.xlsx')


In [3]:
#### revisions 

total_data_for_reductions = features_merged_naive.copy()
total_data_for_reductions['Corona'] = names_merged

total_data_reduced = total_data_for_reductions.drop_duplicates(subset=['Protein names'])
total_data_reduced.shape

names_reduced = total_data_reduced['Corona']
features_reduced = total_data_reduced.drop('Corona', axis=1)
reduced_protein_names = features_reduced['Protein names']

In [None]:
## Use this space to import test data ### 


#features_test = pd.read_excel("data/"+'proteins_selected_for_testing_complete_updated.xlsx', header=0, index_col = 0)
features_test = pd.read_excel("data/"+'netsurfp_2_proteins_selected_for_testing_processed_updated.xlsx', header=0, index_col = 0)

# uncomment below for large verification runs with labels

# features_test = pd.read_excel("data/"+'pnp_csf_features_names_biopy_gravy.xlsx', header=0, index_col = 0)
# y_test_test = features_test['Corona'].copy()
# features_test = features_test.drop(['Corona'], axis=1)


In [7]:
### if were not going to be using NETSURFP 

#features_for_prediction = pd.read_excel("data/"+'proteins_selected_for_testing_complete.xlsx', header=0, index_col = 0)
features_for_prediction = pd.read_excel("data/"+'netsurfp_2_proteins_selected_for_testing_processed_updated.xlsx', header=0, index_col = 0)

# print(list(features_for_prediction.columns))
# subset_features = features_merged_naive[list(features_for_prediction.columns)]

### Scale Data To Make it Work Well

In [None]:
scaler = MinMaxScaler()
total_data = features_merged_naive.copy()  ## for a regular netsurfp included case
# total_data = subset_features.copy() ### for a subset case --- use this one
total_data = total_data.fillna(0)
total_data_with_names = total_data.copy()
total_data = total_data.drop(['Protein names', 'mass' ], axis=1)
scaler = scaler.fit(total_data)
scaled_df = pd.DataFrame(scaler.transform(total_data), columns=total_data.columns)
print(scaled_df.shape)


scaled_df_phase = scaled_df.copy()
scaled_df_phase['phase_plasma'] = features_merged['phase_plasma'].copy()

plasma_data = scaled_df_phase[scaled_df_phase.phase_plasma==1]
plasma_data = plasma_data.drop(['phase_plasma'], axis=1)
scaled_df_plasma = plasma_data #pd.DataFrame(scaler.transform(plasma_data), columns=plasma_data.columns)

csf_data = scaled_df_phase[scaled_df_phase.phase_plasma==0]
csf_data = csf_data.drop(['phase_plasma'], axis=1)
scaled_df_csf = csf_data #pd.DataFrame(scaler.transform(csf_data), columns=csf_data.columns)

### UNCOMMENT this section for a REGULAR RUN
#features = features_merged_naive.copy()  # change the dataframe that you want to use here
features_test = features_test.fillna(0)
features_test_names = features_test.copy()
#print(features_test.axes)
#features_test = features_test.drop(['Protein names', 'mass'], axis=1) #,'entry'
scaled_test_df = pd.DataFrame(scaler.transform(features_test), columns=features_test.columns)

# features_reduced = features_reduced.drop(['Protein names'], axis=1)
# scaled_reduced_df = pd.DataFrame(scaler.transform(features_reduced), columns=features_reduced.columns)


# scaled_df = scaled_df.drop(['Unnamed: 0.1'], axis=1)
# scaled_df_phase = scaled_df_phase.drop(['Unnamed: 0.1'], axis=1)
# scaled_test_df = scaled_test_df.drop(['Unnamed: 0.1'], axis=1)

In [13]:
gt6_data = pd.read_excel("data/"+'pnp_plasma_features_names_biopy_gravy.xlsx', header=0, index_col=0)
features_gt6 = gt6_data.copy()
features_gt6 = features_gt6.drop(['Corona'], axis=1)
names_gt6 = gt6_data['Corona'].copy()

features_gt6_combined = pd.concat([features_merged_naive, features_gt6], ignore_index=True)
names_gt6_combined = pd.concat([names_merged, names_gt6], ignore_index=True)
features_gt6_combined = features_gt6_combined.drop(columns=['Protein names', 'mass'])
total_data_col_drop = total_data.copy()

# scaler = scaler.fit(features_gt6_combined)
scaled_df_gt6 = pd.DataFrame(scaler.transform(features_gt6_combined), columns=total_data_col_drop.columns)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Unnamed: 0


### Data to be Put into classifier

In [14]:
### Data put into classifier and classified
s_df = scaled_df#.drop(['Unnamed: 0.1', 'mass' ], axis=1)
scaled_test_df = scaled_test_df#.drop(['Unnamed: 0.1', 'mass'], axis=1)

df_local_features_train = s_df.copy()
# df_local_protein_names_train = features_merged['Protein names'].copy()#reduced_protein_names.copy()
df_local_names = names_merged.copy()

df_local_features_classify = scaled_test_df.copy() #.drop(['Unnamed: 0.1'], axis=1)
df_local_protein_names_classify = features_test_names['Protein names'] #.copy()

### to keep some things kosher later
df_local_features_train_copy = s_df.copy()
df_local_names_copy = names_merged.copy()
df_local_features_classify_copy = df_local_features_classify.copy()
df_local_protein_names_classify_copy = df_local_protein_names_classify.copy()

NameError: name 'scaled_test_df' is not defined

# Classifier

In [237]:
k_fold_splits = 100
predictions = pd.DataFrame()

X_new = SelectKBest(f_classif, k=38).fit_transform(df_local_features_train.copy(), df_local_names.copy()) #X_train_set.values #
df_local_features_train = pd.DataFrame(X_new.copy())#df_local_features_train.copy() #pd.DataFrame(X_new.copy()) #scaled_df.copy()

rndm_ste=2016
feature_imp = pd.DataFrame(columns=list(df_local_features_train.columns))
first_frame = True
correctness_frame = pd.DataFrame()
metrics_frame = pd.DataFrame()




set_size_adjust = (scaled_df.shape[0]/scaled_df_plasma.shape[0]) *.1 # used to retain the same number of samples in the test set, replace test_size with it if using
# #split up our data
i = 0

sss = StratifiedShuffleSplit(n_splits=k_fold_splits, test_size=0.1, random_state=rndm_ste)

for train_index, test_index in sss.split(df_local_features_train, df_local_names): # comment our if doing cross fluid
# for train_index, test_index in sss.split(scaled_df_plasma, names_plasma): # use for cross fluid tests, verify correct dataset placed here
    X_train = df_local_features_train.iloc[train_index] # remove subsetting for cross fluid tests
    X_test = df_local_features_train.iloc[test_index] # change dataframe for cross fluid tests
    y_train = df_local_names.iloc[train_index] # remove subsetting for cross fluid tests
    y_test = df_local_names.iloc[test_index] # change dataframe for cross fluid tests


     
    # Create and Train
    rfc=RandomForestClassifier(criterion='entropy', min_impurity_decrease = 0.02,  min_samples_split=2, max_depth = 10, max_features = 'sqrt',
     n_jobs=-1, ccp_alpha=0.01, random_state=rndm_ste, n_estimators=700) 
 
    
    sme = SMOTE(random_state=rndm_ste, sampling_strategy=0.7, n_jobs=-1, k_neighbors=12)
    X_train_oversampled, y_train_oversampled = sme.fit_resample(X_train, y_train)
    # X_train_oversampled, y_train_oversampled = X_train, y_train # can be used to pass smote if needed for an experiment
    rfc.fit(X_train_oversampled,y_train_oversampled)


    

    if first_frame:  # Initialize 
        first_frame = False  # Don't Come back Here
        
        datadict = {'true':y_test.to_numpy(), 'estimate':rfc.predict(X_test), 'probability':rfc.predict_proba(X_test)[:, 1]}
        
        correctness_frame = pd.DataFrame(data=datadict)
        correctness_frame['round'] = i

        metrics_dict = {'AUC':metrics.roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1]),
        'Accuracy':rfc.score(X_test, y_test), 'Recall':recall_score(y_test, rfc.predict(X_test)), 
        'Precision':precision_score(y_test, rfc.predict(X_test)), 'F1':f1_score(y_test, rfc.predict(X_test))}
        
        metrics_frame = pd.DataFrame.from_dict(data=metrics_dict,orient='index').transpose()
        metrics_frame['Round'] = i

        # can be used if you want to track prediction during shuffle split - saves in another cell
        predictions = pd.DataFrame()
        predictions['Protein Name'] = df_local_protein_names_classify
        predictions['In Corona Probability'] = rfc.predict_proba(df_local_features_classify)[:, 1]
        predictions['Round'] = i
        predictions['Test Accuracy'] = metrics_dict['Accuracy']
        predictions['Test Recall'] = metrics_dict['Recall']
        predictions['Test Precision'] = metrics_dict['Precision']
        predictions['Test AUC'] = metrics_dict['AUC']

        
    else:
        datadict = {'true':y_test.to_numpy(), 'estimate':rfc.predict(X_test), 'probability':rfc.predict_proba(X_test)[:, 1]}
        revolve_frame = pd.DataFrame(data=datadict)
        revolve_frame['round'] = i
        correctness_frame = correctness_frame.append(revolve_frame, ignore_index=True)

        metrics_dict = {'AUC':metrics.roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1]),
        'Accuracy':rfc.score(X_test, y_test), 'Recall':recall_score(y_test, rfc.predict(X_test)), 
        'Precision':precision_score(y_test, rfc.predict(X_test)), 'F1':f1_score(y_test, rfc.predict(X_test))}
        metrics_revolve_frame = pd.DataFrame.from_dict(data=metrics_dict, orient='index').transpose()
        metrics_revolve_frame['Round'] = i
        metrics_frame = metrics_frame.append(metrics_revolve_frame, ignore_index=True)

        # can be used if you want to track prediction during shuffle split - saves in another cell
        pred_rev = pd.DataFrame()
        pred_rev['Protein Name'] = df_local_protein_names_classify
        pred_rev['In Corona Probability'] = rfc.predict_proba(df_local_features_classify)[:, 1]
        pred_rev['Round'] = i
        pred_rev['Test Accuracy'] = metrics_dict['Accuracy']
        pred_rev['Test Recall'] = metrics_dict['Recall']
        pred_rev['Test Precision'] = metrics_dict['Precision']
        pred_rev['Test AUC'] = metrics_dict['AUC']

        predictions = predictions.append(pred_rev, ignore_index=True)


    
    feature_imp.loc[i] = pd.Series(rfc.feature_importances_,index=list(df_local_features_train.columns))
    
    i += 1

In [238]:
# displays results
metrics_frame.mean()

AUC           0.757708
Accuracy      0.775556
Recall        0.646667
Precision     0.694506
F1            0.646936
Round        49.500000
dtype: float64

## Collect Revolving Predictions (if applicable)

This is a feature that is not used in the manuscript 

In [None]:
unique_names = []
for i in predictions['Protein Name']:
    if i not in unique_names:
        unique_names.append(i)

protein_avg_predictions = pd.DataFrame()
counter = 0
for i in unique_names:
    avg_df = predictions[predictions['Protein Name'] == i]

    if counter == 0:
        protein_avg_predictions = pd.DataFrame([i, round(avg_df['In Corona Probability'].mean(), 3), round(confidence_interval(avg_df['In Corona Probability']), 3)], index=['Protein Name', 'Average In Corona Probability', '95 Percent Confidence Interval']).transpose()
        
    else:
        pap_df = pd.DataFrame([i, round(avg_df['In Corona Probability'].mean(), 3), round(confidence_interval(avg_df['In Corona Probability']), 3)], index=['Protein Name', 'Average In Corona Probability', '95 Percent Confidence Interval']).transpose()
        protein_avg_predictions = protein_avg_predictions.append(pap_df, ignore_index=True)
    
    counter += 1

protein_avg_predictions

In [None]:
pct_correct = []


for i in range(10):
    subset = correctness_frame[correctness_frame.probability>= i *.1]
    subset = subset[subset.probability <(i+1)*.1]

#     subset = correctness_frame[correctness_frame.probability>=i]
    subset['correct'] = subset['true'] == subset['estimate']
    pct_correct.append(subset.correct.sum() / subset.shape[0])
    
bar_names = ['[' + str(np.around((i-1)*.1, decimals=1)) + ', ' + str(np.around((i)*.1, decimals=1)) +')'  for i in range(1,11)]
print(pct_correct, bar_names)#, steps)
#subset
fig= plt.figure(figsize=(10,5))
sns.barplot(x=bar_names, y=pct_correct, ci=None)

plt.show()

In [12]:
overall_probability_accuracy = pd.DataFrame([pct_correct], columns=bar_names)
metrics_frame = metrics_frame.append(pd.DataFrame({"AUC":[metrics_frame.AUC.mean(), confidence_interval(metrics_frame.AUC)], "Accuracy":[metrics_frame.Accuracy.mean(), confidence_interval(metrics_frame.Accuracy)], "Precision":[metrics_frame.Precision.mean(), confidence_interval(metrics_frame.Precision)],'Round':['Average', '.95 CI'], 'Recall':[metrics_frame.Recall.mean(), confidence_interval(metrics_frame.Recall)], 'F1':[metrics_frame.F1.mean(), confidence_interval(metrics_frame.F1)]}), ignore_index=True)

# Make Predictions Using the Entire Saved Dataset

Ensure that you are using the right k values and data files here

In [250]:
X_train_total = df_local_features_train_copy.copy() #df_local_features_train #df_local_features_train_copy
y_train_total = df_local_names #df_local_names#df_local_names_copy
rndm_ste = 2016
k_best = SelectKBest(f_classif, k=38)
fit = k_best.fit(X_train_total, y_train_total)
X_new = fit.transform(X_train_total)
 #X_train_set.values #
X_train_total = pd.DataFrame(X_new.copy())

rfc=RandomForestClassifier(criterion='entropy', min_impurity_decrease = 0.02,  min_samples_split=2, max_depth = 10, max_features = 'sqrt',
     n_jobs=-1, ccp_alpha=0.01, random_state=rndm_ste, n_estimators=700)   
sme = SMOTE(random_state=2016, sampling_strategy=0.7, n_jobs=-1, k_neighbors=12)
X_train_oversampled, y_train_oversampled = sme.fit_resample(X_train_total, y_train_total)
rfc.fit(X_train_oversampled,y_train_oversampled)


total_train_test = pd.DataFrame()
total_train_test['Protein Name'] = df_local_protein_names_classify_copy
# pd.DataFrame(fit.transform(df_local_features_classify_copy))
# total_train_test['In Corona Probability'] = rfc.predict_proba(df_local_features_classify_copy)[:, 1]

print(len(k_best.get_support()), df_local_features_train_copy.shape)
total_train_test['In Corona Probability'] = rfc.predict_proba(pd.DataFrame(df_local_features_classify_copy.loc[:,k_best.get_support()]))[:, 1]

91 (174, 91)


In [126]:
# print weights for table s2
pd.Series(rfc.feature_importances_,index=list(df_local_features_train_copy.columns)).sort_values(ascending=False).to_excel('revisions_data_2/table_s2_weights.xlsx')

In [230]:
# for full dataset testing scoire
y_test_score = y_test_test 
X_score = df_local_features_classify_copy.loc[:,k_best.get_support()]

results_dict = {'AUC':metrics.roc_auc_score(y_test_score, rfc.predict_proba(X_score)[:, 1]),
        'Accuracy':rfc.score(X_score, y_test_score), 'Recall':recall_score(y_test_score, rfc.predict(X_score)), 
        'Precision':precision_score(y_test_score, rfc.predict(X_score)), 'F1':f1_score(y_test_score, rfc.predict(X_score))}

pprint(results_dict)
count_proxy  = total_train_test.copy()
count_proxy['In Corona'] = count_proxy['In Corona Probability'] >= 0.5 


{'AUC': 0.8479166666666667,
 'Accuracy': 0.8064516129032258,
 'F1': 0.7777777777777778,
 'Precision': 0.9545454545454546,
 'Recall': 0.65625}
                                         Protein Name  In Corona Probability  \
1            Transthyretin (ATTR) (Prealbumin) (TBPA)               0.841727   
3   Prostaglandin-H2 D-isomerase (EC 5.3.99.2) (Be...               0.764176   
8   Immunoglobulin heavy constant gamma 1 (Ig gamm...               0.764373   
11                   Hemopexin (Beta-1B-glycoprotein)               0.600140   
12                           Apolipoprotein E (Apo-E)               0.810779   
14  Apolipoprotein A-I (Apo-AI) (ApoA-I) (Apolipop...               0.788691   
15  Clusterin (Aging-associated gene 4 protein) (A...               0.882423   
17  Gelsolin (AGEL) (Actin-depolymerizing factor) ...               0.524507   
19  Complement C3 (C3 and PZP-like alpha-2-macrogl...               0.875599   
40                         Collagen alpha-2(XI) chain     

In [251]:
# display results
total_train_test 

Unnamed: 0,Protein Name,In Corona Probability
0,Transgelin (22 kDa actin-binding protein) (Pro...,0.549758
1,TAR DNA-binding protein 43 (TDP-43),0.535928
2,CD44 antigen (CDw44) (Epican) (Extracellular m...,0.588761
3,"Lysozyme C (EC 3.2.1.17) (1,4-beta-N-acetylmur...",0.352303
4,L-lactate dehydrogenase A chain (LDH-A) (EC 1....,0.247507
5,Ribonuclease pancreatic (EC 4.6.1.18) (HP-RNas...,0.30131
6,Glutathione S-transferase (EC 2.5.1.18) (PfGST),0.119809
7,Syntenin-1 (Melanoma differentiation-associate...,0.329141


## Writes all Prediction Data (Including Revolving Predictions Data)

In [27]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('2021_08_08_predictions_selected_proteins_gt6.xlsx', engine='xlsxwriter')

# Write each dataframe to a different worksheet.

predictions.to_excel(writer, sheet_name='Round Based Prediction')
metrics_frame.to_excel(writer, sheet_name='Classifier Round Metrics')
protein_avg_predictions.to_excel(writer, sheet_name='Protein Average Predictions')
overall_probability_accuracy.to_excel(writer, sheet_name='Overall Probability Accuracy')
total_train_test.to_excel(writer, sheet_name='Total Set Used in Prediction')


# Close the Pandas Excel writer and output the Excel file.
writer.save()

In [None]:
count_proxy  = total_train_test.copy()
count_proxy['In Corona'] = count_proxy['In Corona Probability'] >= 0.5 
print(count_proxy['In Corona'].sum())

# Feature Add In Importance Test (Can take a very long time to run)

In [None]:
k_readouts ={}
first_loop = True
first_feat=True
kselect_params={}
trials = 100


rndm_ste=2016

sss = StratifiedShuffleSplit(n_splits=trials, test_size=0.1, random_state=rndm_ste)

feature_imp = pd.DataFrame(columns=list(df_local_features_train_copy.columns))
for k_feat in range(45, 51, 1):   
    i = 0

    k_best = SelectKBest(f_classif, k=k_feat)
    fit = k_best.fit(df_local_features_train_copy.copy(), df_local_names.copy())
    X_new = fit.transform(df_local_features_train_copy.copy()) #X_train_set.values #
    df_local_features_train = pd.DataFrame(X_new.copy())
    
    init_scores_rfc = True
    first_roc_rfc = True
    # kselect_params[str(k_feat)]=k_best.get_support()
    for train_index, test_index in sss.split(df_local_features_train, df_local_names):
    
        X_train = df_local_features_train.iloc[train_index]
        X_test = df_local_features_train.iloc[test_index]
        y_train = df_local_names.iloc[train_index]
        y_test = df_local_names.iloc[test_index]    
        #### END COMMENT OUT 
        if init_scores_rfc:  # use this to record data for ROC curves -- Some may be moved to outside the loop
            y_score_array_rfc = np.zeros((y_test.shape[0], trials))
            y_true_array_rfc = np.zeros((y_test.shape[0], trials))
            tpr_array_rfc = np.zeros((y_test.shape[0], trials))
            fpr_array_rfc = np.zeros((y_test.shape[0], trials))
            score_rfc = np.zeros(trials)
            score_svm = np.zeros(trials)
            auc_data_rfc = np.zeros(trials)
            f1_rfc = np.zeros(trials)
            fbeta_rfc = np.zeros(trials)
            recall_rfc = np.zeros(trials)
            precision_rfc = np.zeros(trials)
            log_loss_rfc = np.zeros(trials)
            features_left = np.zeros(trials)
            
            init_scores_rfc = False # Don't Come Back Here
        
        # Create and Train
        rfc=RandomForestClassifier(criterion='entropy', min_impurity_decrease = 0.02,  min_samples_split=2, max_depth = 10, max_features = 'sqrt',
                                    n_jobs=-1, ccp_alpha=0.01, random_state=rndm_ste, n_estimators=700)
                                #min_samples_split=4, min_samples_leaf= 2, max_features= 'log2', max_depth = 10)    ## max_leaf_nodes=20, 
        
        
        sme = SMOTE(random_state=rndm_ste, sampling_strategy=.7, n_jobs=-1, k_neighbors=12)
        X_train_oversampled, y_train_oversampled = sme.fit_resample(X_train, y_train)
        
        rfc.fit(X_train_oversampled,y_train_oversampled)

        # Basic Predictions
        y_pred_test = rfc.predict(X_train_oversampled) 
        y_pred_train = rfc.predict(X_train)
        

        
        # Calculate Metrics
        auc_data_rfc[i] = metrics.roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])
        score_rfc[i] =  rfc.score(X_test, y_test)

        f1_rfc[i] = f1_score(y_test, rfc.predict(X_test))
        fbeta_rfc[i] = fbeta_score(y_test, rfc.predict(X_test), beta=0.5)
        recall_rfc[i] = recall_score(y_test, rfc.predict(X_test))
        precision_rfc[i] = precision_score(y_test, rfc.predict(X_test))
        log_loss_rfc[i] = log_loss(y_test, rfc.predict_proba(X_test)[:, 1])
        fpr_current_list, tpr_current_list, _ = metrics.roc_curve(y_test, rfc.predict_proba(X_test)[:, 1])


        if first_roc_rfc:  # Initialize 
            fpr_array_rfc = fpr_current_list
            tpr_array_rfc = tpr_current_list
            first_roc_rfc = False  # Don't Come back Here
            
        else:
            fpr_array_rfc = np.concatenate((fpr_array_rfc, fpr_current_list))
            tpr_array_rfc = np.concatenate((tpr_array_rfc, tpr_current_list))
#         i+=1
        column_list = list(scaled_df.columns)
        
        if first_feat:
            feat_revolve = pd.DataFrame(rfc.feature_importances_,index=[column_list[i] for i in np.nonzero(k_best.get_support())[0]]).transpose()
            feat_revolve['Features'] = k_feat
            feature_imp = feat_revolve.copy()
            first_feat = False
            
            
        else:
            feat_revolve = pd.DataFrame(rfc.feature_importances_,index=[column_list[i] for i in np.nonzero(k_best.get_support())[0]]).transpose()
            feat_revolve['Features'] = k_feat
            feature_imp = feature_imp.append(feat_revolve.copy(), ignore_index=True)

        kselect_params[str(k_feat)]=k_best.get_support()
        
        i+=1

    k_readouts[str(k_feat)] = {'Accuracy rfc': score_rfc.mean(),
                               'Accuracy rfc ci': confidence_interval(score_rfc),
                               'ROC Score rfc': auc_data_rfc.mean(), 
                               'ROC Score rfc ci': confidence_interval(auc_data_rfc),
                               'Precision rfc': precision_rfc.mean(),
                               'Precision rfc ci': confidence_interval(precision_rfc),
                               'Recall rfc': recall_rfc.mean(),
                               'Recall rfc ci': confidence_interval(recall_rfc),
                               'Accuracy svm': score_svm.mean(),
                               "Accuracy svm ci" : confidence_interval(score_svm)}
    print(f'K Criteria: {k_feat}\nAccuracy: {score_rfc.mean():.03f} +/- {confidence_interval(score_rfc):.03f}\nROC Score: {auc_data_rfc.mean():.03f} +/- {confidence_interval(auc_data_rfc):.03f} \n Precision: {precision_rfc.mean():.03f} \nRecall: {recall_rfc.mean():.03f}') #Features Left: {features_left.mean():.1f} +/- {confidence_interval(features_left):.02f}')

In [198]:
first_key = True
for key in k_readouts.keys():
    
    if first_key:
        k_feats_data = pd.DataFrame.from_dict(data=k_readouts[key],orient='index').transpose()
        k_feats_data['K Features'] = int(key)
        first_key=False
        
    else: 
        k_feats_data_rotating = pd.DataFrame.from_dict(data=k_readouts[key],orient='index').transpose()
        k_feats_data_rotating['K Features'] = int(key)
        k_feats_data = k_feats_data.append(k_feats_data_rotating, ignore_index=True)
        
k_feats_data = k_feats_data.set_index('K Features')
k_feats_data.to_excel('data_for_feature_add_in_figure_example.xlsx')        

In [None]:

## prints features in order of importance (use all features for best results)
first_pass = True
for key, value in kselect_params.items():
    if first_pass:
        print(key, list(df_local_features_train_copy.loc[:,value].columns)[0])
        prev_list = list(df_local_features_train_copy.loc[:,value].columns)
        first_pass = False
    else: 
        for i in list(df_local_features_train_copy.loc[:,value].columns):
            if i not in prev_list:
                print(key, i)
                prev_list = list(df_local_features_train_copy.loc[:,value].columns)


