# In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import sys
import sklearn.metrics
import numpy.random
import pymrmr
import scipy.stats as ss

class ML_DataLoader(object):
    """Load a feature sheet and a label sheet and draw class-balanced splits.

    Both Excel sheets are read with ``pandas.read_excel``. The first column
    of each sheet is skipped (presumably a case identifier -- confirm against
    the data files); every remaining column of the feature sheet is used as a
    feature and the second column of the label sheet as the label. Rows
    labelled 1 are treated as the positive class, rows labelled 0 as the
    negative class.
    """

    # Seed NumPy's global RNG once, when the class body is executed, so the
    # random splits drawn by createFolds are repeatable from run to run.
    np.random.seed(1)

    def __init__(self, data_path, label_path):
        """Read both sheets and cache them as DataFrames and NumPy arrays.

        Args:
            data_path: path of the Excel file holding the features.
            label_path: path of the Excel file holding the labels.
        """
        self.data_path = data_path
        self.label_path = label_path
        self.full_data = pd.read_excel(data_path)
        self.full_label = pd.read_excel(label_path)

        self.data = self.full_data.iloc[:, 1::].to_numpy()
        self.labels = self.full_label.iloc[:, 1].to_numpy()

        # Row positions per class and per split; filled in by createFolds.
        self.positive_indices = []
        self.negative_indices = []
        self.positive_test_indices = []
        self.positive_train_indices = []
        self.negative_test_indices = []
        self.negative_train_indices = []

    def createFolds(self, n_folds):
        """Draw one random, class-balanced train/test split.

        Despite the name, a single hold-out split is produced per call: the
        training set takes ``ceil((n_folds - 1) / n_folds * minority_size)``
        rows from each class, and the test set takes the left-over minority
        rows plus an equally sized random sample of the left-over majority
        rows. The chosen row positions are also stored on the instance.

        Args:
            n_folds: number of notional folds; ``(n_folds - 1) / n_folds`` of
                the minority class goes to training.

        Returns:
            ``(train, test, train_labels, test_labels)`` as DataFrames. In
            each of them the positive rows come first, then the negative
            rows.
        """
        self.positive_indices = np.where(self.labels == 1)[0]
        self.negative_indices = np.where(self.labels == 0)[0]
        n_positive = self.positive_indices.shape[0]
        n_negative = self.negative_indices.shape[0]
        pos_is_maj = n_positive > n_negative

        # Both classes are sized from the minority class so the split stays
        # balanced.
        one_fold_size = min(n_positive, n_negative) / n_folds
        # np.ceil instead of np.math.ceil: the np.math alias is deprecated
        # since NumPy 1.25 and absent from NumPy 2.
        train_size = int(np.ceil((n_folds - 1) * one_fold_size))

        # The order of the random draws (positive train, negative train, then
        # the majority test subsample) determines the random stream; keep it.
        self.positive_train_indices = np.random.choice(
            self.positive_indices, train_size, replace=False)
        self.negative_train_indices = np.random.choice(
            self.negative_indices, train_size, replace=False)

        # Whatever was not drawn for training is available for testing.
        self.positive_test_indices = np.setdiff1d(
            self.positive_indices, self.positive_train_indices)
        self.negative_test_indices = np.setdiff1d(
            self.negative_indices, self.negative_train_indices)

        # Subsample the majority class so the test set is balanced as well.
        if pos_is_maj:
            self.positive_test_indices = np.random.choice(
                self.positive_test_indices,
                self.negative_test_indices.size,
                replace=False)
        else:
            self.negative_test_indices = np.random.choice(
                self.negative_test_indices,
                self.positive_test_indices.size,
                replace=False)

        featnames = self.full_data.columns[1::]
        labelnames = [self.full_label.columns[1]]

        train_rows = np.concatenate(
            (self.positive_train_indices, self.negative_train_indices), 0)
        test_rows = np.concatenate(
            (self.positive_test_indices, self.negative_test_indices), 0)

        train = pd.DataFrame(self.data[train_rows, :], columns=featnames)
        test = pd.DataFrame(self.data[test_rows, :], columns=featnames)
        train_labels = pd.DataFrame(self.labels[train_rows], columns=labelnames)
        test_labels = pd.DataFrame(self.labels[test_rows], columns=labelnames)
        return train, test, train_labels, test_labels
    
def RemoveCorrFeatures(data, thresh=.9):
    """Drop columns that correlate strongly with a column to their left.

    A column is removed when its correlation (``DataFrame.corr``) with any
    earlier column is at least ``thresh``. The signed value is compared, so
    strongly *negative* correlations never trigger a removal, and an earlier
    column still counts as a reference even if it was itself removed.

    Args:
        data: DataFrame of features, one column per feature.
        thresh: correlation at or above which the later column is dropped.

    Returns:
        ``data`` restricted to the surviving columns, in their original order.
    """
    corr_values = data.corr().to_numpy()
    # Strict upper triangle: entry (i, j) is kept only for i < j, i.e. column
    # j compared against every column that precedes it.
    exceeds = np.triu(corr_values >= thresh, k=1)
    keep_mask = ~exceeds.any(axis=0)
    return data[data.columns[keep_mask]]


def _discretize_for_mrmr(Data, Labels):
    """Join labels and features and binarise every column at its own mean."""
    mrmrData = Labels.join(Data)
    for column in mrmrData.columns:
        thresh = mrmrData[column].mean()
        mrmrData[column] = (mrmrData[column] >= thresh).astype(int)
    return mrmrData


def _top_features_by_pvalue(Data, Labels, test_func, n, p):
    """Return up to n feature names with p-value <= p, most significant first."""
    class_column = Labels.iloc[:, 0]
    positiveClass = Data.loc[class_column == 1, :]
    negativeClass = Data.loc[class_column == 0, :]

    p_values = []
    featurenames = []
    for i in range(positiveClass.shape[1]):
        # Both scipy tests return (statistic, p-value); only the p-value is used.
        calc_p = test_func(positiveClass.iloc[:, i], negativeClass.iloc[:, i])[1]
        if calc_p <= p:
            p_values.append(calc_p)
            featurenames.append(positiveClass.columns[i])

    ranked = pd.DataFrame({'p_values': p_values, 'featnames': featurenames})
    ranked = ranked.sort_values(by='p_values')
    return ranked.iloc[0:n, 1].to_numpy().astype(str)


def PerformFeatureSelection(Data, Labels, n=5, Methods=("MID","MIQ","ranksum","ttest"), p=.05):
    """Select up to ``n`` features with each requested method.

    Args:
        Data: DataFrame of features, one row per sample.
        Labels: DataFrame sharing ``Data``'s index whose first column holds
            the class label (1 = positive, 0 = negative).
        n: maximum number of features returned per method.
        Methods: which selectors to run. Membership is tested with ``in``, so
            a bare string such as ``"MID"`` also works (by substring match);
            pass a tuple to be explicit.

            * ``"MID"`` / ``"MIQ"``: ``pymrmr.mRMR`` on the labels joined with
              the features, every column binarised at its mean.
            * ``"ranksum"`` / ``"ttest"``: ``RemoveCorrFeatures`` is applied
              first, then each remaining feature is compared between the two
              classes with ``scipy.stats.ranksums`` / ``ttest_ind`` and the
              features with p-value <= ``p`` are ranked by p-value.
        p: significance cut-off for the ranksum and t-test selectors.

    Returns:
        Dict mapping each requested method name to a NumPy array of feature
        names (as ``str``). The ranksum and t-test entries may hold fewer
        than ``n`` names, or none, when few features pass the cut-off.
    """
    DesiredFeatureNames = dict()

    # The discretised table is only needed by the two mRMR schemes, so it is
    # built only when one of them is requested.
    if "MID" in Methods or "MIQ" in Methods:
        mrmrData = _discretize_for_mrmr(Data, Labels)
        for scheme in ("MID", "MIQ"):
            if scheme in Methods:
                DesiredFeatureNames[scheme] = np.asarray(
                    pymrmr.mRMR(mrmrData, scheme, n)).astype(str)

    wants_ranksum = "ranksum" in Methods
    wants_ttest = "ttest" in Methods
    if wants_ranksum or wants_ttest:
        # Both univariate tests work on the same de-correlated feature table;
        # computing it once is enough.
        decorrelated = RemoveCorrFeatures(Data)
        if wants_ranksum:
            DesiredFeatureNames["ranksum"] = _top_features_by_pvalue(
                decorrelated, Labels, ss.ranksums, n, p)
        if wants_ttest:
            DesiredFeatureNames["ttest"] = _top_features_by_pvalue(
                decorrelated, Labels, ss.ttest_ind, n, p)

    return DesiredFeatureNames

# ---- Input files ----------------------------------------------------------
path_label = 'E:/Data2/BreastMass_refine/outcome.xlsx'
path_feat = 'E:/Data2/BreastMass_refine/radiomic_features/radiomic_features_flag_normalize_false.xlsx'

# ---- Experiment settings --------------------------------------------------
iterations = 2                # random splits drawn per selection method
folds = 5                     # passed to createFolds for every split
desiredClassifiers = ['lda', 'qda']
selectionTypes = ("MID", "MIQ", "ranksum", "ttest")
num_feats = 93                # largest feature-set size that is evaluated

data_loader = ML_DataLoader(path_feat, path_label)

# ---- Result tables (start empty) ------------------------------------------
LDAStats, QDAStats, RFStats = (
    pd.DataFrame(columns=['sensitivity', 'sens_std', 'specificity', 'spec_std', 'auc', 'auc_std'])
    for _ in range(3)
)
MIDFeats = pd.DataFrame(columns=['MID_Feats', 'MID_counts'])
MIQFeats = pd.DataFrame(columns=["MIQ_Feats", 'MIQ_counts'])
RanksumFeats = pd.DataFrame(columns=["RS_Feats", 'RS_counts'])
TtestFeats = pd.DataFrame(columns=["Ttest_Feats", 'Ttest_counts'])

# ---- Per-method selection counters ----------------------------------------
# A throw-away split is drawn only to obtain the feature names. It does
# consume random numbers, so it influences the splits drawn afterwards.
train_set, test_set, train_labels, test_labels = data_loader.createFolds(5)
totalFeatNames = train_set.columns
counts = np.zeros(totalFeatNames.shape)
td = {'feats': totalFeatNames, 'counts': counts}
# One counter table per selection method, indexed by feature name.
MID_featcounts, MIQ_featcounts, RS_featcounts, Ttest_featcounts = (
    pd.DataFrame(td, index=td['feats']) for _ in range(4)
)

# Column layout shared by the per-classifier summary tables.
_SUMMARY_COLUMNS = ['sensitivity', 'sens_std', 'specificity', 'spec_std', 'auc', 'auc_std']

# Classifier constructors in evaluation order. The order is kept fixed
# because an unseeded random forest draws from NumPy's global RNG and would
# otherwise shift the splits drawn afterwards.
_CLASSIFIER_FACTORIES = (
    ('lda', LinearDiscriminantAnalysis),
    ('qda', QuadraticDiscriminantAnalysis),
    ('rf', lambda: RandomForestClassifier(n_estimators=100)),
)


def _score_classifier(model, train_x, train_y, test_x, test_y):
    """Fit ``model`` and return ``(sensitivity, specificity, auc)`` on the test data.

    Label 1 is the positive class. A failure while fitting or scoring is
    printed and reported as three NaNs so one bad configuration does not
    abort the whole experiment; ``KeyboardInterrupt`` is not intercepted.
    """
    try:
        model.fit(train_x, train_y)
        pred_prob = model.predict_proba(test_x)
        pred = model.predict(test_x)
        # Rows are true classes and columns are predictions, both ordered
        # [0, 1]: row 1 describes the positives, row 0 the negatives.
        conf = confusion_matrix(test_y, pred, labels=[0, 1])
        sensitivity = conf[1, 1] / (conf[1, 0] + conf[1, 1])
        specificity = conf[0, 0] / (conf[0, 0] + conf[0, 1])
        auc = sklearn.metrics.roc_auc_score(test_y, pred_prob[:, 1])
    except Exception as err:
        print('classifier failed:', type(model).__name__, err)
        return float('nan'), float('nan'), float('nan')
    return sensitivity, specificity, auc


def _row_mean_std(values):
    """Return per-row mean and population std of ``values``, ignoring NaNs.

    A row made up only of NaNs yields NaN for both statistics.
    """
    means = []
    stds = []
    for row in values:
        observed = row[~np.isnan(row)]
        if observed.size:
            means.append(observed.mean())
            stds.append(observed.std())
        else:
            means.append(float('nan'))
            stds.append(float('nan'))
    return means, stds


def _summarise_metrics(prefix, selection_type, sensitivity, specificity, auc):
    """Build one summary row per feature-set size for a single classifier.

    Rows are labelled ``"<prefix> <selection_type> <size>"``.
    """
    row_labels = [prefix + ' ' + selection_type + ' ' + str(i + 1)
                  for i in range(sensitivity.shape[0])]
    summary = {}
    summary['sensitivity'], summary['sens_std'] = _row_mean_std(sensitivity)
    summary['specificity'], summary['spec_std'] = _row_mean_std(specificity)
    summary['auc'], summary['auc_std'] = _row_mean_std(auc)
    return pd.DataFrame(summary, index=row_labels, columns=_SUMMARY_COLUMNS)


def _append_rows(table, rows):
    """Return ``table`` with ``rows`` appended (DataFrame.append replacement)."""
    if table.empty:
        return rows
    return pd.concat([table, rows])


# Counter table updated for each selection method.
_featcounts_by_method = {
    "MID": MID_featcounts,
    "MIQ": MIQ_featcounts,
    "ranksum": RS_featcounts,
    "ttest": Ttest_featcounts,
}

for selection_type in selectionTypes:
    # One row per feature-set size, one column per iteration. NaN marks a
    # cell that was never evaluated, so it cannot be mistaken for a score.
    lda_sensitivity, lda_specificity, lda_auc = (
        np.full([num_feats, iterations], np.nan) for _ in range(3))
    qda_sensitivity, qda_specificity, qda_auc = (
        np.full([num_feats, iterations], np.nan) for _ in range(3))
    rf_sensitivity, rf_specificity, rf_auc = (
        np.full([num_feats, iterations], np.nan) for _ in range(3))
    _metric_arrays = {
        'lda': (lda_sensitivity, lda_specificity, lda_auc),
        'qda': (qda_sensitivity, qda_specificity, qda_auc),
        'rf': (rf_sensitivity, rf_specificity, rf_auc),
    }

    for iteration in range(iterations):
        print('iteration',iteration+1)
        [train_set,test_set,train_labels,test_labels]=data_loader.createFolds(folds)
        # One-element tuple: a bare string would be matched by substring.
        feat_dict = PerformFeatureSelection(
            train_set, train_labels, n=num_feats, Methods=(selection_type,))
        selected_feats = feat_dict[selection_type]

        if selection_type in _featcounts_by_method:
            _featcounts_by_method[selection_type].loc[selected_feats, 'counts'] += 1

        train_labels = np.ravel(train_labels.to_numpy())
        test_labels = np.ravel(test_labels.to_numpy())

        # The selector may return fewer than num_feats names. Larger sizes
        # are left as NaN rather than re-scoring the same feature set under
        # a misleading label.
        for current_feats in range(min(num_feats, len(selected_feats))):
            feat_subset = selected_feats[0:current_feats + 1]
            trimmed_train_set = train_set.loc[:, feat_subset].to_numpy()
            trimmed_test_set = test_set.loc[:, feat_subset].to_numpy()

            for clf_name, make_classifier in _CLASSIFIER_FACTORIES:
                if clf_name not in desiredClassifiers:
                    continue
                sens_arr, spec_arr, auc_arr = _metric_arrays[clf_name]
                (sens_arr[current_feats, iteration],
                 spec_arr[current_feats, iteration],
                 auc_arr[current_feats, iteration]) = _score_classifier(
                    make_classifier(),
                    trimmed_train_set, train_labels,
                    trimmed_test_set, test_labels)

    # Summarise inside the selection loop so every selection method adds its
    # rows to the result tables.
    if 'lda' in desiredClassifiers:
        lda_scores = _summarise_metrics(
            'lda', selection_type, lda_sensitivity, lda_specificity, lda_auc)
        LDAStats = _append_rows(LDAStats, lda_scores)

    if 'qda' in desiredClassifiers:
        qda_scores = _summarise_metrics(
            'qda', selection_type, qda_sensitivity, qda_specificity, qda_auc)
        QDAStats = _append_rows(QDAStats, qda_scores)

    if 'rf' in desiredClassifiers:
        rf_scores = _summarise_metrics(
            'rf', selection_type, rf_sensitivity, rf_specificity, rf_auc)
        RFStats = _append_rows(RFStats, rf_scores)

# Most frequently selected features first.
MID_featcounts=MID_featcounts.sort_values(by='counts',ascending=False)
MIQ_featcounts=MIQ_featcounts.sort_values(by='counts',ascending=False)
RS_featcounts=RS_featcounts.sort_values(by='counts',ascending=False)
Ttest_featcounts=Ttest_featcounts.sort_values(by='counts',ascending=False)