In [1]:
import sys
# Make the project's sibling source directories importable. The paths are
# relative, so they resolve against the kernel's current working directory.
for _project_dir in (
    '../sgmm',
    '../metrics',
    '../Misc',
    '../visual',
    '../otherModels',
    '../LogOdds',
):
    sys.path.append(_project_dir)

In [2]:
import numpy as np
import pandas as pd
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts anything, does nothing."""
    return None


# Swap the no-op in for warnings.warn so that code which looks up
# `warnings.warn` at call time no longer emits anything for this session.
warnings.warn = warn
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from supervisedGmm import SupervisedGMM
from metricsFunctions import calc_metrics, CalculateSoftLogReg, optimalTau,metrics_cluster,sgmmResults
from mlModels import logisticRegressionCv2, neural_nets, randomforests,\
kmeansLogRegr, xboost, gradboost
from sklearn.naive_bayes import BernoulliNB
from supervisedBmm import SupervisedBMM
from utility import entropy,asymm_entropy,purity
from ftest_logodds import ftest_uncorr
from ftest_logodds import restest
#from clustmap import plotclustmap
from clustmap_newborn import plotclustmap
from sklearn.linear_model import LogisticRegression

In [3]:
# Load IPython's autoreload extension and switch it to mode 2, so edits made
# to the project modules imported above are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Read the SPARCS subset from disk.
sparcs = pd.read_csv(
    "~/data/CDPHP/xiao/SPARCS_Subsets/Obsolete/sparcs25%Preg_DeHos_Outflow_Region.csv"
)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
d_preg_tr, d_preg_te = train_test_split(
    sparcs,
    test_size=0.2,
    random_state=1512,
)

# Rows and columns of the training split.
print(*d_preg_tr.shape)

100679 244


In [5]:
# Total number of rows in the full data set (before the train/test split).
print(len(sparcs))

125849


In [6]:
# Column labels applied to every metrics row turned into a DataFrame below.
# The order matters: later cells address these columns by position.
columns = [
    'cluster',
    'size',
    'high_cost%',
    'low_cost%',
    'TP',
    'TN',
    'FP',
    'FN',
    'FPR',
    'specificity',
    'sensitivity',
    'precision',
    'accuracy',
    'balanced accuracy',
    'f1',
    'auc',
]

In [7]:
# Feature names: every column label except the last one, which the data
# preparation cell uses as the target.
features = sparcs.columns[:-1].tolist()

In [8]:
# Prepare the design matrices and targets as NumPy arrays.
# Features are all columns but the last ...
Xtrain = d_preg_tr.iloc[:, :-1].values
Xtest = d_preg_te.iloc[:, :-1].values

# ... and the last column is the label, cast to int.
ytrain = d_preg_tr.iloc[:, -1].values.astype(int)
ytest = d_preg_te.iloc[:, -1].values.astype(int)

# Nonclustering Classification

In [9]:
# Bernoulli Naive Bayes baseline with fixed class priors.
bnb = BernoulliNB(alpha=1, class_prior=[0.75, 0.25])
bnb.fit(Xtrain, ytrain)

# Keep column 1 of predict_proba, i.e. the second entry of bnb.classes_.
probTrainNB = bnb.predict_proba(Xtrain)[:, 1]
probTestNB = bnb.predict_proba(Xtest)[:, 1]

# The decision threshold is picked on the training split only and then
# reused unchanged for the test split.
tau = optimalTau(probTrainNB, ytrain)

# Copies are handed over so the stored probability arrays stay untouched.
metTest, _ = calc_metrics(custom_prob=probTestNB.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrainNB.copy(), tau=tau, y=ytrain)

# One-row metric tables for the comparison further down.
metTestNB = pd.DataFrame([metTest], columns=columns)
metTrainNB = pd.DataFrame([metTrain], columns=columns)

In [10]:
# L1 logistic regression; Cs is the grid handed to logisticRegressionCv2.
Cs = [1, 10, 100, 1000]
pL1, probTestL1, probTrainL1 = logisticRegressionCv2(
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
    Cs=Cs,
)

# Threshold chosen on training probabilities, then applied to both splits.
tau = optimalTau(probTrainL1, ytrain)

metTest, _ = calc_metrics(custom_prob=probTestL1.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrainL1.copy(), tau=tau, y=ytrain)

# One-row metric tables for the comparison further down.
metTestL1 = pd.DataFrame([metTest], columns=columns)
metTrainL1 = pd.DataFrame([metTrain], columns=columns)

In [11]:
# Neural network; h_l_s is forwarded to the project's neural_nets helper
# (presumably the hidden layer sizes -- confirm in mlModels).
pNN, probTestNN, probTrainNN = neural_nets(
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
    h_l_s=(4, 4, 2),
)

# Threshold chosen on training probabilities, then applied to both splits.
tau = optimalTau(probTrainNN, ytrain)

metTest, _ = calc_metrics(custom_prob=probTestNN.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrainNN.copy(), tau=tau, y=ytrain)

# One-row metric tables for the comparison further down.
metTestNN = pd.DataFrame([metTest], columns=columns)
metTrainNN = pd.DataFrame([metTrain], columns=columns)

In [12]:
# Random forest.
# Note: params / probTest / probTrain are generic names that the following
# model cells overwrite; only metTestRF / metTrainRF persist for this model.
params, probTest, probTrain = randomforests(
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
)

# Threshold chosen on training probabilities, then applied to both splits.
tau = optimalTau(probTrain, ytrain)
metTest, _ = calc_metrics(custom_prob=probTest.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrain.copy(), tau=tau, y=ytrain)

# One-row metric tables for the comparison further down.
metTestRF = pd.DataFrame([metTest], columns=columns)
metTrainRF = pd.DataFrame([metTrain], columns=columns)


In [13]:
# Boosting model reported as 'AdaBoost' in the comparison table.
# NOTE(review): the helper is called `xboost`; confirm in mlModels which
# boosting algorithm it actually fits.
params, probTest, probTrain = xboost(
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
)

# Threshold chosen on training probabilities, then applied to both splits.
tau = optimalTau(probTrain, ytrain)
metTest, _ = calc_metrics(custom_prob=probTest.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrain.copy(), tau=tau, y=ytrain)

# One-row metric tables for the comparison further down.
metTestXB = pd.DataFrame([metTest], columns=columns)
metTrainXB = pd.DataFrame([metTrain], columns=columns)

In [None]:
# Gradient boosting.
# metTestGB / metTrainGB defined here are required by the comparison cell,
# so this cell has to be executed before it.
params, probTest, probTrain = gradboost(
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
)

# Threshold chosen on training probabilities, then applied to both splits.
tau = optimalTau(probTrain, ytrain)
metTest, _ = calc_metrics(custom_prob=probTest.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrain.copy(), tau=tau, y=ytrain)

# One-row metric tables for the comparison further down.
metTestGB = pd.DataFrame([metTest], columns=columns)
metTrainGB = pd.DataFrame([metTrain], columns=columns)

# Sequential clustering + classification

In [15]:
# K-means + logistic regression pipeline (kmeansLogRegr).
# Seed NumPy's global RNG first so the run is repeatable.
np.random.seed(0)

n_clusters = 8
Cs = [1, 10, 100, 1000]

kmeansParams = kmeansLogRegr(
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
    Cs=Cs,
    n_clusters=n_clusters,
)

# Fitted models and the cluster label assigned to each train / test row.
modelsKM = kmeansParams['models']
labTrKM = kmeansParams['labelsTrain']
labTestKM = kmeansParams['labelsTest']

# Per-cluster metrics for the K-means pipeline.
metTrainKMc, metTestKMc = metrics_cluster(
    models=modelsKM,
    ytrain=ytrain,
    ytest=ytest,
    testlabels=labTestKM,
    trainlabels=labTrKM,
    Xtrain=Xtrain,
    Xtest=Xtest,
)

In [16]:
# KMS LG overall accuracy
#
# kmeansParams['probTrain'] / kmeansParams['probTest'] are indexed by cluster
# id and hold that cluster's predicted probabilities. They are stitched into
# one flat vector, and the true labels are regrouped in the same cluster
# order so that position k of the probabilities and of the labels line up
# (this relies on each per-cluster probability array following the row order
# of `ytrain[labTrKM == i]` / `ytest[labTestKM == i]`).


def _concat_clusters(parts):
    """Flatten each per-cluster array and join them into one 1-D array.

    A leading empty float array reproduces the dtype promotion of the
    previous ``np.append``-onto-``np.array([])`` loop and keeps the call
    valid when `parts` is empty, while copying the data only once.
    """
    return np.concatenate([np.empty(0)] + [np.ravel(part) for part in parts])


# Take the cluster count from the probability containers themselves, so the
# label grouping can never disagree with them if `n_clusters` is reassigned
# by another cell.
n_train_parts = len(kmeansParams['probTrain'])
n_test_parts = len(kmeansParams['probTest'])

probTr = _concat_clusters(kmeansParams['probTrain'][i] for i in range(n_train_parts))
yTr = _concat_clusters(ytrain[labTrKM == i] for i in range(n_train_parts))

probTe = _concat_clusters(kmeansParams['probTest'][i] for i in range(n_test_parts))
yTe = _concat_clusters(ytest[labTestKM == i] for i in range(n_test_parts))

# Fail loudly if probabilities and labels do not pair up one-to-one.
if probTr.shape != yTr.shape or probTe.shape != yTe.shape:
    raise ValueError(
        "K-means probabilities and labels are misaligned: "
        "train %s vs %s, test %s vs %s"
        % (probTr.shape, yTr.shape, probTe.shape, yTe.shape)
    )

# Threshold chosen on training probabilities, then applied to both splits.
tau = optimalTau(probTr, yTr)

metTrain, __ = calc_metrics(y=yTr, tau=tau, custom_prob=probTr)
metTest, __ = calc_metrics(y=yTe, tau=tau, custom_prob=probTe)

# One-row metric tables for the comparison further down.
metTrainKMS = pd.DataFrame([metTrain], columns=columns)
metTestKMS = pd.DataFrame([metTest], columns=columns)

In [19]:
# Supervised GMM with logistic regression.
# Seed NumPy's global RNG first so the run is repeatable.
np.random.seed(3957)

max_iter = 30
max_iter2 = 30
n_clusters = 8

model = SupervisedGMM(
    max_iter=max_iter,
    max_iter2=max_iter2,
    n_clusters=n_clusters,
    verbose=0,
).fit(Xtrain=Xtrain, ytrain=ytrain)

# Pull memberships, per-cluster regressors and training labels off the model.
mTrainSGMM = model.mTrain
logisRegreSGMM = model.LogRegr
fitP = model.fitParams
labTrainSGMM = fitP['labTrain']

# Test rows get the cluster with the largest membership value.
mTestSGMM = model.predict_GMMS(Xtest)
labTestSGMM = np.argmax(mTestSGMM, axis=1)

# Overall metrics: threshold chosen on training probabilities, then applied
# to both splits.
probTest, probTrain = model.predict_prob_int(Xtest=Xtest, Xtrain=Xtrain)
tau = optimalTau(probTrain, ytrain)
metTest, _ = calc_metrics(custom_prob=probTest.copy(), tau=tau, y=ytest)
metTrain, _ = calc_metrics(custom_prob=probTrain.copy(), tau=tau, y=ytrain)
metTestSGMM = pd.DataFrame([metTest], columns=columns)
metTrainSGMM = pd.DataFrame([metTrain], columns=columns)

# Per-cluster metrics.
metTrainSGc, metTestSGc = metrics_cluster(
    models=logisRegreSGMM,
    ytrain=ytrain,
    ytest=ytest,
    testlabels=labTestSGMM,
    trainlabels=labTrainSGMM,
    Xtrain=Xtrain,
    Xtest=Xtest,
)

GMM iteration: 0, error: 0.20276032428298818
GMM iteration: 1, error: 0.05967110690273707
GMM iteration: 2, error: 0.012208029408162877
GMM iteration: 3, error: 0.003032847519465272
GMM iteration: 4, error: 0.0006929119113580264


In [20]:
# Supervised BMM with logistic regression.
# Seed NumPy's global RNG first so the run is repeatable.
np.random.seed(7332)

max_iter = 30
max_iter2 = 30
n_clusters = 8

# NOTE(review): Xtest is handed to fitB alongside the training data; confirm
# in supervisedBmm that it does not influence the fitted parameters.
modelB = SupervisedBMM(
    max_iter=max_iter,
    n_clusters=n_clusters,
    max_iter2=max_iter2,
    verbose=0,
).fitB(Xtrain=Xtrain, Xtest=Xtest, ytrain=ytrain)

# Pull memberships, per-cluster regressors and training labels off the model.
mTrainSBMM = modelB.mTrain
logisRegreB = modelB.LogRegr
fitPB = modelB.fitParams
labTrainSBMM = fitPB['labTrain']

# Test rows get the cluster with the largest membership value.
mTestSBMM = modelB.predict_BMMS(Xtest)
labTestSBMM = np.argmax(mTestSBMM, axis=1)

# Overall metrics: threshold chosen on training probabilities, then applied
# to both splits.
probTestB, probTrainB = modelB.predict_prob_int_B(Xtest=Xtest, Xtrain=Xtrain)
tauB = optimalTau(probTrainB, ytrain)
metTestB, _ = calc_metrics(custom_prob=probTestB.copy(), tau=tauB, y=ytest)
metTrainB, _ = calc_metrics(custom_prob=probTrainB.copy(), tau=tauB, y=ytrain)
metTestSBMM = pd.DataFrame([metTestB], columns=columns)
metTrainSBMM = pd.DataFrame([metTrainB], columns=columns)

# Per-cluster metrics.
metTrainSBc, metTestSBc = metrics_cluster(
    models=logisRegreB,
    ytrain=ytrain,
    ytest=ytest,
    testlabels=labTestSBMM,
    trainlabels=labTrainSBMM,
    Xtrain=Xtrain,
    Xtest=Xtest,
)

BMM iteration: 0, error: 0.05956246079127828
BMM iteration: 1, error: 0.012337373483863457
BMM iteration: 2, error: 0.01266240618360524
BMM iteration: 3, error: 0.04061405027848652
BMM iteration: 4, error: 0.06480344942806185
BMM iteration: 5, error: 0.052282351250163464
BMM iteration: 6, error: 0.05402750929868193
BMM iteration: 7, error: 0.049680640240205895
BMM iteration: 8, error: 0.034007107757107045
BMM iteration: 9, error: 0.02152434792977026
BMM iteration: 10, error: 0.01639227210332293
BMM iteration: 11, error: 0.014915466318168277
BMM iteration: 12, error: 0.013945724166356397
BMM iteration: 13, error: 0.012211535194671557
BMM iteration: 14, error: 0.010027819705238875
BMM iteration: 15, error: 0.00770808272376263
BMM iteration: 16, error: 0.0055139656366097675
BMM iteration: 17, error: 0.003886611842106428
BMM iteration: 18, error: 0.0029741389947466968
BMM iteration: 19, error: 0.0022205666874090335
BMM iteration: 20, error: 0.0018110393985449494
BMM iteration: 21, error: 0

# Overall prediction accuracy

In [22]:
# Compare accuracy across methods: stack the one-row metric tables, one row
# per method, in the same order as the names in `method`.
method = [
    'Naive Bayes',
    'L1 Log Reg',
    'Neural Network',
    'Random Forest',
    'AdaBoost',
    'GradBoost',
    'KMS + Log Reg',
    'SGMM w/ Log Reg',
    'SBMM w/ Log Reg',
]

train_frames = [
    metTrainNB, metTrainL1, metTrainNN, metTrainRF, metTrainXB,
    metTrainGB, metTrainKMS, metTrainSGMM, metTrainSBMM,
]
test_frames = [
    metTestNB, metTestL1, metTestNN, metTestRF, metTestXB,
    metTestGB, metTestKMS, metTestSGMM, metTestSBMM,
]

trainmet = pd.concat(train_frames, ignore_index=True)
testmet = pd.concat(test_frames, ignore_index=True)

# Position 8 puts 'method' directly after the eight leading columns
# ('cluster' ... 'FN'), i.e. immediately before 'FPR'.
trainmet.insert(loc=8, column='method', value=method)
testmet.insert(loc=8, column='method', value=method)

In [23]:
# Show the test-set table from position 8 onward (the 'method' column and
# everything after it), rounded to two decimals.
print('Overall prediction accuracy')
testmet.iloc[:, 8:].round(decimals=2)

Overall prediction accuracy


Unnamed: 0,method,FPR,specificity,sensitivity,precision,accuracy,balanced accuracy,f1,auc
0,Naive Bayes,0.25,0.75,0.68,0.48,0.73,0.71,0.56,0.8
1,L1 Log Reg,0.25,0.75,0.71,0.49,0.74,0.73,0.58,0.82
2,Neural Network,0.18,0.82,0.71,0.58,0.8,0.77,0.64,0.85
3,Random Forest,0.13,0.87,0.65,0.64,0.82,0.76,0.64,0.86
4,AdaBoost,0.27,0.73,0.71,0.48,0.73,0.72,0.57,0.81
5,GradBoost,0.26,0.74,0.75,0.5,0.75,0.75,0.6,0.83
6,KMS + Log Reg,0.18,0.82,0.69,0.57,0.79,0.76,0.62,0.85
7,SGMM w/ Log Reg,0.26,0.74,0.72,0.49,0.74,0.73,0.58,0.82
8,SBMM w/ Log Reg,0.16,0.84,0.68,0.59,0.8,0.76,0.63,0.85


# Cadre-wise prediction accuracy

In [24]:
# Per-cluster test metrics for each clustering model, ordered by the share of
# high-cost cases. After reset_index() the positions picked by
# np.r_[2:4, -2, -1] are columns 2 and 3 plus the last two columns.
for _title, _cluster_metrics in (
    ('Kmeans + LG', metTestKMc),
    ('SGMM w/ LG', metTestSGc),
    ('SBMM w/ LG', metTestSBc),
):
    print(_title)
    _ranked = _cluster_metrics.round(2).sort_values(by='high_cost%').reset_index()
    print(_ranked.iloc[:, np.r_[2:4, -2, -1]])

Kmeans + LG
     size  high_cost%    f1   auc
0  2877.0        0.13  0.57  0.90
1  3047.0        0.16  0.55  0.85
2  3175.0        0.22  0.51  0.76
3  2160.0        0.23  0.60  0.85
4  4991.0        0.24  0.56  0.79
5  3222.0        0.31  0.63  0.82
6  3221.0        0.32  0.65  0.83
7  2477.0        0.43  0.80  0.90
SGMM w/ LG
      size  high_cost%    f1   auc
0    132.0        0.06  0.63  0.95
1    228.0        0.06  0.53  0.92
2    318.0        0.12  0.49  0.91
3    653.0        0.17  0.58  0.87
4    121.0        0.18  0.49  0.86
5    106.0        0.21  0.37  0.83
6  23504.0        0.26  0.58  0.81
7    108.0        0.31  0.66  0.84
SBMM w/ LG
     size  high_cost%    f1   auc
0  3449.0        0.07  0.53  0.91
1  2486.0        0.09  0.61  0.92
2  6521.0        0.20  0.49  0.76
3  3343.0        0.24  0.55  0.79
4  1573.0        0.31  0.62  0.83
5  1012.0        0.33  0.69  0.87
6  2421.0        0.38  0.70  0.83
7  4365.0        0.48  0.71  0.76
