In [19]:
import time
import warnings

import numpy as np
import pandas as pd
import scipy.stats as st
from cvxopt import matrix
from cvxopt import solvers
from scipy.optimize import fsolve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score,f1_score,balanced_accuracy_score,recall_score,precision_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

# NOTE(review): presumably a project-local helper module; it is not visible in this notebook.
import testjx

warnings.filterwarnings('ignore')

class Volatility_uncertain:
    """Binary classifier that passes a linear model's scores through a two-scale normal link.

    The wrapped ``model`` supplies a linear score ``X @ coef_.T + intercept_``.
    ``F_u`` / ``F_L`` map that score to a probability using a piecewise
    expression built from normal CDFs with two scales ``x[0]`` and ``x[1]``.
    ``volatility_uncertainty`` picks the two scales with ``fsolve`` so that the
    extreme moving-window means of the class-weighted residuals vanish.
    """

    def __init__(self,model=None,n=100):
        """Store the wrapped estimator and the moving-window length.

        Parameters
        ----------
        model : estimator, optional
            Must provide ``fit(X, Y)`` and expose ``coef_`` / ``intercept_``
            afterwards. When omitted, a new ``LogisticRegression()`` is built
            for this instance.
        n : int
            Window length used by ``meanuncertainty`` (the stride is ``n // 10``).
        """
        self.n=n
        # Build the default per instance: a default created in the signature is
        # evaluated once and would be shared (and refitted) by every instance.
        self.model=LogisticRegression() if model is None else model

    @staticmethod
    def _two_scale_cdf(neg_scale,pos_scale,arr):
        """Evaluate the piecewise link shared by ``F_u`` and ``F_L`` on an array.

        For ``c < 0`` the value is ``2*neg_scale*Phi(c/neg_scale)/(neg_scale+pos_scale)``;
        otherwise it is ``1 - 2*pos_scale*Phi(-c/pos_scale)/(neg_scale+pos_scale)``,
        where ``Phi`` is the standard normal CDF.
        """
        c=np.asarray(arr,dtype=float)
        total=neg_scale+pos_scale
        below=2*neg_scale*st.norm.cdf(c/neg_scale)/total
        above=1-2*pos_scale*st.norm.cdf(-c/pos_scale)/total
        # Both branches are evaluated for every element; only the matching one is kept.
        return np.where(c<0,below,above)

    def F_u(self,x,arr):
        """Return the link values for ``arr`` using ``x[1]`` below zero and ``x[0]`` from zero up."""
        return self._two_scale_cdf(x[1],x[0],arr)

    def F_L(self,x,arr):
        """Return the link values for ``arr`` using ``x[0]`` below zero and ``x[1]`` from zero up."""
        return self._two_scale_cdf(x[0],x[1],arr)

    def _linear_scores(self,X):
        """Return ``X @ coef_.T + intercept_`` of the already fitted model as a 1-D array."""
        return (np.dot(X,self.model.coef_.T)+self.model.intercept_).reshape(X.shape[0],)

    def error(self,X,Y):
        """Fit the wrapped model on ``(X, Y)`` and return its linear scores on ``X``."""
        self.model.fit(X,Y)
        return self._linear_scores(X)

    def meanuncertainty(self,x):
        """Return ``(min, max)`` of the means of ``x`` over sliding windows.

        Windows have length ``self.n`` and start every ``self.n // 10`` items
        (at least every item, so windows shorter than 10 also work).

        Raises
        ------
        ValueError
            If ``self.n`` is not positive or ``x`` is shorter than one window.
        """
        n=self.n
        if n<=0:
            raise ValueError("window size n must be positive, got %r" % (n,))
        if len(x)<n:
            raise ValueError("window size n=%r exceeds the series length %d" % (n,len(x)))
        # n // 10 is 0 for n < 10, which range() rejects; fall back to a stride of 1.
        step=max(1,n//10)
        r=[np.mean(x[i:i+n]) for i in range(0,len(x)+1-n,step)]
        return min(r),max(r)

    def _residual_extremes(self,x,scores,Y):
        """Return ``[max window-mean of upper residuals, min window-mean of lower residuals]``.

        Residuals are ``Y - F_u(x, scores)`` and ``Y - F_L(x, scores)``; entries
        whose label equals 1 are scaled by ``0.5 * len(Y) / sum(Y)``.
        """
        y=np.asarray(Y,dtype=float)
        err_u=y-self.F_u(x,scores)
        err_L=y-self.F_L(x,scores)
        positives=(y==1)
        if positives.any():
            # The weight is only needed (and only well defined) when label 1 occurs.
            r=0.5*y.shape[0]/np.sum(y)
            err_u[positives]*=r
            err_L[positives]*=r
        return np.array([self.meanuncertainty(err_u)[1],self.meanuncertainty(err_L)[0]])

    def equa_v(self,x,X,Y):
        """Fit the model on ``(X, Y)`` and return the two residual equations evaluated at ``x``."""
        return self._residual_extremes(x,self.error(X,Y),Y)

    def volatility_uncertainty(self,X,Y):
        """Solve the two residual equations for the scale pair, starting from ``[0.5, 1.5]``.

        The model fit does not depend on the unknowns, so it is done once here
        instead of once per ``fsolve`` function evaluation.
        """
        scores=self.error(X,Y)
        mean_lr=fsolve(lambda x:self._residual_extremes(x,scores,Y),[0.5,1.5] )
        return mean_lr

    def predict(self,X,Y,x_te,mean_u):
        """Fit on ``(X, Y)`` and return rounded ``F_u`` probabilities for ``x_te``.

        ``mean_u`` is the scale pair, e.g. the result of ``volatility_uncertainty``.
        """
        self.model.fit(X,Y)
        prob_y=self.F_u(mean_u,self._linear_scores(x_te))
        prdict_y=np.round(prob_y)
        return prdict_y

In [21]:
# Shared splitter used by voluncertain_CV: 10 stratified folds, one repeat, fixed seed.
cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)

def voluncertain_CV(X,y,model=None):
    """Cross-validate a ``Volatility_uncertain`` model with the module-level ``cv`` splitter.

    Parameters
    ----------
    X, y : array-like
        Features and binary labels; both are indexed with the integer index
        arrays produced by ``cv.split``.
    model : Volatility_uncertain, optional
        Model to evaluate. When omitted, a new ``Volatility_uncertain()`` is
        created for this call.

    Returns
    -------
    tuple
        ``(volatility, bacc, f2, rec, pre)``: the scale pair estimated on
        ``(X, y)`` followed by per-fold lists of balanced accuracy, F2 score,
        recall and precision.
    """
    # Create the default here: a default built in the signature is evaluated
    # once at definition time and would be shared by every call.
    if model is None:
        model=Volatility_uncertain()

    bacc,f2,rec,pre=list(),list(),list(),list()
    # NOTE(review): the scales are estimated on the full (X, y), i.e. including
    # every fold's held-out rows, before the folds are scored — confirm this is intended.
    volatility=model.volatility_uncertainty(X,y)
    for train_index,test_index in cv.split(X,y):
        prdict_y=model.predict(X[train_index],y[train_index],X[test_index],volatility)
        y_true=y[test_index]
        rec.append(recall_score(y_true,prdict_y))
        pre.append(precision_score(y_true,prdict_y))
        bacc.append(balanced_accuracy_score(y_true,prdict_y))
        f2.append(fbeta_score(y_true,prdict_y,beta=2))
    return volatility,bacc,f2,rec,pre


In [12]:
# Read the E. coli table and collapse 'Class' into a binary target:
# the string labels '0'-'3' become 0, every other label becomes 1.
ecoli=pd.read_csv('/Users/lvjingzhe/Desktop/璇/modified_althogram/DRM/data/ecoli_.csv')
L=[0 if k in ['0','1','2','3'] else 1 for k in ecoli['Class'].values]
ecoli['Class']=L

# Shuffle the rows in place with a fixed seed so the ordering is repeatable.
E=np.array(ecoli)
np.random.seed(4123)
np.random.shuffle(E)

# Every column but the last is divided by its column maximum; the last column
# is taken as the label (presumably 'Class' is the last CSV column — confirm).
features=E[:,:-1]
X_sample=features/np.max(features,axis=0)
y_sample=E[:,-1]


In [13]:
# Hold out 20% of the samples as the final test set (shuffled, fixed seed).
x_tr, x_te, y_tr, y_te = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    shuffle=True,
    random_state=10,
)

In [24]:
# Grid-search the moving-window length: cross-validate each candidate on the
# training split and keep the window with the best mean balanced accuracy.
metrics_names=['Balanced_acc','F2_score','Recall','Precision']
all_results_m = []
N_list=np.array(range(50,200,5))
# `c` and `strat` are not read anywhere in the visible cells; kept in case later cells use them.
c=0
strat=time.time()

for k in N_list:
    M=voluncertain_CV(x_tr,y_tr,model=Volatility_uncertain(n=k))
    # M[0] is the scale pair; M[1:] are the per-fold score lists, averaged per metric.
    Lo_score=np.mean(M[1:],axis=1)
    metric_res = {'window':k,'Volati-level': M[0]}
    metric_res.update(zip(metrics_names,Lo_score))
    all_results_m.append(metric_res)


eva=pd.DataFrame(all_results_m)
# Select the score column by name rather than by position so that adding or
# reordering metrics cannot silently change the selection criterion.
bias=eva[eva['Balanced_acc']==eva['Balanced_acc'].max()]
# Several windows can tie for the best score; the first (smallest) one is used.
N_optimal=bias['window'].values[0]
# Format the scalar, not the Series: '%f' % Series fails when rows tie.
# `cv` is configured with n_splits=10, so the message says 10-fold.
print("optimal window size N by 10-fold CV: %.3f "%N_optimal)

optimal window size N by fivefold CV: 65.000 


In [28]:

# Refit on the whole training split with the window chosen by the search cell
# (instead of a hard-coded copy of its printed result) and score the held-out test set.
M=Volatility_uncertain(n=N_optimal)
mean_u=M.volatility_uncertainty(x_tr,y_tr)
predict_y=M.predict(x_tr,y_tr,x_te,mean_u)
print(" Volati-level : ",(mean_u))

print(classification_report(y_te,predict_y))

# Recall of each class, computed once and reused below.
recall_pos=recall_score(y_te,predict_y)
recall_neg=recall_score(y_te,predict_y,pos_label=0)
# beta is at least 2 and grows with the log of (n_samples / n_positives - 1) on the training labels.
f_beta=max(2,np.log(y_tr.shape[0]/sum(y_tr)-1))

print('>%s: Average G-mean:%.3f ' % ('LR_mean:',np.sqrt(recall_pos*recall_neg)))
print('>%s: Average Balanced_Acc: %.3f ' % ('LR_mean:',balanced_accuracy_score(y_te,predict_y)))
print('>%s: Average Fbeta: %.3f' % ('LR_mean:',fbeta_score(y_te,predict_y,beta=f_beta)))
print('>%s: Average Recall: %.3f' % ('LR_mean:',recall_pos))


 Volati-level :  [3.45465833 5.83695363]
              precision    recall  f1-score   support

           0       0.95      0.92      0.93        61
           1       0.44      0.57      0.50         7

    accuracy                           0.88        68
   macro avg       0.70      0.74      0.72        68
weighted avg       0.90      0.88      0.89        68

>LR_mean:: Average G-mean:0.724 
>LR_mean:: Average Balanced_Acc: 0.745 
>LR_mean:: Average Fbeta: 0.548
>LR_mean:: Average Recall: 0.571
