In [1]:
import numpy as np,os
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model._logistic import _logistic_regression_path, _logistic_loss, _intercept_dot
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold, GridSearchCV, 
                                     StratifiedShuffleSplit, RandomizedSearchCV, ShuffleSplit)
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.extmath import log_logistic
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from libs import corrupt_label, load_mat, noisy_evaluate

This implementation performs better compared to loss_log sometimes, and sometimes worse. Problem is the margin tends to be pretty wide in both cases. I think, but not absolutely certain, that this is correct, complete implementation.

In [2]:
def log_loss(wp,X,target,C,PN,NP):
    c = wp[-1]
    w = wp[:-1]
    z = np.dot(X,w) + c
    yz = target * z
    nyz = -target * z
    ls = -log_logistic(yz)
    nls = -log_logistic(nyz)
    idx = target==1
    loss = ls.copy()
    loss[idx] = (1-NP)*ls[idx] - PN*nls[idx]
    loss[~idx] = (1-PN)*ls[~idx] - NP*nls[~idx]
    loss = loss/(1-PN-NP) + .5 * C * np.dot(w, w)
    return loss.sum()

In [3]:
class Logit(LogisticRegression,BaseEstimator,ClassifierMixin):
    def __init__(self,PN=.2, NP=.2, robust=True,C=1.0,max_iter=100):
        super().__init__(C=C,max_iter=max_iter)
        self.PN = PN
        self.NP = NP
        self.robust= robust
    
    def fit(self,X,y):
        w0 = np.zeros(X.shape[1]+1)
        target = y.copy()
        target[target==0] = -1
        if self.robust:
            self.r_ = minimize(log_loss,w0,method="L-BFGS-B",args=(X, target, self.C,self.PN,self.NP),
                               options={"maxiter": self.max_iter})
        else:
            self.r_ = minimize(_logistic_loss,w0,method="L-BFGS-B",args=(X, target, self.C),options={"maxiter": self.max_iter})

    def predict(self,X):
        c = self.r_.x[-1]
        w = self.r_.x[:-1]
        z = np.dot(X,w) + c
        y = z.reshape(-1)
        y[y<=0] = 0
        y[y>0] = 1
        return y

In [4]:
def create_noise(y,PN,NP):
    yn = y.copy()
    idp = np.argwhere(y==1).ravel()
    idp = np.random.choice(idp,size=int(len(idp)*PN),replace=False)
    yn[idp] = 0
    idp = np.argwhere(y==0).ravel()
    idp = np.random.choice(idp,size=int(len(idp)*NP),replace=False)
    yn[idp] = 1
    return yn

In [5]:
def compute(clf,X,y,PN,NP):
    scores = []
    for _ in range(5):
        yn = create_noise(y,PN=PN,NP=NP)
        CV = ShuffleSplit(n_splits=3,test_size=.33)
        r = noisy_evaluate(clf,X,yn,y,CV,[accuracy_score])
        scores.append(list(r.values())[0]*100)
        print(list(r.values())[0]*100)
    return round(sum(scores)/len(scores),3)

In [6]:
params = {
    'PN':np.linspace(.1,.40,7),
    'NP':np.linspace(.1,.40,7),
    'C':np.logspace(-4,4,10)
}
lr = Logit(max_iter=400)
grid_cv = StratifiedShuffleSplit(n_splits=2,test_size=.25)
clf = RandomizedSearchCV(lr,params,n_iter=80,cv=grid_cv,n_jobs=2)
#clf = GridSearchCV(lr,params,cv=StratifiedShuffleSplit(n_splits=3))

In [7]:
datasets = ['heart','german','breast_cancer','diabetis','thyroid','image','banana']
#datasets = ['heart','image']
PN = .4
NP = .4
for d in datasets:
    X,y = load_mat(d) 
    print(d,X.shape,np.unique(y,return_counts=True))
    res = compute(clf,X,y,PN,NP)
    print(res)
    print()

heart (270, 13) (array([0, 1], dtype=int64), array([150, 120], dtype=int64))
68.14814814814815
63.703703703703695
56.29629629629629
70.74074074074073
55.925925925925924
62.963

german (1000, 20) (array([0, 1], dtype=int64), array([700, 300], dtype=int64))
63.131313131313135
66.06060606060606
69.29292929292929
66.56565656565657
71.31313131313131
67.273

breast_cancer (263, 9) (array([0, 1], dtype=int64), array([186,  77], dtype=int64))
71.64750957854406
67.04980842911877
59.38697318007663
72.79693486590037
65.90038314176245
67.356

diabetis (768, 8) (array([0, 1], dtype=int64), array([500, 268], dtype=int64))
68.50393700787401
64.43569553805774
67.58530183727034
67.97900262467192
70.60367454068242
67.822

thyroid (215, 5) (array([0, 1], dtype=int64), array([150,  65], dtype=int64))
72.7699530516432
77.46478873239437
80.75117370892019
68.54460093896714
75.5868544600939
75.023

image (2086, 18) (array([0, 1], dtype=int64), array([ 898, 1188], dtype=int64))
76.19738751814224
74.79438800193