In [3]:
import numpy as np,os,sys
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model._logistic import _logistic_regression_path, _logistic_loss, _intercept_dot
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold, GridSearchCV, 
                                     StratifiedShuffleSplit, RandomizedSearchCV, ShuffleSplit)
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, matthews_corrcoef
from sklearn.utils.extmath import log_logistic
from scipy.optimize import minimize
import matplotlib.pyplot as plt
sys.path.insert(1, os.path.join(sys.path[0], '.'))
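# Project helpers expected in scope: corrupt_label, load_mat, noisy_evaluate -- no import for them appears in this cell (TODO: restore it; load_mat and noisy_evaluate are called further down)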

ValueError: attempted relative import beyond top-level package

This implementation sometimes performs better than `loss_log` and sometimes worse. The problem is that the margin tends to be pretty wide in both cases. I think, but am not absolutely certain, that this is a correct and complete implementation.

In [None]:
def log_loss(wp,X,target,C,PN,NP):
    """Noise-corrected logistic loss, summed over samples, plus an L2 penalty.

    Intended as the objective for ``scipy.optimize.minimize`` (e.g. with
    ``method="L-BFGS-B"``).

    Parameters
    ----------
    wp : 1-D array of length M + 1
        Coefficients followed by the intercept (last element).
    X : array of shape (N, M)
        Data matrix.
    target : array of length N
        Labels encoded as +1 (positive) / -1 (negative).
    C : float
        Weight of the L2 penalty ``0.5 * C * ||w||^2``; larger means stronger
        regularization. The intercept is not penalized.
    PN : float
        Fraction of positive samples labeled as negative.
    NP : float
        Fraction of negative samples labeled as positive.

    Returns
    -------
    float
        ``sum_i l_hat(z_i, y_i) / (1 - PN - NP) + 0.5 * C * ||w||^2``.
        With ``PN = NP = 0`` this is the plain regularized logistic loss.

    Raises
    ------
    ValueError
        If ``PN + NP >= 1``, where the correction is undefined (division by
        zero) or flips the sign of the loss.
    """
    scale = 1 - PN - NP
    if scale <= 0:
        raise ValueError("PN + NP must be smaller than 1")

    target = np.asarray(target)
    w, c = wp[:-1], wp[-1]
    z = np.dot(X, w) + c
    yz = target * z

    # log(1 + exp(-a)) written with logaddexp so large |a| does not overflow
    ls = np.logaddexp(0, -yz)    # l(t, y)
    nls = np.logaddexp(0, yz)    # l(t, -y)

    is_pos = target == 1
    corrected = np.where(
        is_pos,
        (1 - NP) * ls - PN * nls,    # modified loss for P-labeled samples
        (1 - PN) * ls - NP * nls,    # modified loss for N-labeled samples
    )
    # The penalty is added once to the total, not once per sample.
    return corrected.sum() / scale + .5 * C * np.dot(w, w)

In [None]:
class Logit(LogisticRegression):
    """Logistic regression fitted by direct minimization of a (noise-corrected) loss.

    Only ``LogisticRegression`` is listed as a base: it already provides the
    estimator/classifier base classes, and repeating them after it can make
    the method resolution order inconsistent.

    Parameters
    ----------
    PN : float, default .2
        Assumed fraction of positive samples labeled as negative.
    NP : float, default .2
        Assumed fraction of negative samples labeled as positive.
    robust : bool, default True
        If True minimize ``log_loss`` (uses PN / NP); otherwise minimize
        ``_logistic_loss``.
    C : float, default 1.0
        Passed to the objective as its regularization argument.
    max_iter : int, default 100
        Iteration cap handed to L-BFGS-B.

    Notes
    -----
    ``fit`` stores only the optimizer result in ``r_``; ``coef_``,
    ``intercept_`` and ``classes_`` are not set here.
    """
    def __init__(self,PN=.2, NP=.2, robust=True,C=1.0,max_iter=100):
        super().__init__(C=C,max_iter=max_iter)
        self.PN = PN
        self.NP = NP
        self.robust= robust

    def fit(self,X,y):
        """Fit on ``X`` / ``y`` and return ``self``.

        Labels equal to 0 are mapped to -1 for the objective; every other
        label value is passed through unchanged. ``y`` itself is not modified.
        """
        labels = np.asarray(y)
        target = np.where(labels == 0, -1, labels)
        w0 = np.zeros(X.shape[1]+1)    # coefficients + intercept, all zero
        if self.robust:
            objective = log_loss
            extra = (X, target, self.C, self.PN, self.NP)
        else:
            objective = _logistic_loss
            extra = (X, target, self.C)
        self.r_ = minimize(objective, w0, method="L-BFGS-B", args=extra,
                           options={"maxiter": self.max_iter})
        return self

    def predict(self,X):
        """Return 1.0 where the linear score is positive and 0.0 elsewhere."""
        solution = self.r_.x
        w, c = solution[:-1], solution[-1]
        scores = np.ravel(np.dot(X, w) + c)
        return np.where(scores > 0, 1.0, 0.0)

In [None]:
def create_noise(y,PN,NP,random_state=None):
    """Return a copy of ``y`` with a share of its 0/1 labels flipped.

    Parameters
    ----------
    y : 1-D array of labels; entries equal to 1 are treated as positive and
        entries equal to 0 as negative. ``y`` is not modified.
    PN : float
        Fraction of the positive samples relabeled as 0.
    NP : float
        Fraction of the negative samples relabeled as 1.
    random_state : None, int, or object with a ``choice`` method, optional
        ``None`` (default) draws from the global ``np.random`` state, exactly
        as before this parameter existed. An int seeds a private
        ``np.random.RandomState``; any other object is used as the generator.

    Returns
    -------
    Copy of ``y`` in which ``int(n_pos * PN)`` positives and
    ``int(n_neg * NP)`` negatives, chosen without replacement, are flipped.
    """
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "choice"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    yn = y.copy()
    # Positives are drawn first, then negatives; both are selected from the
    # clean labels in `y`, so a sample is never flipped twice.
    for label, flipped, rate in ((1, 0, PN), (0, 1, NP)):
        members = np.flatnonzero(y == label)
        picked = rng.choice(members, size=int(len(members) * rate), replace=False)
        yn[picked] = flipped
    return yn

In [None]:
def compute(clf,X,y,PN,NP):
    """Average score (in percent) of ``clf`` over five fresh noise draws.

    Each repetition corrupts ``y`` with ``create_noise``, hands the noisy and
    clean labels to ``noisy_evaluate`` together with a new 3-split
    ``ShuffleSplit`` (33% test) and ``accuracy_score``, prints the resulting
    percentage, and records it. The mean of the five values, rounded to three
    decimals, is returned.

    NOTE(review): ``noisy_evaluate`` is defined outside this notebook section;
    it is assumed to return a mapping whose first value is the score.
    """
    scores = []
    for _ in range(5):
        noisy_labels = create_noise(y, PN=PN, NP=NP)
        splitter = ShuffleSplit(n_splits=3, test_size=.33)
        result = noisy_evaluate(clf, X, noisy_labels, y, splitter, [accuracy_score])
        percent = list(result.values())[0] * 100
        scores.append(percent)
        print(percent)
    mean_score = sum(scores) / len(scores)
    return round(mean_score, 3)

In [None]:
# Search space for the noise-aware model: both assumed flip rates and the
# regularization argument C.
params = dict(
    PN=np.linspace(0.1, 0.4, 7),
    NP=np.linspace(0.1, 0.4, 7),
    C=np.logspace(-4, 4, 10),
)
lr = Logit(max_iter=400)
grid_cv = StratifiedShuffleSplit(n_splits=2, test_size=0.25)
# 20 random draws from `params`, each scored on the two stratified splits.
robust_lr = RandomizedSearchCV(
    estimator=lr,
    param_distributions=params,
    n_iter=20,
    cv=grid_cv,
    n_jobs=3,
)
#clf = GridSearchCV(lr,params,cv=StratifiedShuffleSplit(n_splits=3))

In [None]:
# Benchmark datasets evaluated under injected label noise.
datasets = ['heart','german','breast_cancer','diabetis','thyroid','image','banana']
#datasets = ['heart','image']
# Flip rates injected by compute() -> create_noise().
PN = .4
NP = .4
for d in datasets:
    X,y = load_mat(d)
    print(d,X.shape,np.unique(y,return_counts=True))
    # `clf` is never assigned in the notebook (its only definition is
    # commented out), so evaluate the randomized search over Logit instead.
    res = compute(robust_lr,X,y,PN,NP)
    print(res)
    print()

## Imbalanced - JIRA Datasets

In [None]:
from libs import read_jira
# Entries of JIRA/ whose name contains "csv"; sorted because os.listdir
# returns them in arbitrary order.
datasets = sorted(f for f in os.listdir("JIRA/") if 'csv' in f)
# Pick up to five *distinct* files: without replace=False the same dataset
# could be drawn several times, and the cap avoids asking for more files
# than exist.
datasets = np.random.choice(datasets,size=min(5,len(datasets)),replace=False)

In [None]:
# Baseline: plain logistic regression with C tuned by an inner grid search.
grid_cv = StratifiedShuffleSplit(n_splits=2, test_size=0.25)
lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=800),
    param_grid={'C': np.logspace(-4, 4, 10)},
    cv=grid_cv,
    n_jobs=3,
)
# Metrics handed to noisy_evaluate and the outer resampling scheme.
SCORERS = [accuracy_score, f1_score, matthews_corrcoef]
CV = StratifiedShuffleSplit(n_splits=10, test_size=0.20)

In [None]:
# Compare the baseline and the noise-aware search on every selected JIRA file.
for d in datasets:
    X, yn, y = read_jira(d)
    cm = confusion_matrix(y, yn)    # confusion between the two label vectors (kept for inspection)
    rlr = noisy_evaluate(lr, X, yn, y, CV, SCORERS)
    rrobust = noisy_evaluate(robust_lr, X, yn, y, CV, SCORERS)
    for m in SCORERS:
        print(m.__name__)
        # results are looked up by the scorer function itself
        for tag, outcome in (("LR:", rlr), ("Robust:", rrobust)):
            print(tag, outcome[m])
    print()

In [None]:
%%time
# Time a single fit of the randomized search. X and yn are not defined in this
# cell: they are whatever the last iteration of the `for d in datasets` loop
# above left behind, so this only works when the cells are run in order.
robust_lr.fit(X,yn)