In [1]:
import numpy as np, os
import pandas as pd
from sklearn.base import ClassifierMixin,clone,BaseEstimator,clone
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from libs import read_uci, corrupt_label, noisy_evaluate

In [2]:
# Dataset name strings kept in one place.
datasets = [
    'heart',
    'german',
    'breast_cancer',
    'diabetis',
    'thyroid',
    'image',
    'banana',
]

In [51]:
# 2x2 matrix handed to corrupt_label in the following cell.
# NOTE(review): presumably row i holds P(noisy label = j | true label = i) -- confirm against libs.corrupt_label.
cm = np.array([[.9, .1], [.3, .7]])

# NOTE(review): load_data is not among the names imported from libs at the top of the
# notebook (read_uci is); unless it is defined elsewhere, a fresh-kernel run raises
# NameError here -- confirm where it comes from.
X, y = load_data('diabetis')

# Show the feature-matrix shape and the per-class label counts.
(X.shape, np.unique(y, return_counts=True))

((768, 8), (array([0, 1], dtype=int32), array([500, 268])))

In [52]:
# Draw one noisy copy of the labels using the configured matrix `cm`.
yn = corrupt_label(y, cm)

# Empirical confusion matrix of this particular draw, each row normalised over the
# true class. It is stored under its own name so that `cm` is NOT overwritten:
# re-running this cell, or any later corrupt_label(y, cm) call, would otherwise
# corrupt with the realised rates of one draw instead of the configured ones.
cm_hat = confusion_matrix(y, yn, normalize='true')

# a_best = (1 - [share of true 1 labelled 0] + [share of true 0 labelled 1]) / 2,
# computed from the realised rates in cm_hat.
a_best = (1 - cm_hat[1][0] + cm_hat[0][1]) / 2
cm_hat, a_best

(array([[0.916     , 0.084     ],
        [0.29104478, 0.70895522]]), 0.39647761194029846)

In [53]:
# 20 evenly spaced candidate values between 0.01 and 0.99 (inclusive).
alphas = np.linspace(0.01, .99, 20)

# One class_weight mapping per candidate: class 1 gets 1 - alpha, class 0 gets alpha.
vals = [{1: 1 - alpha, 0: alpha} for alpha in alphas]

# Parameter grid with a single searched parameter.
params = dict(class_weight=vals)

In [54]:
# Grid-search the SVC class weights over `params`, fitting on the noisy labels yn.
svc = SVC(gamma=1.0)

# random_state pins the three stratified shuffle splits, so re-running this cell on
# the same (X, yn) scores every candidate on identical folds instead of fresh random ones.
clf = GridSearchCV(
    svc,
    params,
    cv=StratifiedShuffleSplit(n_splits=3, random_state=0),
)
clf = clf.fit(X, yn)

# Display the class_weight mapping that scored best in the search.
clf.best_params_

{'class_weight': {1: 0.6289473684210527, 0: 0.37105263157894736}}

In [55]:
# Average the first noisy_evaluate result over five independent label corruptions.
scores = []
# A named loop variable is used instead of `_`, which IPython reserves for the last cell output.
for trial in range(5):
    # Fresh corruption for this trial, kept under its own name so the notebook-level
    # `yn` that the grid search was fitted on is not silently replaced.
    yn_trial = corrupt_label(y, cm)
    # Seeding with the trial index gives every trial its own folds that are still
    # repeatable from run to run.
    # NOTE(review): corrupt_label's own randomness is not controlled here -- seed it too if it allows.
    CV = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=trial)
    r = noisy_evaluate(clf, X, yn_trial, y, CV, [accuracy_score])
    # Keep the first value of the returned mapping.
    # NOTE(review): presumably one entry per metric passed in -- confirm in libs.noisy_evaluate.
    scores.append(list(r.values())[0])
sum(scores)/len(scores)

0.6593956370426959