In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scw import SCW1, SCW2
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hinge_loss
from sklearn.utils import shuffle
from skgarden import MondrianForestClassifier
import sys
sys.path.append('../../Passive_Aggressive')
from passive_aggressive import PassiveAggressive


from functions import  NormalExample, cesa_up_final, Touati_up_final, cesa_up_bd, Touati_up_bd, Touati_up_bd_2, Touati_up_final_2, f_t_Touati, f_t_cesa
%matplotlib 

%load_ext autoreload
%autoreload 2

Using matplotlib backend: TkAgg


In [4]:
# Linearly Separable Data
example1 = NormalExample(dim=2, mean=[0.0, 0.0], cov=[[1.1, 0.1], [0.1, 1.1]])
example2 = NormalExample(dim=2, mean=[3, 3], cov=[[1.2, 0.2], [0.2, 1.2]])

n1 =5000
n2 = 5000
samples1 = example1.get_data(n1)
samples2 = example2.get_data(n2)

# Plotting Data
fig, ax = plt.subplots(figsize=(7, 4))
ax.scatter(samples1[:, 0], samples1[:, 1],
            c='#FF0000', marker='+', linewidth=1)
ax.scatter(samples2[:, 0], samples2[:, 1],
            c='#0000FF', marker="_", linewidth=1)
fig.tight_layout()

In [5]:
#Non Linearly Separable Data

def nonlinear_model(rseed=42, n_samples=10000):
    radius = 40 * np.random.random(n_samples)
    far_pts = radius > 20
    radius[far_pts] *= 1.2
    radius[~far_pts] *= 1.1
    theta = np.random.random(n_samples) * np.pi * 2
    data = np.empty((n_samples, 2))
    data[:, 0] = radius * np.cos(theta)
    data[:, 1] = radius * np.sin(theta)
    labels = np.ones(n_samples)
    labels[far_pts] = -1
    X1 = pd.DataFrame(data[:,0])
    X2 = pd.DataFrame(data[:,1])
    Y = pd.DataFrame(labels)
    df = pd.concat([X1,X2,Y],axis=1)
    df.columns = ["1","2","y"]
    return df
df_n=nonlinear_model()

In [28]:
# Data 1
#linearly separable data set 

In [6]:
X1 = pd.DataFrame(np.c_[samples1, -np.ones(samples1.shape[0], )])
X2 = pd.DataFrame(np.c_[samples2, np.ones(samples2.shape[0], )])
df = pd.concat([X1, X2])
df = shuffle(df).reset_index(drop=True)
df.columns = ['1', '2', 'y']
#sns.scatterplot(data=df, x='1', y='2', hue='y')
sns.scatterplot(data=df, x='1', y='2', hue='y')
#df.head()
#df_n.head()

<matplotlib.axes._subplots.AxesSubplot at 0x1c1271d0>

In [7]:
from tqdm import tqdm_notebook
def model(example,model_type = "Passive agressive"):
    if example == "bounds on linearly seprable simulation":
        df = pd.concat([X1, X2])
        df = shuffle(df).reset_index(drop=True)
        df.columns = ['1', '2', 'y']
        features = ['1', '2']
        target = 'y'
        X = df[features].values
        y = df[target].values
        all_ranges = [[2,100], [100, 1000], [1000, 5000], [5000, len(X)]]
    elif example == "bounds on non linearly seprable simulation":
        df=nonlinear_model()
        features = ['1', '2']
        target = 'y'
        X = df[features].values
        y = df[target].values
        all_ranges = [[2,100], [100, 1000], [1000, 5000], [5000, len(X)]]
    elif example == "bounds on ionosphere data set":
        df = pd.read_csv('../data/ionosphere.Data', header=None)
        target = 34
        features = df.columns.difference([target])
        df[target] = (df[target] == 'g') * 2 - 1
        X = df[features].values
        y = df[target].values
        all_ranges = [[2, 50], [50, 100], [100, 200], [200, len(X)]]
    elif example == "bounds on breast cancer data set":
        import sklearn.datasets
        df = sklearn.datasets.load_breast_cancer()
        df = pd.concat([pd.DataFrame(df['data']), pd.Series(df['target'], name='y')], 1)
        target = 'y'
        features = df.columns.difference([target])
        X = df[features].values
        y = df[target].values
        y = (y == 1) * 2 - 1.
        all_ranges = [[2, 100], [100,200], [200, 350], [350, len(X)]]

    all_list_n = []
    all_missclass = []
    list_k = []
    print(X.shape)
    for spread in tqdm_notebook(all_ranges):
        list_n = np.linspace(spread[0], spread[1], 10, endpoint=False).astype(int)

        missclas = np.zeros((len(list_n), ))
        for k, n in enumerate(list_n):
            model_pa = PassiveAggressive()
            model_scw = SCW1(C=1, ETA=1)
            #model_keras = keras_model(X)
            model = model_pa
            if model_type == "Passive agressive":
                model = model_pa # IMPORTANT: Change this to change the model
            else:
                model = model_scw
            n_features = len(features)
            scores = np.zeros((n-1, ))
            losses = np.zeros((n-1, ))

            # We fit the first element
            x0 = X[0].reshape(1, -1)
            y0 = y[0].reshape(-1, 1)

            if type(model) == PassiveAggressive:
                 model.fit(x0[0], y0[0][0])
            elif type(model) == SCW1:
                model = model.fit(x0, y0)
            elif type(model) == MondrianForestClassifier:
                model = model.fit(x0, y0)
            elif type(model) == KerasClassifier:
                model = model.fit(x0, y0)

            y_pred = np.zeros((n-1, ))
            y_pred_max = np.zeros((n-1, ))
            for i in range(0, n-1):
                # Prediction
                if type(model) == PassiveAggressive:
                    w = model.w
                elif type(model) == SCW1:
                    w = model.weights.ravel()
                X_next = X[i+1]
                y_pred[i] = 1 if (np.dot(X_next, w) > 0) else - 1

                xi = df[features].iloc[i+1].to_dict()

                # Fitting algorithm with new data
                if type(model) == PassiveAggressive:
                    model.fit(X_next, y[i+1])
                elif type(model) == SCW1:
                    model.update(X_next.reshape(-1, 1), y[i+1])

            missclas[k] = 1 - accuracy_score(y[1:n], y_pred)
        all_list_n.append(list_n)
        all_missclass.append(missclas)
    
    return all_list_n,all_missclass

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize

def Touati_up_bd(x, n , delta): 
    y = x - np.sqrt( ( (1-x)**2/np.log(1/x) ) * np.log(1/delta)/n)
    return y
def touati_bound():
    n = 100
    delta = 0.01

    all_y = np.linspace(0, 1, 100)
    all_x = []
    for y in all_y:
        error = lambda x: (Touati_up_bd(x, n, delta) - y)**2
        res = scipy.optimize.minimize(error, x0=0.5, bounds=[(0.01, 0.999)])
        all_x.append(res.x[0])

    bound_touati2 = np.array(all_x)
    return bound_touati2

In [9]:
bound_touati2 = touati_bound()

In [21]:

def plot_one_bound(example,model_type):
    all_list_n,all_missclass = model(example,model_type) 
    delta = 0.01
    a = 0.13 # Worst Case
    a1= optimal_bound(all_list_n,all_missclass) # Optimal bound
    all_bounds_touati = []
    all_bounds_touati1 = []
    all_bounds_cesa = []
    all_bounds_risk=[]
    all_bounds_touati2=[]
    for i, spread in enumerate(all_list_n):
        for j, n in enumerate(spread):
            y = all_missclass[i][j]
            all_bounds_risk.append(y)
            bound_touati1 = Touati_up_final(y, n, delta,a1) 
            bound_touati = Touati_up_final(y, n, delta,a)   
            bound_cesa = cesa_up_final(y, n, delta)
            all_bounds_touati.append(bound_touati)
            all_bounds_touati1.append(bound_touati1)
            all_bounds_cesa.append(bound_cesa)
            all_bounds_touati2.append(bound_touati2)

    all_bounds_touati2 = Touati_up_final_2(all_bounds_risk, n, delta)

    fig, ax = plt.subplots(figsize=(8, 6))
    x = np.concatenate(all_list_n)

    ax.plot(x, all_bounds_risk, label="Empirical risk")
    ax.plot(x, all_bounds_touati1, label="Touati optimal bound, a="+str(round(a1,3)))
    ax.plot(x, all_bounds_touati, label="Touati bound, a="+str(a))
    ax.plot(x, all_bounds_cesa, label="Cesa-Bianchi and Gentile bound")
    ax.plot(x, all_bounds_touati2, label="Touati improved optimal bound")
    ax.set_yscale('log')
    # ax.set_xscale('log')
    ax.plot(x, [1]*len(x), '--', label="Limit")
    ax.set_title(model_type + " : "+example)
    ax.set_ylabel('Risk')
    ax.set_xlabel('n')
    ax.legend(loc=0)
    fig.tight_layout()
    return fig

In [32]:
fig0_scw = plot_one_bound("bounds on linearly seprable simulation","SCW")
fig1_scw = plot_one_bound("bounds on non linearly seprable simulation","SCW")
#fig2_scw = plot_one_bound("bounds on ionosphere data set","SCW")
#fig3_scw = plot_one_bound("bounds on breast cancer data set","SCW")


fig0_pa = plot_one_bound("bounds on linearly seprable simulation","Passive agressive")
fig1_pa = plot_one_bound("bounds on non linearly seprable simulation","Passive agressive")
#fig2_pa = plot_one_bound("bounds on ionosphere data set","Passive agressive")
#fig3_pa = plot_one_bound("bounds on breast cancer data set","Passive agressive")


(10000, 2)

0.2400000000000001
0.14141645704436598
(10000, 2)

0.28000000000000014
0.20493129739416113
(10000, 2)

0.2600000000000001
0.20444679539328245
(10000, 2)

0.28000000000000014
0.24861629174252653


## Optimal bound

In [11]:
def rmse(predictions, targets):
    return np.mean(np.abs(predictions - targets))

def optimal_bound(all_list_n,all_missclass):
    start = 0.13
    end = 9/16
    a = np.arange(start,end,0.01)

    delta = 0.01
    all_bounds_touati = []
    all_bounds_cesa = []
    all_bounds_risk=[]
    list_score = []
    for a_i in a:
        for i, spread in enumerate(all_list_n):
            for j, n in enumerate(spread):
                y = all_missclass[i][j]
                all_bounds_risk.append(y)
                bound_touati = Touati_up_final(y, n, delta,a_i)
                bound_cesa = cesa_up_final(y, n, delta)
                all_bounds_touati.append(bound_touati)
                all_bounds_cesa.append(bound_cesa)

        score = rmse(np.array(bound_touati),np.array(all_bounds_risk))
        list_score.append(score)

    fig, ax = plt.subplots(figsize=(7, 4))
    x = np.concatenate(all_list_n)


    ax.plot(a, list_score, label="Empirical risk")

    fig.tight_layout()
    print(a[np.argmin(list_score)])
    print(min(list_score))
    #s = pd.Series(all_missclass)
    #print(s.isna().sum())
    #print(s)
    return a[np.argmin(list_score)]

In [3]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
%matplotlib notebook

style.use('fivethirtyeight')

fig = plt.figure(figsize=(15,8))
ax = fig.add_subplot(1,1,1)

def animate(i):
    delta = 0.01

    all_bounds_touati = []
    all_bounds_cesa = []
    all_bounds_risk=[]
    for i, spread in enumerate(all_list_n):
        for j, n in enumerate(spread):
            y = all_missclass[i][j]
            all_bounds_risk.append(y)
            bound_touati = Touati_up_final(y, n, delta,a1)
            bound_touati1=Touati_up_final(y, n, delta,a2)
            bound_cesa = cesa_up_final(y, n, delta)
            all_bounds_touati.append(bound_touati)
            all_bounds_cesa.append(bound_cesa)
        x = np.concatenate(all_list_n)

    ax1.clear()

    ax.plot(x, all_bounds_risk, label="Empirical risk")

    ax.plot(x, all_bounds_touati, label="Touati bound")
    ax.plot(x, all_bounds_cesa, label="Cesa bound")
    ax.set_yscale('log')
    # ax.set_xscale('log')
    ax.plot(x, [1]*len(x), '--', label="Limit")
    ax.set_title('Your title')
    ax.set_ylabel('Risk')
    ax.set_xlabel('n')
    ax.legend(loc=0)    
    
ani = animation.FuncAnimation(fig, animate, interval=100)
plt.show()



Traceback (most recent call last):
  File "C:\Users\Touati\Anaconda3\lib\site-packages\matplotlib\cbook\__init__.py", line 215, in process
    func(*args, **kwargs)
  File "C:\Users\Touati\Anaconda3\lib\site-packages\matplotlib\animation.py", line 999, in _start
    self._init_draw()
  File "C:\Users\Touati\Anaconda3\lib\site-packages\matplotlib\animation.py", line 1740, in _init_draw
    self._draw_frame(next(self.new_frame_seq()))
  File "C:\Users\Touati\Anaconda3\lib\site-packages\matplotlib\animation.py", line 1762, in _draw_frame
    self._drawn_artists = self._func(framedata, *self._args)
  File "<ipython-input-3-0ed3a59135b1>", line 17, in animate
    for i, spread in enumerate(all_list_n):
NameError: name 'all_list_n' is not defined
