In [41]:
%matplotlib inline
import random
import pandas as pd
import pylab
import numpy as np
import sklearn
from sklearn import linear_model
import sklearn.preprocessing as preprocessing
import math
import cvxpy as cp
def data_transform(df):
    """Normalize features."""
    binary_data = pd.get_dummies(df)
    feature_cols = binary_data[binary_data.columns[:-2]]
    scaler = preprocessing.StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(feature_cols), columns=feature_cols.columns)
    return data

In [2]:
pd.__version__

'1.4.4'

In [3]:
features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"] 

In [4]:
train_url = 'adult/adult.data'
test_url = 'adult/adult.test'

original_train = pd.read_csv(train_url, names=features, sep=r'\s*,\s*', 
                             engine='python', na_values="?")
original_test = pd.read_csv(test_url, names=features, sep=r'\s*,\s*', 
                            engine='python', na_values="?", skiprows=1)

In [5]:
num_train = len(original_train)
original = pd.concat([original_train, original_test])
roc_original = original
labels = original['Target']
labels = labels.replace('<=50K', 0).replace('>50K', 1)
labels = labels.replace('<=50K.', 0).replace('>50K.', 1)
sensitive = original['Sex']
sensitive = sensitive.replace('Male', 0).replace('Female', 1)
# Redundant column
del original["Education"]
# Remove target variable
del original["Target"]
#del original["Sex"]


In [6]:
#binary_data = pd.get_dummies(original)
#data=pd.concat([binary_data, sensitive], axis=1)
data = data_transform(original)

train_data = data[:num_train].values
train_labels = labels[:num_train].values
test_data = data[num_train:].values
test_labels = labels[num_train:].values
p=train_data.shape[1]

In [7]:
def loss(X,Y,theta):
    eta=np.dot(X,theta)
    return -(np.dot(Y,eta)-np.sum(np.log(1+np.exp(eta))))

def f_grad(X,Y,theta):
    mu=1/(1+np.exp(-np.dot(X,theta)))
    return (np.dot(X.T,Y-mu))

def predict(theta,X):
    y =1/(1+np.exp(-np.dot(X,theta)))
    return np.array((y.T>0.5).reshape(-1)).astype(int)

def constraints(X,theta,z):
    length=z.shape
    return np.array([np.dot(z,np.dot(X,theta))/length-c,-np.dot(z,np.dot(X,theta))/length-c]).reshape(-1)

def g_grad(X,z):
    length=z.shape
    return np.array([np.dot(z,X)/length,-np.dot(z,X)/length])

In [8]:
theta_0=np.ones(p)
Z=sensitive.values
Z=Z-Z.mean()
Z_train=Z[:num_train]
Z_test=Z[num_train:]
c=1.0

In [54]:
def determinstic(X,Y,z,T,C):
    lam=np.array([1.0,1.0])
    theta=theta_p=theta_pp=theta_0
    tau=C/math.sqrt(T)
    eta=C/math.sqrt(T)
    for i in range(T):
        u=2*(constraints(X,theta_p,z)+np.dot(g_grad(X,z),(theta-theta_p)))-(constraints(X,theta_pp,z)+np.dot(g_grad(X,z),(theta_p-theta_pp)))
        #u=2*constraints(X,theta,z)-constraints(X,theta_p,z)
        lam=lam+tau*u
        theta_pp=theta_p
        theta_p=theta
        theta=theta+eta*(f_grad(X,Y,theta)+np.dot(lam,g_grad(X,z)))
        theta=theta+eta*f_grad(X,Y,theta)
        if(i%10==0):
            pred=predict(theta,test_data)
            print(sklearn.metrics.accuracy_score(pred,test_labels))
            print(constraints(test_data,theta,Z_test))
        
    return theta
                                                                    

In [13]:
Beta=determinstic(train_data,train_labels,Z_train,200,0.001)

0.7131625821509735
[-1.55486726 -0.44513274]
0.728579325594251
[-1.29185285 -0.70814715]
0.7695473251028807
[-1.29499323 -0.70500677]
0.7967569559609361
[-1.3079621 -0.6920379]
0.809225477550519
[-1.32127949 -0.67872051]
0.8163503470302806
[-1.33311853 -0.66688147]
0.8200970456360175
[-1.34381262 -0.65618738]
0.822369633314907
[-1.35356556 -0.64643444]
0.8245807997051778
[-1.36236115 -0.63763885]
0.8256249616116946
[-1.37022416 -0.62977584]
0.8264234383637369
[-1.37722346 -0.62277654]
0.826914808672686
[-1.38344352 -0.61655648]
0.827406178981635
[-1.3889691 -0.6110309]
0.8277132854247282
[-1.39387903 -0.60612097]
0.8283889195995332
[-1.39824433 -0.60175567]
0.8291259750629568
[-1.40212796 -0.59787204]
0.8288188686198636
[-1.40558545 -0.59441455]
0.8288802899084823
[-1.40866558 -0.59133442]
0.8294945027946686
[-1.41141124 -0.58858876]
0.8299858731036177
[-1.41386015 -0.58613985]


In [14]:
pred=predict(Beta,test_data)
print(sklearn.metrics.accuracy_score(pred,test_labels))

0.8299858731036177


In [45]:
def stochastic(X_data,Y_data,z_data,C,Batch,Epoch):
    lam=np.array([1.0,1.0])
    theta=theta_p=theta_pp=theta_0
    tau=C/math.sqrt(Epoch)
    eta=C/math.sqrt(Epoch)
    for i in range(Epoch):
        rd=random.randint(0,num_train)
        if rd+Batch>num_train:
            X=np.concatenate((X_data[rd:num_train],X_data[0:Batch+rd-num_train]),axis=0)
            Y=np.concatenate((Y_data[rd:num_train],Y_data[0:Batch+rd-num_train]),axis=0)
            z=np.concatenate((z_data[rd:num_train],z_data[0:Batch+rd-num_train]),axis=0)
        X=X_data[rd:rd+Batch]
        Y=Y_data[rd:rd+Batch]
        z=z_data[rd:rd+Batch]
        u=2*(constraints(X,theta_p,z)+np.dot(g_grad(X,z),(theta-theta_p)))-(constraints(X,theta_pp,z)+np.dot(g_grad(X,z),(theta_p-theta_pp)))
        lam=lam+tau*u
        theta_pp=theta_p
        theta_p=theta
        theta=theta+eta*(f_grad(X,Y,theta)+np.dot(lam,g_grad(X,z)))
        theta=theta+eta*f_grad(X,Y,theta)
        if i%100==0:
            pred=predict(theta,test_data)
            print(sklearn.metrics.accuracy_score(pred,test_labels))
            print(constraints(test_data,theta,Z_test))
        
    return theta

In [57]:
Beta=stochastic(train_data,train_labels,Z_train,0.1,100,10000)

0.6443093176094834
[-0.99912627 -1.00087373]
0.7211473496713962
[-1.34291347 -0.65708653]
0.7278422701308274
[-1.2894367 -0.7105633]
0.7463914992936552
[-1.25239932 -0.74760068]
0.7535777900620355
[-1.31154237 -0.68845763]
0.7681960567532706
[-1.28258401 -0.71741599]
0.7804803144769977
[-1.29220446 -0.70779554]
0.7864381794730053
[-1.29051921 -0.70948079]
0.7925803083348688
[-1.34737258 -0.65262742]
0.7973711688471224
[-1.35881576 -0.64118424]
0.8074442601805786
[-1.35045049 -0.64954951]
0.8036361402862232
[-1.34866489 -0.65133511]
0.8125422271359253
[-1.34510806 -0.65489194]
0.8129107548676371
[-1.33977485 -0.66022515]
0.8147533935261961
[-1.35322739 -0.64677261]
0.8137092316196793
[-1.38255508 -0.61744492]
0.8190528837295007
[-1.36561234 -0.63438766]
0.8214483139856275
[-1.34086204 -0.65913796]
0.8204041520791107
[-1.37372372 -0.62627628]
0.8190528837295007
[-1.38491712 -0.61508288]
0.8186229347091702
[-1.35038398 -0.64961602]
0.8223082120262883
[-1.40061083 -0.59938917]
0.8199127817

In [35]:
beta = cp.Variable(p)
log_likelihood = cp.sum(
    cp.multiply(labels.values, data.values @ beta) - cp.logistic(data.values @ beta)
)
problem = cp.Problem(cp.Maximize(log_likelihood/len(data)))

In [36]:
problem.solve()

-0.41776078467316236

In [49]:
b=beta.value

In [50]:
pred=predict(b,test_data)
print(sklearn.metrics.accuracy_score(pred,test_labels))

0.8322584607825072


In [53]:
print(np.sum((b-Beta)**2)/p)

0.20327206892420407
