In [1]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score



In [2]:
# Load the census-income CSV and turn it into an all-numeric modelling frame,
# then carve out an 80/20 train/test split.
file=r'/Users/lalitsachan/Dropbox/0.0 Data/census_income.csv'
cd = pd.read_csv(file)

# Binary target: 1 where the raw label equals ' >50K' (note the leading space), else 0.
cd['Y'] = (cd['Y'] == ' >50K').astype(int)

# 'education' is removed before any encoding takes place.
cd = cd.drop(['education'], axis=1)

# Hand-rolled dummy encoding of every object-dtype column.
cat_cols = cd.select_dtypes(['object']).columns
for col in cat_cols:
    counts = cd[col].value_counts()
    # Keep levels occurring more than 99 times, then leave out the last of
    # those (value_counts order), so one retained level never gets a column.
    frequent_levels = counts[counts > 99].index[:-1]
    for level in frequent_levels:
        cd[col + '_' + level] = (cd[col] == level).astype(int)
    # The raw text column is replaced by its indicator columns.
    cd = cd.drop(col, axis=1)
    print(col)

cd_train, cd_test = train_test_split(cd, test_size=0.2, random_state=2)

# Re-number both parts 0..n-1 so positions and index labels coincide.
cd_train = cd_train.reset_index(drop=True)
cd_test = cd_test.reset_index(drop=True)

x_train, y_train = cd_train.drop(['Y'], axis=1), cd_train['Y']
x_test, y_test = cd_test.drop(['Y'], axis=1), cd_test['Y']

workclass
marital.status
occupation
relationship
race
sex
native.country


In [3]:
# First-layer (base) learners of the stacked ensemble.
clf1 = KNeighborsClassifier(n_neighbors=50)

# random_state pins the forests' bootstrap / feature sampling; without it the
# stacked features (and every score computed from them) change on each rerun.
clf2 = RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state=2)
clf3 = RandomForestClassifier(class_weight=None, n_estimators=100, random_state=2)

# Two boosted models that differ only in learning rate.
clf4 = XGBClassifier(n_estimators=150, objective='binary:logistic', learning_rate=.01)
clf5 = XGBClassifier(n_estimators=150, objective='binary:logistic', learning_rate=.1)

# Order matters: the model at position i fills column i of the prediction frames.
Algos = [clf1, clf2, clf3, clf4, clf5]

In [4]:
# Number of training observations; used to size the out-of-fold prediction frame.
rows = len(x_train)

In [5]:
# Echo the training row count (x_train.shape[0]) as the cell output.
rows

26048

In [6]:
# Zero-filled holder for the base models' predictions on the training rows:
# one row per training observation, one float column per base classifier.
layer1 = pd.DataFrame(
    np.zeros((rows, 5)),
    columns=['clf1', 'clf2', 'clf3', 'clf4', 'clf5'],
)

In [7]:
# Preview only the first rows instead of rendering the whole frame.
layer1.head(10)

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0


In [8]:
# 10 contiguous folds taken in row order (no shuffling inside KFold).
kf = KFold(n_splits=10, shuffle=False)

In [9]:
# Fill layer1 with out-of-fold predictions: for every fold, each base model is
# fit on the other nine folds and scores only the held-out chunk, so no row of
# layer1 is predicted by a model that saw that row during fitting.

# layer1 is written by position below, so it must have one row per training row.
assert layer1.shape[0] == x_train.shape[0], "layer1 must be sized to x_train"

for fold, (train, left_out_chunk) in enumerate(kf.split(x_train), start=1):
    print('fold number : ', fold)

    # kf.split yields positional indices, so slice with .iloc; the slices
    # depend only on the fold and are therefore built once, not per classifier.
    x_train_train = x_train.iloc[train]
    y_train_train = y_train.iloc[train]
    x_train_left_out_chunk = x_train.iloc[left_out_chunk]

    for i, clf in enumerate(Algos):
        print('Algo number :', i+1)

        clf.fit(x_train_train, y_train_train)
        # probability of the positive class (second column of predict_proba)
        p = clf.predict_proba(x_train_left_out_chunk)[:, 1]

        # column i of layer1 belongs to the i-th model in Algos
        layer1.iloc[left_out_chunk, i] = p
    

fold number :  1
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  2
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  3
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  4
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  5
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  6
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  7
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  8
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  9
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  10
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5


In [10]:
# Preview the first out-of-fold predictions rather than rendering the whole frame.
layer1.head(10)

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.16,0.130,0.16,0.386297,0.193522
1,0.12,0.015,0.00,0.134280,0.011568
2,0.10,0.000,0.00,0.128753,0.002680
3,0.18,0.175,0.21,0.418074,0.360000
4,0.20,0.000,0.02,0.166988,0.041188
5,0.30,0.000,0.00,0.128753,0.001816
6,0.16,0.230,0.19,0.337832,0.357224
7,0.32,0.840,0.81,0.344730,0.447509
8,0.16,0.005,0.01,0.134280,0.070779
9,0.28,0.000,0.00,0.134280,0.010866


In [11]:
# NOTE: `rows` is rebound here to the test-set size.
rows = len(x_test)

# Zero-filled holder for the base models' predictions on the test rows.
layer2_test = pd.DataFrame(
    np.zeros((rows, 5)),
    columns=['clf1', 'clf2', 'clf3', 'clf4', 'clf5'],
)

In [12]:
# Refit every base model on the full training set and store its positive-class
# probability for the test rows in the matching column of layer2_test.
for position, model in enumerate(Algos):
    print( 'Algo number',position+1)
    model.fit(x_train, y_train)
    test_scores = model.predict_proba(x_test)
    layer2_test.iloc[:, position] = test_scores[:, 1]


Algo number 1
Algo number 2
Algo number 3
Algo number 4
Algo number 5


In [13]:
# Preview the first test-set base-model predictions rather than the whole frame.
layer2_test.head(10)

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.22,0.065,0.09,0.132948,0.051074
1,0.26,0.010,0.01,0.132948,0.031240
2,0.12,0.295,0.21,0.592467,0.581021
3,0.16,0.170,0.20,0.337360,0.305764
4,0.24,0.065,0.07,0.133126,0.028447
5,0.20,0.720,0.74,0.296129,0.543560
6,0.22,0.700,0.79,0.601983,0.758608
7,0.26,0.110,0.16,0.343564,0.122867
8,0.26,0.890,0.84,0.727740,0.975059
9,0.16,0.025,0.02,0.168431,0.043467


In [14]:
# Second-layer (meta) model: a logistic regression created with
# class_weight='balanced'; all other settings are left at library defaults.
logr=LogisticRegression(class_weight='balanced')

In [15]:
# Train the meta model on the out-of-fold base predictions.
logr.fit(X=layer1, y=y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [16]:
# Test-set ROC AUC of the logistic meta model (positive-class probability).
logr_test_scores = logr.predict_proba(layer2_test)[:, 1]
roc_auc_score(y_test, logr_test_scores)

0.9200109334671637

In [17]:
# Alternative second-layer model: a shallow boosted-tree classifier with the
# positive class up-weighted via scale_pos_weight.
xgb2 = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=.1,
    scale_pos_weight=3,
    objective='binary:logistic',
)

In [18]:
# Train the boosted meta model on the out-of-fold base predictions.
xgb2.fit(X=layer1, y=y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=3, seed=0, silent=True, subsample=1)

In [19]:
# Test-set ROC AUC of the boosted meta model (positive-class probability).
xgb2_test_scores = xgb2.predict_proba(layer2_test)[:, 1]
roc_auc_score(y_test, xgb2_test_scores)

0.9200314817280512