In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

train = pd.read_csv("train.csv")
warnings.filterwarnings('ignore')
#train.drop(['education'], axis=1,inplace=True)
#train.drop(['relationship'], axis=1,inplace=True)

%matplotlib inline

In [21]:
y_label = train['income']
train.drop(['income'], axis=1, inplace=True)

train = pd.get_dummies(train)

### Data Preprocessing

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf_clf = RandomForestClassifier(n_estimators=100,max_features='sqrt',max_depth=None,min_samples_split=2,bootstrap=True)
rf_scores = cross_val_score(rf_clf,train,y_label,cv=10)
rf_scores

array([0.8569047 , 0.84912476, 0.85218116, 0.854404  , 0.85579328,
       0.86190609, 0.85495971, 0.84745763, 0.86079467, 0.85376703])

In [23]:
rf_clf.fit(train,y_label)
importances=rf_clf.feature_importances_
names=train.columns.values.tolist()
sort_lst=sorted(zip(map(lambda x: round(x,3),importances),names), reverse=True)
#importances

In [24]:
train_new = pd.read_csv("train.csv")
train_new.drop(['income'], axis=1,inplace=True)

new_country = []
for item in list(train_new['native-country']):
    if item != 'United-States':
        new_country.append('Others')
    else:
        new_country.append(item)
        

train_new['native-country'] = new_country
train_new = pd.get_dummies(train_new)

### Customized Score Function

In [45]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics.scorer import make_scorer

def diff(pred, x_test):
    gender = list(x_test)
    g0_actual = [i for i,label in enumerate(gender) if label==0]
    g1_actual = [i for i,label in enumerate(gender) if label==1]
    pred_1 = [i for i,label in enumerate(pred) if label==1]
    
    diff = abs((len(set(g0_actual)&set(pred_1))/len(g0_actual))-(len(set(g1_actual)&set(pred_1))/len(g1_actual)))
    return diff

def score(diff, accuracy):
    if diff<=0.1:
        return accuracy
    else:
        score=accuracy-7**(diff-0.1)+1
        return score

def loss_function(y,y_pred,greater_is_better=True):
    acc=(y==y_pred).mean()
    diff_=diff(y,train['gender'])
    return score(diff_,acc)

my_scorer=make_scorer(loss_function,greater_is_better=True)

### Model Training

#### i. Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

best_score_lr=0
for c in [0.001,0.01,0.1,1,10,50]:
    for weight in [None,'balanced']:
        fair_=[]
        time=0
        while (time<5):
            x_train,x_test,y_train,y_test = train_test_split(train_new,y_label,test_size=0.25,random_state=None)
            lr = LogisticRegression(penalty='l2',class_weight=weight,C=c).fit(x_train,y_train)
            acc=lr.score(x_test,y_test)
            pred = list(lr.predict(x_test))
            gender = list(x_test['gender'])
            g0_act = [i for i,label in enumerate(gender) if label==0]
            g1_act = [i for i,label in enumerate(gender) if label==1]
            label_1_pred = [i for i,label in enumerate(pred) if label==1]
            
            diff=abs((len(set(g0_act)&set(label_1_pred))/len(g0_act))-(len(set(g1_act)&set(label_1_pred))/len(g1_act)))
            if diff <= 0.1:
                fair_.append(acc)
            else:
                acc = acc-7**(diff-0.1)+1
                fair_.append(acc)
            time += 1
            
        mean = np.mean(fair_)
        if mean > best_score_lr:
            best_score_lr = mean
            best_param_lr = {'class_weight':weight,'C':c}

best_score_lr

0.8000444592641992

#### ii. XGBoost

In [37]:
import xgboost as xgb

best_score_xgb = 0
for est in [50,100,500]:
    for lr in [0.0005,0.001,0.005,0.1,1]:
        fair_=[]
        time=0
        while (time<5):
            x_train,x_test,y_train,y_test = train_test_split(train_new,y_label,test_size=0.25,random_state=None)
            xgb_lr = xgb.XGBClassifier(learning_rate=lr, n_estimators=est).fit(x_train,y_train)
            acc=xgb_lr.score(x_test,y_test)
            pred = list(xgb_lr.predict(x_test))
            gender = list(x_test['gender'])
            g0_act = [i for i,label in enumerate(gender) if label==0]
            g1_act = [i for i,label in enumerate(gender) if label==1]
            label_1_pred = [i for i,label in enumerate(pred) if label==1]
            
            diff=abs((len(set(g0_act)&set(label_1_pred))/len(g0_act))-(len(set(g1_act)&set(label_1_pred))/len(g1_act)))
            if diff <= 0.1:
                fair_.append(acc)
            else:
                acc = acc-7**(diff-0.1)+1
                fair_.append(acc)
            time += 1
            
        mean = np.mean(fair_)
        if mean > best_score_xgb:
            best_score_xgb = mean
            best_param_xgb = {'n_estimators':est, 'learning_rate':lr}
            
best_score_xgb

0.7535037996958146

#### iii. Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

best_score_rf = 0
for est in [50,100,200,500,1000]:
    for fea in np.arange(4,22,4):
        fair_=[]
        time=0
        while (time<5):
            x_train,x_test,y_train,y_test = train_test_split(train_new,y_label,test_size=0.25,random_state=None)
            rf = RandomForestClassifier(n_estimators=est,max_features=fea,max_depth=None,min_samples_split=2,bootstrap=True).fit(x_train,y_train)
            acc = rf.score(x_test, y_test)
            pred = list(rf.predict(x_test))
            gender = list(x_test['gender'])
            g0_act = [i for i,label in enumerate(gender) if label==0]
            g1_act = [i for i,label in enumerate(gender) if label==1]
            label_1_pred = [i for i,label in enumerate(pred) if label==1]

            diff = abs((len(set(g0_act)&set(label_1_pred))/len(g0_act))-(len(set(g1_act)&set(label_1_pred))/len(g1_act)))
            if diff <= 0.1:
                fair_.append(acc)
            else:
                acc = acc-7**(diff-0.1)+1
                fair_.append(acc)
            time += 1
        
        mean = np.mean(fair_)
        if mean > best_score_rf:
            best_score_rf = mean
            best_param_rf = {'n_estimators':est, 'max_features':fea}
            
best_score_rf

0.7020387847207541

We can conclude that for XGBoost, the optimal model has an overall score of 0.75 (almost has DDP value of 0.17), while Random Forest only has 0.70 at most. In terms of Logistic Regression classifier, it has the accuracy of 0.80 but a lower bias (DDP value is only 0.05).

### VoteClassifier to combine LR & XGBoost

In [47]:
from sklearn.ensemble import VotingClassifier

xgb_model = xgb.XGBClassifier(learning_rate=best_param_xgb['learning_rate'], 
                              n_estimators=best_param_xgb['n_estimators'])

lr_model = LogisticRegression(penalty='l2',class_weight=best_param_lr['class_weight'],
                              C=best_param_lr['C'])

estimators=[('rf', lr_model), ('xgb', xgb_model)]
ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(train_new, y_label)

loss_function(ensemble.predict(train_new), y_label)

0.8222463043236634

In [50]:
## Two XGBoost models and one LR model get a best loss_function_score, which is 0.836
estimators_2=[('rf', lr_model), ('xgb', xgb_model),('xgb_2', xgb_model)]
ensemble_2 = VotingClassifier(estimators_2, voting='soft')
ensemble_2.fit(train_new, y_label)

loss_function(ensemble_2.predict(train_new), y_label)

0.8358351001460346

For further improvement, we would import voting classifier, which includes both XGBoost and Logistic Regression. That means we could achieve higher accuracy by just using Logistic Regression or lower bias by just using XGBoost. Comparing two ways of handling features, we used two XGBoost models and one Logistic Regression model to do the voting, causing a result of 0.836 accuracy and 0.095 DDP.