# US CENSUS - Prediction
## 0 - Import some libraries & Datasets

In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.model_selection import GridSearchCV

In [2]:
PATH = '../Data/'
FILE = os.listdir(PATH)
print(FILE)

['census_income_learn.csv', 'census_income_metadata.txt', 'census_income_test.csv', 'test.csv', 'test_1.csv', 'train.csv', 'train_1.csv', 'us_census_full.zip']


In [3]:
df=pd.read_csv(PATH+'train.csv', index_col=0)
test=pd.read_csv(PATH+'test.csv', index_col=0)

## 1 - Dataset transformation

In [4]:
col = df.columns

In [5]:
to_dummy = [c for c in col[:-1] if df[c].nunique()<10]
to_cat = [c for c in col[:-1] if (df[c].nunique()>=10)&(df[c].nunique()<60)]

In [6]:
# Modifies column values to have integers sorted according to column 50k$/y
def from_txt_2_int_train(col):
    li = [(v, (df.loc[df[col]==v, '50k'].sum() / df[df[col]==v].shape[0])) for v in df[col].value_counts().index]
    lis = sorted(li, key=lambda tup: tup[1])
    dico = dict(zip([k for k,v in lis], [k for k,v in enumerate(lis)]))
    return df[col].apply(lambda x: dico[x]), dico

def from_txt_2_int_test(col, dico):
    return test[col].apply(lambda x: dico[x])

In [7]:
transcript_values = {}
for c in to_cat:
    df[c], transcript_values[c] = from_txt_2_int_train(c)
    test[c] = from_txt_2_int_test(c, transcript_values[c])

In [8]:
#Modifies column to have dummy variables
df = pd.get_dummies(df, columns=to_dummy)
test = pd.get_dummies(test, columns=to_dummy)

In [9]:
# Create age dummy
df['age_inf_20'] = df['age'].apply(lambda x: 1 if x<20 else 0)
df['age_bet_20_35_and_60_75'] = df['age'].apply(lambda x: 1 if ((x>=20)&(x<35))|((x>=60)&(x<75)) else 0)
df['age_bet_35_60'] = df['age'].apply(lambda x: 1 if (x>=35)&(x<60) else 0)
df['age_more_75'] = df['age'].apply(lambda x: 1 if x>= 75 else 0)

test['age_inf_20'] = test['age'].apply(lambda x: 1 if x<20 else 0)
test['age_bet_20_35_and_60_75'] = test['age'].apply(lambda x: 1 if ((x>=20)&(x<35))|((x>=60)&(x<75)) else 0)
test['age_bet_35_60'] = test['age'].apply(lambda x: 1 if (x>=35)&(x<60) else 0)
test['age_more_75'] = test['age'].apply(lambda x: 1 if x>= 75 else 0)

In [10]:
[c for c in df.columns if (df[c].dtypes!='int64')&(df[c].dtypes!='uint8')&(df[c].dtypes!='float64')]

[]

## 2 - Features selection

In [11]:
corr_col = [(col, np.abs(df[[col,'50k']].corr().iloc[0,1])) for col in df.columns]
corr_col = sorted(corr_col, key=lambda tup: tup[1], reverse=True)[1:]
print(len(corr_col))

144


In [12]:
# Select the k most correlated columns
to_keep = 20

# Display the k most correlated columns with 50k
corr_col[:to_keep]

[('detailed occupation recode', 0.39445046744006806),
 ('major occupation code', 0.35226036676879025),
 ('detailed industry recode', 0.29094785527741307),
 ('education', 0.2777817738202723),
 ('major industry code', 0.2763712710929725),
 ('weeks worked in year', 0.2654797785028283),
 ('capital gains', 0.2407248197157397),
 ('num persons worked for employer_0', 0.22287854954836495),
 ('age_bet_35_60', 0.2214058002993831),
 ('class of worker_Unknown', 0.2209135202242311),
 ('detailed household summary in household_ Householder', 0.21277756385233876),
 ('tax filer stat_ Joint both under 65', 0.2051930915097182),
 ('detailed household and family stat', 0.19897545924268126),
 ('tax filer stat_ Nonfiler', 0.19832556074033597),
 ('marital stat_ Married-civilian spouse present', 0.18394926568754547),
 ('marital stat_ Never married', 0.17816741183700655),
 ('dividends from stocks', 0.17577945855195706),
 ('num persons worked for employer_6', 0.172688869323927),
 ('age_inf_20', 0.170120210509332

## 3 - Prediction

In [13]:
def display_scores(features, target, features_test, target_test, model):
    print('Train')
    print('-'*10)
    print('Score : ', model.score(features, target))
    print('Confusion matrix : ', '\n',confusion_matrix(model.predict(features), target))
    print('True positive : ', precision_score(model.predict(features), target))

    print('\n')
    print('Dev')
    print('-'*15)
    print('Score : ', accuracy_score(model.predict(features_test), target_test))
    print('Confusion matrix : ', '\n', confusion_matrix(model.predict(features_test), target_test))
    print('True positive : ', precision_score(model.predict(features_test), target_test))
    return

### a - Logistic Regression

In [14]:
# Grid Search to find best parameters
# Parameters
to_keep_ = [40,50,60,100]
C_=[0.5,1,1.5]

scores = []

for to_keep in to_keep_:
    for C in C_:
        # Features
        corr_col_to_keep = [l[0] for l in corr_col[:to_keep]]

        features = df[corr_col_to_keep].values
        target = df['50k'].values

        features_test = test[corr_col_to_keep].values
        target_test = test['50k'].values

        # Model
        model = LogisticRegression(C=C)
        model = model.fit(features, target)

        # Scores
        scores.append((to_keep, C, 
                       accuracy_score(model.predict(features), target),
                       precision_score(model.predict(features), target),
                       accuracy_score(model.predict(features_test),target_test),
                       precision_score(model.predict(features_test), target_test)))

In [15]:
# Select best parameters in function of True Positive rate (because of the distribution of 50k column)
best_params = sorted(scores, key=lambda tup: tup[-1], reverse=True)[0]
print(best_params)

(50, 1.5, 0.9522611428256391, 0.3811177515748667, 0.9529379924219643, 0.38441642418364047)


In [16]:
# Parameters
to_keep = best_params[0]
C = best_params[1]

# Features
corr_col_to_keep = [l[0] for l in corr_col[:to_keep]]

target = df['50k'].values
features = df[corr_col_to_keep].values

features_test = test[corr_col_to_keep].values
target_test = test['50k'].values

# Model
model = LogisticRegression(C=C)
model = model.fit(features, target)

# Scores
display_scores(features, target, features_test, target_test, model)

Train
----------
Score :  0.9522611428256391
Confusion matrix :  
 [[185279   7663]
 [  1862   4719]]
True positive :  0.3811177515748667


Dev
---------------
Score :  0.9529379924219643
Confusion matrix :  
 [[92689  3808]
 [  887  2378]]
True positive :  0.38441642418364047


### b - Random Forest Classifier

In [17]:
# Grid Search to find best parameters
# Parameters
to_keep_ = [40,50,60,70]
n_estimators_ = [14,50,100]
max_depth_ = [10,25,50]

scores = []

for to_keep in to_keep_:
    for n_estimators in n_estimators_:
        for max_depth in max_depth_:
            # Features
            corr_col_to_keep = [l[0] for l in corr_col[:to_keep]]

            features = df[corr_col_to_keep].values
            target = df['50k'].values

            features_test = test[corr_col_to_keep].values
            target_test = test['50k'].values

            # Model
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
            model = model.fit(features, target)

            # Scores
            scores.append((to_keep, n_estimators, max_depth, 
                           accuracy_score(model.predict(features), target),
                           precision_score(model.predict(features), target),
                           accuracy_score(model.predict(features_test),target_test),
                           precision_score(model.predict(features_test), target_test)))

In [18]:
# Select best parameters in function of True Positive rate (because of the distribution of 50k column)
best_params = sorted(scores, key=lambda tup: tup[-1], reverse=True)[0]
print(best_params)

(40, 100, 25, 0.9928078467144139, 0.8888709416895494, 0.9561356027345081, 0.44633042353701907)


In [19]:
# Parameters
to_keep = best_params[0]
n_estimators = best_params[1]
max_depth = best_params[2]

# Features
corr_col_to_keep = [l[0] for l in corr_col[:to_keep]]

target = df['50k'].values
features = df[corr_col_to_keep].values

features_test = test[corr_col_to_keep].values
target_test = test['50k'].values

# Model
model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
model = model.fit(features, target)

# Scores
display_scores(features, target, features_test, target_test, model)

Train
----------
Score :  0.9923617828521023
Confusion matrix :  
 [[187078   1461]
 [    63  10921]]
True positive :  0.8820061379421741


Dev
---------------
Score :  0.9557847677472384
Confusion matrix :  
 [[92627  3462]
 [  949  2724]]
True positive :  0.44034917555771097
