## Census Income

Predict whether a person's income exceeds $50K/yr based on census data. Also known as the "Adult" dataset.

[data set](https://archive.ics.uci.edu/ml/datasets/Census+Income)

### Data preview

In [3]:
import pandas as pd
import numpy as np
import re


# Build the column list from adult.names: every line that starts with a
# lowercase/hyphenated name followed by ':' (and a later '.') contributes
# that name as a column.
# NOTE(review): this file is read from '../ds/' while the CSVs below are read
# from './ds/' — confirm both locations are intended.
with open('../ds/adult.names') as fp:
    cols = []
    for line in fp:
        col_match = re.match(r'(?P<colname>[a-z\-]+):.*\.', line)
        if col_match:
            cols.append(col_match.group('colname'))
# The target column is appended by hand as the last column name.
cols.append('label')

# Shared read_csv options: no header row in the files, use the parsed names,
# and drop the blanks that follow each delimiter.
options = dict(header=None, names=cols, skipinitialspace=True)

train_df = pd.read_csv('./ds/adult.data', **options)

# The first row of adult.test is skipped, and any trailing '.' characters are
# stripped from its labels (presumably so they match the training labels).
test_df = (
    pd.read_csv('./ds/adult.test', skiprows=1, **options)
    .assign(label=lambda frame: frame['label'].str.rstrip('.'))
)

train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Preview the first rows of the test frame (read with the same column names as train_df).
test_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# (rows, columns) of the training frame.
train_df.shape

(32561, 15)

In [6]:
# (rows, columns) of the test frame, before it is split into test/validation parts.
test_df.shape

(16281, 15)

In [7]:
# Per-column dtype and non-null count for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  label           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
# Fraction of test_df rows that fall before the split point.
TEST_VAL_SPLIT = 0.5
# Row position at which test_df is cut in two (int() truncates).
# NOTE(review): later cells use iloc[:VAL_SET_SIZE] for the *test* part and
# iloc[VAL_SET_SIZE:] for the validation part, so despite its name this value
# is the length of the test part.
VAL_SET_SIZE = int(TEST_VAL_SPLIT * len(test_df))
VAL_SET_SIZE

8140

#### Labels

In [9]:
from sklearn.preprocessing import LabelEncoder

# Raw string labels of the training frame.
y_train_str = train_df["label"]

# Fit the encoder on the training labels, then map those same labels to
# integer codes with the fitted encoder.
le = LabelEncoder()
y_train = le.fit(y_train_str).transform(y_train_str)

y_train.shape

(32561,)

In [10]:
# Cut the test frame's labels at VAL_SET_SIZE: rows before it are the test
# labels, rows from it onward are the validation labels.
y_test_str = test_df["label"].iloc[:VAL_SET_SIZE]
y_val_str = test_df["label"].iloc[VAL_SET_SIZE:]

# Encode both parts with the already-fitted encoder `le`.
y_test, y_val = le.transform(y_test_str), le.transform(y_val_str)

y_test.shape, y_val.shape

((8140,), (8141,))

#### Features

In [11]:
# Feature frames: every column except 'label'. The test frame is cut at
# VAL_SET_SIZE into a test part (rows before) and a validation part (rows after).
X_train = train_df.drop(columns="label")
X_test = test_df.iloc[:VAL_SET_SIZE].drop(columns='label')
X_val = test_df.iloc[VAL_SET_SIZE:].drop(columns='label')

X_val.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
8140,18,Private,216540,11th,7,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States
8141,28,Private,159623,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
8142,45,Federal-gov,87207,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,37,United-States
8143,57,Private,47621,9th,5,Married-civ-spouse,Other-service,Wife,White,Female,0,0,38,United-States
8144,35,Private,190297,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,1977,65,United-States


In [12]:
# Columns that will be one-hot encoded.
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
# Columns that will be standardised.
continuous_features = [
    'age',
    'fnlwgt',
    'education-num',  # according to dataset description this is continuous
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
# Sanity check: the two lists together should have as many entries as X_train
# has columns. Only the counts are compared, not the names themselves.
print(
    "is features and X_train columns length equal: ---",
    len(categorical_features) + len(continuous_features) == len(X_train.columns),
)

is features and X_train columns length equal: --- True


In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Per-column preprocessing: one-hot encode the categorical columns and
# standardise the continuous ones. handle_unknown='ignore' keeps transform()
# from raising on categories that were not present when fitting.
ohe = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

ct = ColumnTransformer(
    transformers=[
        ('ohe_ct', ohe, categorical_features),
        ('scaler_ct', scaler, continuous_features),
    ],
    remainder='passthrough',
)

# Fit on the training features only; the fitted transformer is then applied
# to all three splits.
ct.fit(X_train)

# NOTE(review): the three names are rebound to the transformed outputs, so
# re-running this cell without first re-running the feature cell would feed
# already-transformed data back into ct.fit.
X_train, X_test, X_val = [
    ct.transform(split) for split in (X_train, X_test, X_val)
]

In [14]:
# Densify the transformed training row at position 1 to eyeball the encoded values.
X_train[1].todense()

matrix([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.

In [15]:
# (rows, encoded feature columns) of the transformed training matrix.
X_train.shape

(32561, 108)

In [16]:
# Shapes of the transformed test and validation matrices; the column count should match X_train's.
X_test.shape, X_val.shape

((8140, 108), (8141, 108))