In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
df = pd.read_csv("data/adultcensus_data/adult.data", sep=",", header=None)

cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'target']

df.columns = cols

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# df.info()
df[df['age'] == 67].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
346,67,?,36135,11th,7,Married-civ-spouse,?,Husband,White,Male,0,0,8,United-States,<=50K
474,67,Private,49401,Assoc-voc,11,Divorced,Other-service,Not-in-family,White,Female,0,0,24,United-States,<=50K
534,67,Private,231559,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,20051,0,48,United-States,>50K
917,67,Self-emp-inc,76860,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  target          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Summary of the dataset shows that there are no missing values. But the preview shows that the dataset contains values coded as `?`. So, I will encode `?` as NaN values.

In [6]:

df.replace(r"[?]", np.nan, regex=True, inplace=True)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  31978 non-null  object
 14  target          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Now, the summary shows that the variables - workclass, occupation and native.country contain missing values. All of these variables are categorical data type. So, I will impute the missing values with the most frequent value- the mode.

In [8]:
# Impute missing values with mode
for col in ["workclass", "occupation", "native-country"]:
    df[col].fillna(df[col].mode()[0], inplace=True)

df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
target            0
dtype: int64

In [9]:
# Setting feature vector and target vector

X = df.drop(['target'], axis=1)
y = df['target']

X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [10]:
# Splitting data 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Feature engineering

In [11]:
# Encoded categorical variabels

categorical =  ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for feat in categorical:
    le = LabelEncoder()
    X_train[feat] = le.fit_transform(X_train[feat])
    X_test[feat] = le.transform(X_test[feat])

# Feature Scalling

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X.columns)


In [12]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0.470399,-0.082194,-0.174981,0.179166,-0.414963,-0.409687,-1.036632,-0.898445,0.396164,0.701715,0.826332,-0.216969,1.589744,0.25886
1,0.617346,2.631509,-0.763517,0.179166,-0.414963,-1.738219,-1.539627,1.591127,0.396164,-1.425079,0.098582,-0.216969,-0.200583,0.25886
2,0.69082,-0.082194,-0.017034,1.216323,-0.025832,-0.409687,1.226844,-0.898445,0.396164,0.701715,-0.145156,-0.216969,0.775959,0.25886
3,-0.705178,-0.082194,-0.415016,0.179166,-0.414963,0.918845,-1.036632,-0.276052,0.396164,0.701715,-0.145156,-0.216969,-0.037826,-4.026487
4,-1.146019,-0.082194,0.130127,-1.37657,-2.36062,0.918845,-1.036632,-0.276052,0.396164,0.701715,-0.145156,-0.216969,-1.258503,-4.35613


### Logistics Regression model with and without PCA

In [13]:
##  Logistic Regression model with all features

from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import accuracy_score

lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)

print(f"Logistic Regression acc score with all the features {accuracy_score(y_test, y_pred)}")

Logistic Regression acc score with all the features 0.8231139318251612


In [14]:
## Logistic Regression with PCA

from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)

cumsum = np.cumsum(pca.explained_variance_ratio_)
dim = np.argmax(cumsum >= 0.90) + 1
print('The number of dimensions required to preserve 90% of variance is',dim)

pca.explained_variance_ratio_


The number of dimensions required to preserve 90% of variance is 12


array([0.14797863, 0.10226579, 0.08030471, 0.07857565, 0.07410548,
       0.0733822 , 0.07023229, 0.0680419 , 0.06519202, 0.06103126,
       0.0605075 , 0.0485894 , 0.04249813, 0.02729505])

In [15]:
# drop last 2 features as it carries little information

X = df.drop(['target', 'native-country', 'hours-per-week'], axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']

for feature in categorical:
        le = LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_test[feature] = le.transform(X_test[feature])


X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)

X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)

lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)

print(f"Logistic Regression acc score with all the features {accuracy_score(y_test, y_pred)}")


Logistic Regression acc score with all the features 0.8246494011669567


### Save npy file for X, y

In [29]:
categorical =  ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
for feat in categorical:
    le = LabelEncoder()
    X[feat] = le.fit_transform(X[feat])

le = LabelEncoder()
y = le.fit_transform(y)

# Feature Scalling

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

X_save = X.to_numpy()
y_save = y
sensitive_feat = X.sex.to_numpy() # Choose gender as sensitive feature

np.save("data/adultcensus_data/XS_adult.npy", X_save)
np.save("data/adultcensus_data/y_adult.npy", y_save)
np.save("data/adultcensus_data/s_adult.npy", sensitive_feat)