In [1]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Flatten, Dense, Conv2D

In [2]:
# Load the adult dataset from the CSV in the sibling data directory.
# NOTE(review): the preview below shows '?' placeholders in workclass and
# occupation; they are read as ordinary strings, not as missing values.
adult_df = pd.read_csv(filepath_or_buffer='../data/adult.csv')

In [4]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
# Inspect column dtypes and non-null counts to see which columns are numeric
# and which are stored as object (string) columns.
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Summary statistics for the numeric columns (count, mean, std, quartiles).
adult_df.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [8]:
# Remove the numeric education code from the features.
# NOTE(review): presumably dropped because it duplicates the `education`
# column (e.g. HS-grad -> 9 in the preview) — confirm.
adult_df = adult_df.drop(columns=['education.num'])

In [10]:
# Separate the target from the features: keep `income` as the label series,
# then leave only predictor columns in the frame.
label = adult_df.loc[:, 'income']
adult_df = adult_df.drop(columns=['income'])

In [27]:
# Partition the feature columns by dtype so each group can get its own
# preprocessing: object columns are treated as categorical, the rest as numeric.
select_categorical = selector(dtype_include=object)
select_numerical = selector(dtype_exclude=object)

categorical_col = select_categorical(adult_df)
numerical_col = select_numerical(adult_df)

In [13]:
# Show which columns were picked up as categorical (object dtype).
categorical_col

['workclass',
 'education',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native.country']

In [28]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. Categories not seen during fit are ignored at transform time
# instead of raising; any column not listed in either group is passed through.
one_hot_step = (
    '1-hot-encode',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_col,
)
scaler_step = ('std_scaler', StandardScaler(), numerical_col)

ct = ColumnTransformer(
    transformers=[one_hot_step, scaler_step],
    remainder='passthrough',
)

In [15]:
# Fit the column transformer on every row, then densify the encoded matrix
# into a DataFrame (the `.toarray()` call assumes the transformer returns a
# sparse matrix).
# NOTE(review): the encoder/scaler are fit before the train/test split, so
# statistics from held-out rows feed the preprocessing — confirm this is meant
# as the naive baseline; the pipeline-based cells further down avoid it.
encoded_features = ct.fit_transform(adult_df)
adult_df_processed = pd.DataFrame(encoded_features.toarray())

In [17]:
# Hold out 20% of the rows for the final evaluation.
# The split is seeded so that the cross-validation and test scores computed
# from it are reproducible on Restart & Run All (an unseeded split draws a
# different partition on every run).
X_train, X_test, y_train, y_test = train_test_split(
    adult_df_processed,
    label,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Logistic regression baseline. `class_weight='balanced'` reweights the
# classes inversely to their frequency; `max_iter` is raised to 500 to give
# the solver more iterations than the library default.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
)

In [23]:
# 5-fold cross-validation on the training portion only; with no `scoring`
# given, the estimator's own `score` method is used for each fold.
cv_score = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
)

In [24]:
# Display the per-fold cross-validation scores.
cv_score

array([0.80806142, 0.80959693, 0.81132438, 0.80245729, 0.8187752 ])

In [18]:
# Build a fresh logistic regression with the same settings used for
# cross-validation and train it on the full training split.
logreg_params = {'max_iter': 500, 'class_weight': 'balanced'}
model = LogisticRegression(**logreg_params)
model.fit(X=X_train, y=y_train)

In [19]:
# Accuracy of the fitted classifier on the held-out test rows.
accuracy_score(y_test, model.predict(X_test))

0.8119146322739137

In [31]:
# Start again from the raw CSV so the split is made on *unprocessed* rows;
# preprocessing is then learned from the training portion only by the
# pipeline built in the following cell.
adult_df = pd.read_csv('../data/adult.csv').drop(columns=['education.num'])
label = adult_df['income']
adult_df = adult_df.drop(columns=['income'])

# Seed the split so the reported test score is reproducible on
# Restart & Run All (an unseeded split changes on every run).
X_train, X_test, y_train, y_test = train_test_split(
    adult_df,
    label,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Chain the column transformer and the classifier into one estimator, so that
# fitting the pipeline fits the preprocessing on the training rows it is given.
classifier = LogisticRegression(class_weight='balanced', max_iter=500)
model = make_pipeline(ct, classifier)

In [35]:
# Fit preprocessing and classifier together on the raw training split.
model.fit(X=X_train, y=y_train)

In [36]:
# Accuracy of the fitted pipeline on the held-out raw test rows.
accuracy_score(y_test, model.predict(X_test))

0.8157531091662829