In [1]:
from ucimlrepo import fetch_ucirepo 
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, Binarizer,KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

import pandas as pd

In [2]:
adult = fetch_ucirepo(id=2) 

In [3]:
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
df = X
df[">50k"] = np.array(y==">50K", dtype=int)

display(df.shape)
df = df.dropna()
df = df[df.workclass != "?"]
df = df[df.occupation != "?"]
display(df.shape)

#display(df.workclass.value_counts())
#display(df.occupation.value_counts())
#display(df.relationship.value_counts())
#display(df["marital-status"].value_counts())
#display(df.fnlwgt.value_counts())
#display(df["education"].value_counts())
#display(df["capital-loss"].describe())
#display(df["capital-gain"].describe())
#display(df["hours-per-week"].describe())

df = df.rename(columns=dict(fnlwgt="weight"))
df.columns

(48842, 15)

(45778, 15)

Index(['age', 'workclass', 'weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       '>50k'],
      dtype='object')

In [4]:
column_trans = ColumnTransformer(
    [
        ('label', "passthrough", [">50k"]),
        ('categories', OneHotEncoder(dtype=int,sparse_output=False, drop="first", min_frequency=1500), ['workclass', "marital-status", "sex", "occupation"]),
        ('', KBinsDiscretizer(n_bins=4) , ['education-num', "age"]),
        ('any', Binarizer(), ["capital-loss", "capital-gain"]),
        ('>41', Binarizer(threshold=41), ["hours-per-week"])        
    ],
    verbose_feature_names_out=True,)

column_trans.fit(df)

cols = column_trans.get_feature_names_out()
df_out = pd.DataFrame(column_trans.transform(df), columns=cols, dtype=int)
len(cols), cols, df_out.shape

(30,
 array(['label__>50k', 'categories__workclass_Private',
        'categories__workclass_Self-emp-inc',
        'categories__workclass_Self-emp-not-inc',
        'categories__workclass_State-gov',
        'categories__workclass_infrequent_sklearn',
        'categories__marital-status_Married-civ-spouse',
        'categories__marital-status_Never-married',
        'categories__marital-status_infrequent_sklearn',
        'categories__sex_Male', 'categories__occupation_Craft-repair',
        'categories__occupation_Exec-managerial',
        'categories__occupation_Handlers-cleaners',
        'categories__occupation_Machine-op-inspct',
        'categories__occupation_Other-service',
        'categories__occupation_Prof-specialty',
        'categories__occupation_Sales',
        'categories__occupation_Transport-moving',
        'categories__occupation_infrequent_sklearn', '__education-num_0.0',
        '__education-num_1.0', '__education-num_2.0',
        '__education-num_3.0', '__age_0

In [7]:
df_out.to_csv("../adult.csv", index=False)

X_ = df_out.loc[:, ~df_out.columns.isin(['label__>50k'])]
print(cross_val_score(LogisticRegression(), X_,df_out["label__>50k"]).mean())
df_out.describe()

0.8410575117765754


Unnamed: 0,label__>50k,categories__workclass_Private,categories__workclass_Self-emp-inc,categories__workclass_Self-emp-not-inc,categories__workclass_State-gov,categories__workclass_infrequent_sklearn,categories__marital-status_Married-civ-spouse,categories__marital-status_Never-married,categories__marital-status_infrequent_sklearn,categories__sex_Male,...,__education-num_1.0,__education-num_2.0,__education-num_3.0,__age_0.0,__age_1.0,__age_2.0,__age_3.0,any__capital-loss,any__capital-gain,>41__hours-per-week
count,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,...,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0,45778.0
mean,0.167111,0.736533,0.036874,0.083839,0.042925,0.031544,0.465922,0.32295,0.072611,0.675761,...,0.325724,0.294639,0.254227,0.23579,0.246035,0.249508,0.268666,0.04749,0.083905,0.303596
std,0.373079,0.440518,0.188453,0.27715,0.202689,0.174783,0.498843,0.467609,0.2595,0.468095,...,0.46865,0.455885,0.435431,0.424496,0.430704,0.432733,0.443271,0.212687,0.277248,0.459815
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
