In [37]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [38]:
# Download version 1 of the OpenML dataset named 'adult' as a
# (features, target) pair. as_frame=True is requested explicitly because the
# cells below call pandas methods (describe / info / isnull / dropna) on both
# objects, so X must be a DataFrame and y a Series regardless of the
# library's default return type.
X, y = fetch_openml('adult', version=1, return_X_y=True, as_frame=True)

In [39]:
# Summary statistics of the feature frame; with default arguments only the
# numeric columns (here fnlwgt and education-num) are summarised.
X.describe()

Unnamed: 0,fnlwgt,education-num
count,48842.0,48842.0
mean,189664.1,10.078089
std,105604.0,2.570973
min,12285.0,1.0
25%,117550.5,9.0
50%,178144.5,10.0
75%,237642.0,12.0
max,1490400.0,16.0


In [40]:
# Summary of the target: row count, number of distinct classes, and the most
# frequent class with its frequency.
y.describe()

count     48842
unique        2
top       <=50K
freq      37155
Name: class, dtype: object

In [41]:
# Per-column dtype and non-null count of the raw features (before any rows
# are dropped).
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  category
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  float64 
 3   education       48842 non-null  category
 4   education-num   48842 non-null  float64 
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capitalgain     48842 non-null  category
 11  capitalloss     48842 non-null  category
 12  hoursperweek    48842 non-null  category
 13  native-country  47985 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


In [42]:
# Dtype and non-null count of the target series.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 48842 entries, 0 to 48841
Series name: class
Non-Null Count  Dtype   
--------------  -----   
48842 non-null  category
dtypes: category(1)
memory usage: 47.9 KB


In [43]:
# Count missing values per feature column to see which columns are affected
# before deciding how to handle them.
X.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capitalgain          0
capitalloss          0
hoursperweek         0
native-country     857
dtype: int64

In [44]:
# Number of missing values in the target series.
y.isnull().sum()

0

In [45]:
# Drop every row that has at least one missing feature value.
X = X.dropna()
# Keep the target aligned with the features: dropna() removes rows from X
# only, so select the same surviving row labels from y. Without this, y keeps
# all of its original rows and no longer lines up row-for-row with X.
# Re-running this cell is safe because X.index is already a subset of y.index.
y = y.loc[X.index]

In [46]:
# Re-inspect the features after dropping rows: every column should now report
# the same non-null count.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             45222 non-null  category
 1   workclass       45222 non-null  category
 2   fnlwgt          45222 non-null  float64 
 3   education       45222 non-null  category
 4   education-num   45222 non-null  float64 
 5   marital-status  45222 non-null  category
 6   occupation      45222 non-null  category
 7   relationship    45222 non-null  category
 8   race            45222 non-null  category
 9   sex             45222 non-null  category
 10  capitalgain     45222 non-null  category
 11  capitalloss     45222 non-null  category
 12  hoursperweek    45222 non-null  category
 13  native-country  45222 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.6 MB


In [47]:
# Names of the columns stored with pandas' 'category' dtype; these are the
# features that get encoded further down.
categorical_frame = X.select_dtypes(include='category')
cat_ix = categorical_frame.columns
print(cat_ix)

Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capitalgain', 'capitalloss',
       'hoursperweek', 'native-country'],
      dtype='object')


In [48]:
# Names of the numeric (int64 / float64) columns, which are scaled later.
# NOTE(review): this must run before the label-encoding cell below. That cell
# overwrites the categorical columns with integer codes, so re-running this
# selection afterwards would presumably pick those columns up as well
# (depends on the integer dtype the encoder returns) - confirm.
num_ix = X.select_dtypes(include=['int64', 'float64']).columns
print(num_ix)

Index(['fnlwgt', 'education-num'], dtype='object')


In [49]:
encoder = LabelEncoder()
# Replace each categorical column with integer codes, printing the number of
# distinct values per column and accumulating the overall total in `tot`.
tot = 0
for col in cat_ix:
    n_levels = X[col].unique().size  # distinct values present in this column
    print(col, n_levels)
    tot += n_levels
    # The single encoder is refit on every column; only the codes are kept.
    X[col] = encoder.fit_transform(X[col])
print()
print(tot)
X

age 5
workclass 7
education 16
marital-status 7
occupation 14
relationship 6
race 5
sex 2
capitalgain 5
capitalloss 5
hoursperweek 5
native-country 41

118


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,5,77516.0,9,13.0,4,0,1,4,1,1,0,2,38
1,3,4,83311.0,9,13.0,2,3,0,4,1,0,0,0,38
2,2,2,215646.0,11,9.0,0,5,1,4,1,0,0,2,38
3,3,2,234721.0,1,7.0,2,5,0,2,1,0,0,2,38
4,1,2,338409.0,9,13.0,2,9,5,2,0,0,0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,1,2,245211.0,9,13.0,4,9,3,4,1,0,0,2,38
48837,2,2,215419.0,9,13.0,0,9,1,4,0,0,0,2,38
48839,2,2,374983.0,9,13.0,2,9,0,4,1,0,0,3,38
48840,2,2,83891.0,9,13.0,0,0,3,1,1,2,0,2,38


In [52]:
# One-hot encode the categorical columns into a dense (non-sparse) array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet (unknown keyword -> TypeError).
try:
    hotenc = OneHotEncoder(sparse_output=False)
except TypeError:
    hotenc = OneHotEncoder(sparse=False)
print(len(cat_ix))
cat_features = hotenc.fit_transform(X[cat_ix])
cat_features.shape

12


(45222, 118)

In [56]:
# Standardise the numeric columns with a StandardScaler fitted on this data.
scaler = StandardScaler()
num_features = scaler.fit_transform(X[num_ix])
num_features.shape

(45222, 2)

In [57]:
# Encode the class labels as integers, but only for the rows still present in
# X. Rows with missing features were dropped from X earlier, so y is first
# restricted to X's index; otherwise y_new would have more rows than the
# feature matrix and the labels would not line up with it.
# (The shared LabelEncoder is simply refit on the target here.)
y_new = encoder.fit_transform(y.loc[X.index])
y_new

array([0, 0, 0, ..., 0, 0, 1])

In [59]:
# Final design matrix: scaled numeric columns first, then the one-hot
# columns, joined side by side (column-wise).
X_new = np.hstack((num_features, cat_features))
X_new.shape

(45222, 120)