In [489]:
import pandas as pd

Read dataset

In [490]:
filename = "adult.csv"
df = pd.read_csv(filename)
data = df.values

# Data Cleaning

Remove duplicate data

In [491]:
print(df.drop_duplicates().values)

[[50 ' Self-emp-not-inc' 83311 ... 13 ' United-States' ' <=50K']
 [38 ' Private' 215646 ... 40 ' United-States' ' <=50K']
 [53 ' Private' 234721 ... 40 ' United-States' ' <=50K']
 ...
 [58 ' Private' 151910 ... 40 ' United-States' ' <=50K']
 [22 ' Private' 201490 ... 20 ' United-States' ' <=50K']
 [52 ' Self-emp-inc' 287927 ... 40 ' United-States' ' >50K']]


Handle observation missing data

In [492]:
df = df.replace(" ?", np.NaN)
print(df.shape) # before delete observation with missing data
df.dropna(subset=[' United-States'], axis=0, inplace=True)
print(df.values.shape) # after delete observation with missing data

(32560, 15)
(31977, 15)


Feature selection (Remove irrelevant data)

In [493]:
print(df.drop(columns=['39']).shape) # drop feature age and retain other features
print(df.shape)

(31977, 14)
(31977, 15)


# Data Transformation

Data trước khi transform

In [494]:
print(data[0])

[50 ' Self-emp-not-inc' 83311 ' Bachelors' 13 ' Married-civ-spouse'
 ' Exec-managerial' ' Husband' ' White' ' Male' 0 0 13 ' United-States'
 ' <=50K']


Type của các feature trước khi transform

In [495]:
names = list(df.columns)
types = df.dtypes
print(types)

39                 int64
 State-gov        object
 77516             int64
 Bachelors        object
 13                int64
 Never-married    object
 Adm-clerical     object
 Not-in-family    object
 White            object
 Male             object
 2174              int64
 0                 int64
 40                int64
 United-States    object
 <=50K            object
dtype: object


Type của các feature sau khi transform

In [496]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in range(len(types)):
    if types[i] == 'object':
        le.fit_transform(df[names[i]])
        df[names[i]] = le.transform(df[names[i]])

print(df.dtypes)

39                int64
 State-gov        int64
 77516            int64
 Bachelors        int64
 13               int64
 Never-married    int64
 Adm-clerical     int64
 Not-in-family    int64
 White            int64
 Male             int64
 2174             int64
 0                int64
 40               int64
 United-States    int64
 <=50K            int64
dtype: object


Data sau khi transform

In [497]:
data = df.values
print(data[0])

[   50     5 83311     9    13     2     3     0     4     1     0     0
    13    38     0]


# Data Normalization

Range value of feature age

In [498]:
print(min(df['39']))
print(max(df['39']))

17
90


Range value of feature fnlwgt

In [499]:
print(min(df[' 77516']))
print(max(df[' 77516']))

12285
1484705


Get features and predictable variable

In [500]:
X = data[:, :-1]
y = data[:, -1]

Data trước khi Normalization

In [501]:
print(X[0])

[   50     5 83311     9    13     2     3     0     4     1     0     0
    13    38]


Data sau khi Normalization

In [502]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print(X_scaled[1])

[0.28767123 0.375      0.13811345 0.73333333 0.53333333 0.
 0.35714286 0.2        1.         1.         0.         0.
 0.39795918 0.95      ]
