In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 特征工程(Feature Engineering)

## 分类变量(Categorical Variables)

美国成年人收入数据集，该数据集来自1994年美国人口普查数据库。

任务是，利用该数据集预测工人的收入是高于50000美元还是低于50000美元。

In [3]:
# Column names for adult.data, listed in file order. They are supplied
# explicitly because the file is read with header=None.
all_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Subset of columns used in this walkthrough, in the order they should appear.
kept_columns = [
    'age', 'workclass', 'education',
    'gender', 'hours-per-week',
    'occupation', 'income',
]

# Read the full table, then immediately narrow it to the columns of interest.
data = pd.read_csv(
    "datasets/adult.data",
    header=None,
    index_col=False,
    names=all_columns,
)[kept_columns]

# Preview the first rows.
data.head()

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


#### One-Hot编码 (哑变量dummy variables)

In [5]:
# Frequency of each workclass category, most common first.
data['workclass'].value_counts()

workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

分类变量(categorical variable)含有多个类别，One-Hot编码为每个类别创建一个取值为$0$或$1$的哑变量(虚拟变量, dummy variable)。

| workclass      | Government Employee | Private Employee | Self Employed | Self Employed Incorporated |
| -------------- | ------------------- | -----------------| ------------- | -------------------------- |
| Government Employee       | 1 | 0 | 0 | 0  |
| Private Employee          | 0 | 1 | 0 | 0  |
| Self Employed             | 0 | 0 | 1 | 0  |
| Self Employed Incorporated| 0 | 0 | 0 | 1  |

在统计学中，为了避免多重共线性(multicollinearity)问题，通常将含有$k$个类别的分类变量转换为$k-1$个哑变量；被省略的类别作为基准(下表中为 Government Employee，其所有哑变量均为$0$)。

| workclass      | Private Employee | Self Employed | Self Employed Incorporated |
| -------------- | -----------------| ------------- | -------------------------- |
| Government Employee       | 0 | 0 | 0  |
| Private Employee          | 1 | 0 | 0  |
| Self Employed             | 0 | 1 | 0  |
| Self Employed Incorporated| 0 | 0 | 1  |

In [5]:
# Summary statistics; with the default arguments only the numeric columns
# (age, hours-per-week) are summarised.
data.describe()

Unnamed: 0,age,hours-per-week
count,32561.0,32561.0
mean,38.581647,40.437456
std,13.640433,12.347429
min,17.0,1.0
25%,28.0,40.0
50%,37.0,40.0
75%,48.0,45.0
max,90.0,99.0


In [6]:
# Frequency of each gender category.
data['gender'].value_counts()

gender
Male      21790
Female    10771
Name: count, dtype: int64

In [7]:
# Frequency of each workclass category. The '?' entry presumably marks
# records whose workclass is unknown -- TODO confirm against the dataset docs.
data['workclass'].value_counts()

workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

In [8]:
# Frequency of each education level.
data['education'].value_counts()

education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

In [9]:
# Frequency of each occupation category. The '?' entry presumably marks
# records whose occupation is unknown -- TODO confirm against the dataset docs.
data['occupation'].value_counts()

occupation
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64

In [11]:
# Class balance of the prediction target (income bracket).
data['income'].value_counts()

income
<=50K    24720
>50K      7841
Name: count, dtype: int64

In [13]:
# Column dtypes and non-null counts: shows which columns are numeric and
# which are object (string) columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   education       32561 non-null  object
 3   gender          32561 non-null  object
 4   hours-per-week  32561 non-null  int64 
 5   occupation      32561 non-null  object
 6   income          32561 non-null  object
dtypes: int64(2), object(5)
memory usage: 1.7+ MB


In [12]:

# One-hot encode every object (string) column of `data`; the numeric columns
# ('age', 'hours-per-week') pass through unchanged.
#
# dtype=int forces the indicator columns to be 0/1 integers. pandas 2.x
# defaults to bool indicators, and a frame mixing int64 and bool columns
# converts to an object-dtype array via `.values`, which is not what a
# numeric model input should be.
data_dummies = pd.get_dummies(data, dtype=int)

# List the resulting column names (original numeric columns + dummies).
data_dummies.columns

Index(['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov',
       'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th',
       'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
       'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
       'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors',
       'education_ Doctorate', 'education_ HS-grad', 'education_ Masters',
       'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'gender_ Female', 'gender_ Male',
       'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces',
       'occupation_ Craft-repair', 'occupation_ Exec-managerial',
       'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners',
       'occupation_ Machine-op-inspct', 'occupation_ Other-service',
   

In [8]:
# Preview the encoded frame (head() shows the first 5 rows by default).
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [9]:

# Feature columns run from 'age' through the last occupation dummy; the
# income_* dummies that come after them encode the target and are left out.
first_feature, last_feature = 'age', 'occupation_ Transport-moving'
features = data_dummies.loc[:, first_feature:last_feature]

# NumPy arrays for scikit-learn: feature matrix and the ">50K" indicator.
X = features.to_numpy()
y = data_dummies['income_ >50K'].to_numpy()

print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}")

X.shape: (32561, 44)
y.shape: (32561,)


In [11]:
# (n_samples, n_features) of the feature matrix.
X.shape

(32561, 44)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out a test split; random_state pins the shuffle so the scores below
# are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build and fit in one step (fit returns the estimator itself).
# max_iter=2000 presumably gives lbfgs enough iterations to converge on these
# unscaled features -- TODO confirm.
logreg = LogisticRegression(solver="lbfgs", max_iter=2000).fit(X_train, y_train)

# Accuracy on each split, reported to two decimals.
train_score = logreg.score(X_train, y_train)
print(f"Train score: {train_score:.2f}")
test_score = logreg.score(X_test, y_test)
print(f"Test score: {test_score:.2f}")

Train score: 0.81
Test score: 0.81


> * 应在 `train_test_split` 之前对完整数据集调用 `get_dummies()`：如果分别对训练集和测试集编码，两者出现的类别不同时会得到不一致的特征列。

###  将数字转换成字符串，进行分类变量编码

默认情况下，`get_dummies()`只对`object`(字符串)或`category`类型的列进行编码；数值类型的列即使实际表示类别，也会被当作连续变量原样保留。


如下，`Integer Feature`作为整型变量，get_dummies默认不对此进行分类变量转换。

In [14]:

# Toy frame with one integer-coded column and one string column, used to show
# how get_dummies treats each dtype.
integer_values = [0, 1, 2, 1]
categorical_values = ['socks', 'fox', 'socks', 'box']

demo_df = pd.DataFrame({
    'Integer Feature': integer_values,
    'Categorical Feature': categorical_values,
})

demo_df

Unnamed: 0,Integer Feature,Categorical Feature
0,0,socks
1,1,fox
2,2,socks
3,1,box


In [15]:
# Default behaviour: only the string column is expanded into dummies;
# 'Integer Feature' is passed through as a plain number.
pd.get_dummies(demo_df)

Unnamed: 0,Integer Feature,Categorical Feature_box,Categorical Feature_fox,Categorical Feature_socks
0,0,False,False,True
1,1,False,True,False
2,2,False,False,True
3,1,True,False,False


将整型转变为字符串，强制进行转换

In [17]:
# Re-type the integer codes as strings so they are treated as categories.
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)

# Encode both columns explicitly and emit the indicators as 0/1 integers.
encoded_columns = ['Integer Feature', 'Categorical Feature']
pd.get_dummies(demo_df, columns=encoded_columns, dtype=int)

Unnamed: 0,Integer Feature_0,Integer Feature_1,Integer Feature_2,Categorical Feature_box,Categorical Feature_fox,Categorical Feature_socks
0,1,0,0,0,0,1
1,0,1,0,0,1,0
2,0,0,1,0,0,1
3,0,1,0,1,0,0
