![](http://osloyi5le.bkt.clouddn.com/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%B7%A5%E7%A8%8B%E5%B8%88banner.png)

# 特征工程操作示例

In [None]:
# 导入工具库
import pandas as pd

### 载入示例数据

In [None]:
# Titanic data; 'train.csv' is resolved relative to the current working directory
df_train = pd.read_csv('train.csv')

In [None]:
# Preview the first rows to see which columns are available
df_train.head()

### 了解你的数据

In [None]:
# Column dtypes and non-null counts (reveals which columns contain missing values)
df_train.info()

In [None]:
# Summary statistics of the numeric columns
df_train.describe()

## 基本数据处理

### 0.缺失值填充

#### 可以用pandas的fillna函数

In [None]:
# Look up the documentation of the fillna function
help(pd.DataFrame.fillna)

In [None]:
# Fill missing ages with the mean age. fillna returns a new Series here; the
# result is only displayed and df_train['Age'] itself is left unchanged.
df_train['Age'].fillna(value=df_train['Age'].mean())

#### 可以使用sklearn的Imputer

In [None]:
from sklearn.preprocessing import Imputer

In [None]:
help(Imputer)

In [None]:
imp=Imputer(missing_values='NaN',strategy='mean',axis=0)

In [None]:
age = imp.fit_transform(df_train[['Age']].values)

## 常见特征工程操作

## 数值型

### 0.幅度变换

In [None]:
# Log transform (one example of a magnitude transformation).
# np.log is applied to the whole Series at once; NaN ages stay NaN.
import numpy as np
log_age = np.log(df_train['Age'])

In [None]:
# Magnitude scaling / normalization: MinMaxScaler rescales the column to [0, 1] by default
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
# NOTE(review): the result is named age_trans but the input column is 'Fare' — confirm intent
age_trans = mm_scaler.fit_transform(df_train[['Fare']])

In [None]:
# Standardization: StandardScaler centers to zero mean and scales to unit variance
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# Rebinds age_trans, so the min-max result from the previous cell is discarded
age_trans = std_scaler.fit_transform(df_train[['Fare']])

### 1.统计值

In [None]:
# Maximum and minimum of the Age column (pandas skips NaN by default)
max_age = df_train['Age'].max()
min_age = df_train['Age'].min()

In [None]:
# Quantiles: first quartile (25%) and third quartile (75%) of Age
age_quarter_1 = df_train['Age'].quantile(0.25)
age_quarter_3 = df_train['Age'].quantile(0.75)

### 2.四则运算

In [None]:
df_train.loc[:,'family_size'] = df_train['SibSp']+df_train['Parch']+1

In [None]:
df_train.head()

### 3.高次特征与交叉特征

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 polynomial expansion of two columns: powers and the pairwise product
poly = PolynomialFeatures(degree=2)
poly_fea = poly.fit_transform(df_train[['SibSp','Fare']])

In [None]:
# With the default bias column, two inputs expand to 6 output columns
# (1, a, b, a^2, a*b, b^2)
poly_fea.shape

### 4.离散化

In [None]:
# pd.cut with an integer: split the Fare range into 5 equal-width bins
df_train.loc[:,'fare_cut'] = pd.cut(df_train['Fare'],5)

In [None]:
# pd.qcut with an integer: 5 quantile-based bins (roughly equal number of rows per bin)
df_train.loc[:,'fare_qcut'] = pd.qcut(df_train['Fare'],5)

### 5.One-Hot encoding/独热向量编码

In [None]:
df_train.info()

In [None]:
embarked_oht = pd.get_dummies(df_train[['Embarked']])

In [None]:
embarked_oht.head()

In [None]:
fare_qcut_oht = pd.get_dummies(df_train[['fare_qcut']])

In [None]:
fare_qcut_oht.head()

## 时间型

### 6.日期处理

In [None]:
car_sales = pd.read_csv('car_data.csv')

In [None]:
car_sales.head()

In [None]:
car_sales.loc[:,'date'] = pd.to_datetime(car_sales['date_t'], format="")

In [None]:
car_sales.head()

In [None]:
car_sales.info()

### 7.取出关键时间信息

In [None]:
# Extract the month (1-12)
car_sales.loc[:,'month'] = car_sales['date'].dt.month

In [None]:
car_sales.head()

In [None]:
# Extract the day of the month
car_sales.loc[:,'dom'] = car_sales['date'].dt.day

In [None]:
# Extract the day of the year
car_sales.loc[:,'doy'] = car_sales['date'].dt.dayofyear

In [None]:
# Extract the day of the week (pandas convention: Monday=0 ... Sunday=6)
car_sales.loc[:,'dow'] = car_sales['date'].dt.dayofweek

In [None]:
car_sales.head()

## 文本型

### 8.词袋模型

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()

In [None]:
corpus = [
        'This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?'
        ]

In [None]:
X = vectorizer.fit_transform(corpus)

In [None]:
vectorizer.get_feature_names()

In [None]:
X.toarray()

### 9.TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
tfidf_X = tfidf_vectorizer.fit_transform(corpus)

In [None]:
tfidf_vectorizer.get_feature_names()

In [None]:
tfidf_X.toarray()

### 10.组合特征

In [None]:
df_train.head()

In [None]:
# 借助于条件判断实现
df_train.loc[:,'alone'] = (df_train['SibSp']==0)&(df_train['Parch']==0)

In [None]:
df_train.head()

## 特征选择

### 过滤式/Filter

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest

In [None]:
# Iris dataset as feature matrix X and target vector y
# (note: X is rebound here; it previously held the bag-of-words matrix)
iris = load_iris()
X, y = iris.data, iris.target
X.shape

In [None]:
# Filter method: keep the 2 features with the highest univariate score
# (SelectKBest's default score function is used since none is passed)
X_new = SelectKBest(k=2).fit_transform(X, y)
X_new.shape

### 包裹式/Wrapper

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Wrapper method: recursive feature elimination around a random forest
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): no random_state is set, so the selected features may vary between runs
rf = RandomForestClassifier()
# Repeatedly fit the estimator and drop features until 2 remain
rfe = RFE(estimator=rf, n_features_to_select=2)

In [None]:
X_rfe = rfe.fit_transform(X,y)

In [None]:
# Same number of rows as X, 2 selected columns
X_rfe.shape

### 嵌入式/Embedded

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

In [None]:
# Embedded method: an L1-penalized linear SVM; the L1 penalty tends to drive
# some coefficients to zero. dual=False is passed together with penalty="l1".
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)

In [None]:
# prefit=True: reuse the already fitted model instead of fitting again
model = SelectFromModel(lsvc, prefit=True)

In [None]:
# Keep only the features the fitted model considers important
X_embed = model.transform(X)

In [None]:
# Number of columns shows how many features were kept
X_embed.shape