### 对标签进行编码

In [1]:
# 为了让数据适应算法和库，我们必须对蚊子数据进行编码，将数据型的转换成数值型
# preprocessing.LabelEncoder:标签专用，能够将分类转换成分类数值
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.impute import SimpleImputer
data = pd.read_csv(r"Narrativedata.csv",index_col = 0)
data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())
data.dropna(axis =0,inplace = True)

# 要输入的标签，不是特征矩阵，所以允许一维
y = data.iloc[:,-1]

# 实例化
le = LabelEncoder()
# 导入数据
le = le.fit(y)
# trainsform接口调取数据
label = le.transform(y)

# 属性.classes_查看标签中究竟有多少类别
le.classes_

array(['No', 'Unknown', 'Yes'], dtype=object)

In [2]:
data.iloc[:,-1] = label
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [3]:
# 有如下的方式进行简写
from sklearn.preprocessing import LabelEncoder
data.iloc[:,-1] = LabelEncoder().fit_transform(data.iloc[:,-1])

### 对特征进行编码
> 将分类特征转换成分类数值

In [4]:
from sklearn.preprocessing import OrdinalEncoder
# 接口categoreies_对应LabelEncoder的接口classes_，一模一样的功能
data_= data.copy()
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [5]:
OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [6]:
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])

data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0


### 独热编码

In [14]:
from sklearn.preprocessing import OneHotEncoder

x= data.iloc[:,1:-1]
enc = OneHotEncoder(categories = 'auto').fit(x)
result = enc.transform(x).toarray()

result

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [15]:
enc.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [16]:
newdata = pd.concat([data,pd.DataFrame(result)],axis =1)
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2.0,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2.0,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2.0,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0


In [18]:
newdata.drop(["Sex","Embarked"],axis =1,inplace = True)
newdata.columns = ["Age","Survived","Female","Male",
                   "Embarked_C","Embarked_Q","Embarked_S"]
newdata.head()

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,2.0,1.0,0.0,1.0,0.0,0.0
2,26.0,2.0,1.0,0.0,0.0,0.0,1.0
3,35.0,2.0,1.0,0.0,0.0,0.0,1.0
4,35.0,0.0,0.0,1.0,0.0,0.0,1.0
