### Book-Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition by Aurélien Géron

These are my notes on the book ***Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition***.

Author: Yujiang Peng

Date: Feb 12, 2020

##### 1. Data manipulation
* 1.1.载入数据、保存、查看
* 1.2.数据复制、分类、保存、组合、删除

##### 2.Certain functions
* 2.0.某些功能
* 2.1.下载、获取、保存、操作文件
* 2.2.

##### 3.Plot
* 3.1.柱状图
* 3.2.线图及散点图
* 3.

##### 4.Machine learning
* 4.1.数据预处理
  * 4.1.1.拆分数据集
  * 4.1.2.各列数据之间的相关性
  * 4.1.3.数据预处理流程
* 4.2.模型选择与预测
* 4.3.调整模型/调参
* 4.4.

### 1. Data manipulation
* 1.1.载入数据、保存、查看
* 1.2.数据复制、分类、保存、组合、删除

#### 1.0 NumPy

* NumPy Arrays, dtype, and shape
* Common Array Operations
* Reshape and Update In-Place
* Combine Arrays
* Create Sample Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# The r prefix marks a raw string: backslashes are kept literally instead of
# starting escape sequences, which is handy for Windows paths.
print(r'C:\nowhere')

In [None]:
import os
# Join directory names into one path. The trailing "" makes the result end
# with a path separator, so a file name can be appended with "+".
datapath = os.path.join("FolderName", "LowerFolderName", "")

#### 1.1.载入数据、保存、查看

In [None]:
# Load the data: tab-delimited file, "," as thousands separator, Latin-1
# encoding, and the literal "n/a" treated as a missing value.
Data_1 = pd.read_csv(datapath + "FileName.csv",thousands=',',delimiter='\t',
                             encoding='latin1', na_values="n/a")
# Alternatively wrap the loading in a function; adapt it to the file type.
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``FileName.csv`` located in *housing_path* into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "FileName.csv"))

In [None]:
# Save the DataFrame to a CSV file.
Data_1.to_csv(os.path.join("FolderName", "LowerFolderName", "FileName.csv"))

In [None]:
# Show the first rows of the data
Data_1.head(2)
Data_1["ColumnName"].head()
# Column names, non-null counts, data types and memory usage
Data_1.info()
# Number of rows in each category of a column
Data_1["ColumnName"].value_counts()
# Count, mean, standard deviation, min/max and quartiles of each numeric column
Data_1.describe()
# Dimensions of the data as (rows, columns)
Data_1.shape
# Data type of every column (a DataFrame has `dtypes`; only a Series has `dtype`)
Data_1.dtypes

#### 1.2.数据复制、分类、保存、组合、删除

In [None]:
# Make a deep copy of the data (copy() copies deeply by default), so later
# changes to Data_1 do not affect Data_2.
Data_1 = Data_2.copy()

In [None]:
# Bin a numeric column into five categories labelled 1 to 5, using the bin
# edges 0, 1.5, 3, 4.5, 6 and +infinity.
Data_1["NewColumnName"] = pd.cut(Data_1["ColumnName"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [None]:
# Combine two columns element-wise (here: a ratio) and store the result in a
# new column.
Data_1["NewColumnName"] = Data_1["ColumnName_1"]/Data_1["ColumnName_2"]

In [None]:
# Assemble separate columns into one DataFrame and sort its rows by index.
# NOTE(review): assumes Data_1 and Data_2 are pandas Series here, in which
# case they are aligned on their index - confirm against the caller.
DataFrame_1 = pd.DataFrame({
    "ColumnName_1": Data_1,
    "ColumnName_2": Data_2,
}).sort_index()

In [None]:
# Drop a row or a column.
# NOTE: because inplace=True is passed, drop() modifies each DataFrame of the
# tuple directly; without inplace=True it would return a modified copy and
# leave the original untouched.
for set_ in (DataSet_1, DataSet_2):
    set_.drop("ColumnName", axis=1, inplace=True) # axis=1 selects a column

In [None]:
# Select the rows that contain at least one missing value and keep only the
# first few of them for display.
sample_incomplete_rows = Data_1[Data_1.isnull().any(axis=1)].head()

In [None]:
# np.c_ and np.r_
# np.r_ stacks two matrices vertically (one on top of the other); they must
# have the same number of columns.
# np.c_ stacks two matrices side by side; they must have the same number of rows.
np.c_[Data_1, Data_2]

In [None]:
# Reshape the data: one row per "ColumnName" value, one column per indicator
Data_1 = Data_1.pivot(index="ColumnName", columns="IndicatorColumnName",
                      values="Value")
# Rename a column
Data_2.rename(columns={"OriginalColumnName": "NewColumnName"}, inplace=True)
# Turn the index back into a regular column (returns a new DataFrame)
Data_1.reset_index()
# Use a column as the new index. This is applied to Data_2: after the pivot
# above Data_1 is already indexed by "ColumnName", and the merge below joins
# the two frames on their indexes.
Data_2.set_index("ColumnName", inplace=True)
# Merge the two data sets on their indexes
Data_3 = pd.merge(left=Data_1, right=Data_2,
                                  left_index=True, right_index=True)
Data_3.sort_values(by="ColumnName", inplace=True)
# Keep or remove specific rows
remove_indices = [0, 1, 6, 8, 33, 34, 35]
# sorted() gives a deterministic, ascending row order
keep_indices = sorted(set(range(36)) - set(remove_indices))
# .iloc selects rows by integer position
Data_3[["ColumnName_1", 'ColumnName_2']].iloc[keep_indices]
# .loc selects rows by index label
Data_3[["ColumnName_1", 'ColumnName_2']].loc["RowName"]
Data_3.loc["RowName"]["ColumnName"]
# Rows whose label contains the letter "w" (case-insensitive)
Data_3.loc[[c for c in Data_3.index if "W" in c.upper()]]["ColumnName"]
# Select the rows whose labels are the keys of a dictionary
Data_1.loc[list(Dict_1.keys())]

### 2.Certain functions
* 2.0.某些功能
* 2.1.下载、获取、保存、操作文件
* 2.2.

#### 2.0.某些功能

In [None]:
# Seed NumPy's global random generator so that this notebook's output is
# identical at every run.
np.random.seed(42)

#### 2.1.下载、获取、保存、操作文件

In [None]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("FolderName", "LowerFolderName")
# The downloaded file is unpacked with tarfile below, so the URL has to point
# at a (gzipped) tar archive.
HOUSING_URL = DOWNLOAD_ROOT + "FolderName/LowerFolderName/FileName.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download a tar archive and extract it into *housing_path*.

    Args:
        housing_url: URL of the archive to download.
        housing_path: Directory that receives both the downloaded archive
            (saved as ``housing.tgz``) and its extracted content. It is
            created when it does not exist yet.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

### 3.Plot

* 3.1.柱状图
* 3.2.线图及散点图
* 3.

#### 3.1.柱状图

In [None]:
# 分别绘制各列数据的柱状图
%matplotlib inline
# Draw one histogram per column, 50 bins each, on a 20x15-inch figure.
Data_1.hist(bins=50, figsize=(20,15))

In [None]:
# Draw the histogram of a single column.
Data_1["ColumnName"].hist()

#### 3.2.线图及散点图

In [None]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes: axis labels 14, x/y tick labels 12.
for rc_group, font_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=font_size)

In [None]:
# Visualize the data
# x and y name the columns plotted on the horizontal and vertical axes
ax = Data_1.plot(kind='scatter', x="1st_ColumnName", y='2ed_ColumnName',
                 figsize=(5,3), alpha=0.1,
                 # s: marker size, taken from the third column
                 s=Data_1["3rd_ColumnName"]/100, label="ZLabel",
                 # c: marker colour, taken from the fourth column and mapped
                 # through the colour map; a colour bar is drawn on the right
                 c="4th_ColumnName-ScaleBarLabel", cmap=plt.get_cmap("jet"),
                 colorbar=True, sharex=False)
# Insert an image as the background
import matplotlib.image as mpimg
Insert_img=mpimg.imread(os.path.join(images_path, filename))
plt.imshow(Insert_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
Data_4th_Column = Data_1["4th_ColumnName"]
tick_values = np.linspace(Data_4th_Column.min(),Data_4th_Column.max(),Step_ScaleBar)
# Tick positions are expressed as fractions of the column's maximum value
cbar = plt.colorbar(ticks=tick_values/Data_4th_Column.max())
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('ScaleBarLabel', fontsize=16)

# Set the displayed range of the axes: [xmin, xmax, ymin, ymax]
plt.axis([0, 60000, 0, 10])
# Draw a reference curve from an equation
# NOTE: the third argument of np.linspace is the number of points, not a step size
X=np.linspace(X_Start, X_end, X_Step)
plt.plot(X, f(X), "r--", label="LabelText")
# Draw a vertical reference line at x = X_Start, from y = 0 to y = Y_end
plt.plot([X_Start, X_Start], [0, Y_end], "r--")

In [None]:
# Add text to the figure; call plt.text() again for further lines
# In the math string, theta_0 is rendered as theta with subscript 0
plt.text(X_Text_Position, Y_Text_Position, r"$\theta_0 = 0$", fontsize=14, color="r")
# Show the legend
plt.legend(loc="lower right", fontsize=16)
# Set the x-axis label (the column name is used by default)
plt.xlabel("XAxisLabel", fontsize=14)

In [None]:
# Annotate specific data points with an arrow and a text label.
# Maps a row label of Data_1 to the (x, y) position of its annotation text.
position_text = {
    "RowName_1": (X_Data_1, Y_Data_1),
    "RowName_2": (X_Data_2, Y_Data_2),
}
# Data_1 must have exactly two columns (the x and y values of each point)
for RowName_, pos_text in position_text.items():
    pos_data_x, pos_data_y = Data_1.loc[RowName_]
    # Optionally display a different text for one particular row
    Text_ = "NewText_" if RowName_ == "Text_x" else RowName_
    plt.annotate(Text_, xy=(pos_data_x, pos_data_y), xytext=pos_text,
            arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    # Draw the annotated point in red
    plt.plot(pos_data_x, pos_data_y, "ro")

In [None]:
# Set up the location where figures are saved: ./images/<CHAPTER_ID>
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "fundamentals"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory; exist_ok=True avoids an error when it already exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: File name without extension; also printed as progress message.
        tight_layout: When true, ``plt.tight_layout()`` is applied first.
        fig_extension: File extension, also passed as the output format.
        resolution: Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, dpi=resolution, format=fig_extension)
    
# Save the figure first, then display it
save_fig('money_happy_scatterplot')
plt.show()

### 4.Machine learning
* 4.1.数据预处理
  * 4.1.1.拆分数据集
  * 4.1.2.各列数据之间的相关性
  * 4.1.3.数据预处理流程
* 4.2.模型选择与预测
* 4.3.调整模型/调参
* 4.4.

#### 4.1.数据预处理

##### 4.1.1.拆分数据集

In [None]:
# Random split with scikit-learn: 20% of the rows go to the test set, and the
# fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(Data_1, test_size=0.2,random_state=42)

In [None]:
# Stratified split: keep the proportion of each category in both sets
# 1. Bin a numeric column into categories
Data_1["ColumnName"] = pd.cut(Data_1["OldColumnName"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
# 2. Split the data set within each category
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(Data_1, Data_1["ColumnName"]):
    # split() yields integer positions, so rows are selected with .iloc;
    # .loc is only equivalent when Data_1 has the default 0..n-1 index.
    strat_train_set = Data_1.iloc[train_index]
    strat_test_set = Data_1.iloc[test_index]
# 3. Proportion of each category in the test set
strat_test_set["ColumnName"].value_counts() / len(strat_test_set)
# 4. Proportion of each category in the full data set
Data_1["ColumnName"].value_counts() / len(Data_1)

In [None]:
# For illustration only. Sklearn has train_test_split()
# Plain random split of the rows
def split_train_test(data, test_ratio):
    """Randomly split *data* into a training part and a test part.

    Args:
        data: Object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame).
        test_ratio: Fraction of the rows that goes to the test part.

    Returns:
        Tuple ``(train_rows, test_rows)``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    cut = int(n_rows * test_ratio)
    test_part, train_part = order[:cut], order[cut:]
    return data.iloc[train_part], data.iloc[test_part]

In [None]:
from zlib import crc32
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

import hashlib
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

# Custom split method
# Split the data according to an identifier column
def split_train_test_by_id(data, test_ratio, id_column):
    """Split *data* by hashing the values of *id_column*.

    Each identifier is passed to ``test_set_check``; rows for which it returns
    True form the test part, all other rows the training part.

    Returns:
        Tuple ``(train_rows, test_rows)``.
    """
    test_mask = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part
# Usage 1: use the row index as the identifier
housing_with_id = Data_1.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
# Usage 2: build the identifier from the data itself
housing_with_id["id"] = Data_1["ColumnName_1"] * 1000 + Data_1["ColumnName_2"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

##### 4.1.2.各列数据之间的相关性

In [None]:
# Pairwise correlation between the columns:
# the standard correlation coefficient (also called Pearson's r)
corr_matrix = Data_1.corr()
# Correlation of every column with one particular column, strongest first
corr_matrix["ColumnName"].sort_values(ascending=False)

In [None]:
# Another way to check the correlation between attributes:
# plot every selected attribute against every other one
from pandas.plotting import scatter_matrix
attributes = ["1st_ColumnName", "2ed_ColumnName", "3ed_ColumnName"]
scatter_matrix(Data_1[attributes], figsize=(12, 8))
# A single pair of attributes can also be plotted on its own
Data_1.plot(kind="scatter", x="ColumnName_1", y="ColumnName_2", alpha=0.2)

##### 4.1.3.数据预处理流程

In [None]:
# 0. Start again from a clean copy of the training set and separate the
#    predictors from the labels (drop() returns a copy without the label column)
Data_1 = strat_train_set.drop("ColumnName_Label", axis=1)
Data_1_labels = strat_train_set["ColumnName_Label"].copy()
# 1. Data cleaning
#  1.1 Find the rows that contain missing values (only the first few are kept)
sample_incomplete_rows = Data_1[Data_1.isnull().any(axis=1)].head()
#  1.2 Handle missing values in numerical attributes
#    Option 1: drop the samples (rows) with a missing value
sample_incomplete_rows.dropna(subset=["ColumnName_Incomplete"])
#    Option 2: drop the whole attribute (column)
sample_incomplete_rows.drop("ColumnName_Incomplete", axis=1)
#    Option 3: fill the missing values with 0, the mean, the median, ...
median = Data_1["ColumnName_Incomplete"].median()
#    The result is assigned back: an in-place fillna() on a column selection
#    operates on an intermediate object and may leave the DataFrame unchanged.
sample_incomplete_rows = sample_incomplete_rows.fillna(
    {"ColumnName_Incomplete": median})
#    Option 4: use scikit-learn's SimpleImputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
#    Remove the non-numerical attribute (the median only exists for numbers)
Data_1_num = Data_1.drop("ColumnName_Inc_NonNumerical", axis=1)
# alternatively: Data_1_num = Data_1.select_dtypes(include=[np.number])
#    fit the imputer instance
imputer.fit(Data_1_num)
#    imputer computed the median of each attribute and stored the result
#    in its statistics_ instance variable.
#    Inspect the computed values
imputer.statistics_
Data_1_num.median().values # the same medians computed with pandas
#    Replace the missing values with the values learned by the imputer
X = imputer.transform(Data_1_num) # the result is a NumPy array
#    Convert back to a pandas DataFrame
Data_1_tr = pd.DataFrame(X, columns=Data_1_num.columns, index=Data_1_num.index)
#    Look at the formerly incomplete samples after the transformation
Data_1_tr.loc[sample_incomplete_rows.index.values]

#  1.3 Handle categorical attributes
Data_1_cat = Data_1[["ColumnName_Inc_Categorical"]]
#   Convert the categories to numbers
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
Data_1_cat_encoded = ordinal_encoder.fit_transform(Data_1_cat)
#    Inspect the categories that were found
ordinal_encoder.categories_
#   Or, usually better, convert to a one-hot matrix; the result is a SciPy sparse matrix
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
Data_1_cat_1hot = cat_encoder.fit_transform(Data_1_cat)
#    The sparse matrix can be converted to a dense array
Data_1_cat_1hot.toarray()
#   Or ask for a dense result directly when creating the OneHotEncoder
#   NOTE(review): recent scikit-learn releases rename `sparse` to
#   `sparse_output` - confirm against the installed version.
cat_encoder = OneHotEncoder(sparse=False)
Data_1_cat_1hot = cat_encoder.fit_transform(Data_1_cat)

In [None]:
# 2. Custom transformers
#    Purpose: run cleanup operations or combine specific attributes
#    How: write a class that implements fit() (returning self), transform()
#    and fit_transform()
from sklearn.base import BaseEstimator, TransformerMixin
# With TransformerMixin as a base class, fit_transform() does not have to be written
# With BaseEstimator as a base class (and no *args/**kwargs in the constructor),
# two extra methods come for free: get_params() and set_params()
# Positions of the columns that are combined below
ColumnIndex_1, ColumnIndex_2, ColumnIndex_3, ColumnIndex_4 = 3, 4, 5, 6
# The positions can also be looked up dynamically from the column names
col_names = "ColumnName_1", "ColumnName_2", "ColumnName_3", "ColumnName_4"
ColumnIndex_1, ColumnIndex_2, ColumnIndex_3, ColumnIndex_4 = [
    Data_1.columns.get_loc(c) for c in col_names] # get the column indices


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D NumPy array.

    ``transform`` always appends column 3 divided by column 4 (positions given
    by ``ColumnIndex_3`` / ``ColumnIndex_4``). When ``add_ColumnName_ToAdd`` is
    true it also appends column 1 divided by column 2.
    """

    def __init__(self, add_ColumnName_ToAdd=True): # no *args or **kargs
        # The hyperparameter must be stored under the constructor argument's
        # own name so that get_params()/set_params() can find it.
        self.add_ColumnName_ToAdd = add_ColumnName_ToAdd

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X):
        New_ColumnName = X[:, ColumnIndex_3] / X[:, ColumnIndex_4]
        if self.add_ColumnName_ToAdd:
            ColumnName_ToAdd = X[:, ColumnIndex_1] / X[:, ColumnIndex_2]
            return np.c_[X, New_ColumnName, ColumnName_ToAdd]
        return np.c_[X, New_ColumnName]


attr_adder = CombinedAttributesAdder(add_ColumnName_ToAdd=False)
Data_1_extra_attribs = attr_adder.transform(Data_1.values) # NumPy array
# Turn the NumPy array back into a DataFrame. With add_ColumnName_ToAdd=False
# exactly one column was appended, so exactly one new name is supplied.
Data_1_extra_attribs = pd.DataFrame(
    Data_1_extra_attribs,
    columns=list(Data_1.columns)+["New_ColumnName"],
    index=Data_1.index)
Data_1_extra_attribs.head()

In [None]:
# 3. Feature scaling
# Two common approaches: min-max scaling and standardization
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Min-max scaling (normalization): values end up in 0-1; feature_range changes the range
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
# Standardization: much less affected by outliers
std_scaler = StandardScaler()

In [None]:
# 4. Transformation Pipelines
# pipeline for the numerical attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]) # every step but the last must be a transformer, i.e. offer fit_transform()
Data_1_num_tr = num_pipeline.fit_transform(Data_1_num)
# A single transformer that handles both numerical and categorical columns
from sklearn.compose import ColumnTransformer
num_attribs = list(Data_1_num) # names of the numerical columns
cat_attribs = ["ColumnName_Categorical"] # names of the categorical columns
full_pipeline = ColumnTransformer([ # requires a list of tuples
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]) # a name, a transformer, and a list of names (or indices) of columns
Data_1_prepared = full_pipeline.fit_transform(Data_1)
# "drop" removes a column, "passthrough" leaves it untouched; columns that are
# not listed are handled according to the remainder hyperparameter

# A full pipeline with both preparation and prediction
# (imported here so that this cell does not depend on a later cell)
from sklearn.linear_model import LinearRegression
full_pipeline_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("linear", LinearRegression())
    ])

#### 4.2.模型选择与预测

In [None]:
# End-to-end example
import sklearn.linear_model
# Importing one submodule does not load the others, so each is imported explicitly
import sklearn.neighbors
from sklearn.tree import DecisionTreeRegressor
# Select a model (use `from sklearn.linear_model import LinearRegression`
# to be able to drop the `sklearn.linear_model.` prefix)
ModelName_1 = sklearn.linear_model.LinearRegression() # linear regression
ModelName_2 = sklearn.linear_model.Ridge(alpha=10**9.5) # ridge regression
ModelName_3 = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3) # KNN
ModelName_4 = DecisionTreeRegressor(random_state=42) # decision tree
from sklearn.ensemble import RandomForestRegressor
ModelName_5 = RandomForestRegressor(n_estimators=100, random_state=42)
from sklearn.svm import SVR
ModelName_6 = SVR(kernel="linear")
# Data preprocessing
Data_1_prepared = full_pipeline.transform(Data_1)
# Train the model; ModelName_ stands for whichever model was selected above
ModelName_ = ModelName_1
ModelName_.fit(X_Data_1, Y_Data_1)
# Parameters of a linear model
# NOTE(review): this indexing assumes a 2-D target, so that intercept_ is 1-D
# and coef_ is 2-D - confirm for the data at hand.
t0, t1 = ModelName_.intercept_[0], ModelName_.coef_[0][0]
# Make a prediction
X_new = [[22587]]  # X value to predict
ModelName_.predict(X_new) # predict
ModelName_.predict([[X_Value]])[0][0]

In [None]:
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.linear_model import LinearRegression

# Polynomial regression: expand the features to degree 60, standardize them,
# then fit an ordinary linear regression on the result.
poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)
scaler = preprocessing.StandardScaler()
# Only the class LinearRegression is imported above; no `linear_model` name exists.
lin_reg = LinearRegression()

pipeline_reg = pipeline.Pipeline([('poly',poly),('scal',scaler),('lin',lin_reg)])
pipeline_reg.fit(Xfull, yfull)
# X[:, np.newaxis] turns the 1-D array into a single-column 2-D array
curve = pipeline_reg.predict(X[:, np.newaxis])
plt.plot(X, curve)

In [None]:
# Evaluation on the training set
# Root mean squared error (RMSE) of the predictions
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(Data_labels, Data_predictions)
lin_rmse = np.sqrt(lin_mse)
# Mean absolute error (MAE) of the predictions
from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(Data_labels, Data_predictions)

In [None]:
# A better evaluation method: k-fold cross-validation
#  (the simpler alternative is to carve a validation set out of the training
#  set with train_test_split())
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, Data_prepared, Data_labels,
                         scoring="neg_mean_squared_error", cv=10)
# The scores are negated MSE values, so the sign is flipped before the square root
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation."""
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for caption, value in summary:
        print(caption, value)

#### 4.3.调整模型/调参

In [3]:
# 几种调参方法

In [None]:
# 0. Score of every hyperparameter combination tried by the search
#    (grid_search must have been fitted first; it is created in the grid-search cell)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # the score is a negated MSE, so flip the sign before the square root
    print(np.sqrt(-mean_score), params)
#  Show all results as a DataFrame
pd.DataFrame(grid_search.cv_results_)

In [1]:
# 1. Grid search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(Data_prepared, Data_labels)
# Show the best hyperparameter combination and the corresponding estimator
grid_search.best_params_
grid_search.best_estimator_

In [None]:
# 2. Randomized search: sample n_iter hyperparameter combinations from the
#    given distributions instead of trying every combination
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(Data_prepared, Data_labels)

In [None]:
# 3. 组合方法
#  通过组合来调整模型

In [None]:
# 4. Analyze the best model and its errors
#  Relative importance of each attribute
feature_importances = grid_search.best_estimator_.feature_importances_

#  Pair every importance score with its attribute name, most important first
extra_attribs = ["ColumnName_1", "ColumnName_2", "ColumnName_3"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# 5. Evaluate the final model on the test set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("ColumnName_toPredict", axis=1)
y_test = strat_test_set["ColumnName_toPredict"].copy()

# transform() only: the pipeline is not fitted again on the test data
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# 95% confidence interval for the RMSE based on the t distribution
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
# The same t-based interval computed by hand
m = len(squared_errors)
mean = squared_errors.mean()
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

# Alternative based on z-scores (normal distribution)
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)

### 5.分类
* 5.1. 二元分类
* 5.2. 性能度量
  * 5.2.1. 通过交叉验证测量准确率
  * 5.2.2. 混淆矩阵confusion matrix
  * 5.2.3. 精度（precision）和召回率（recall）
  * 5.2.4. 权衡精度和召回率
  * 5.2.5. ROC曲线
* 5.3. 多元分类
* 5.4. 误差分析
* 5.5. 多标签分类
* 5.6. 多输出分类

#### 5.1. 二元分类

In [None]:
# Stochastic Gradient Descent (SGD) classifier
# Advantage: handles large data sets efficiently
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train, y_train_5)

# Predict; predict() expects a list/array of samples, hence the extra brackets
sgd_clf.predict([some_digit])

#### 5.2. 性能度量（二元分类）

In [None]:
# Measure accuracy with cross-validation
# (k-fold cross-validation, here with 3 folds)
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

### 准确率通常并不是分类器理想的性能度量，尤其是处理偏斜样本集时skewed datasets

In [None]:
# Confusion matrix

# Steps
# 1. Run k-fold cross-validation and collect the prediction made for every
#    sample while it was part of a validation fold
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
# 2. Compute the confusion matrix
#     each row is an actual class, each column a predicted class
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)

In [None]:
# Precision and recall

from sklearn.metrics import precision_score, recall_score
# Compute the precision
precision_score(y_train_5, y_train_pred)
# Compute the recall
recall_score(y_train_5, y_train_pred)

# Compute the F1 score, i.e. the harmonic mean of precision and recall
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

In [None]:
# Precision/recall trade-off

# 1. Compute the decision scores instead of the predicted classes
from sklearn.model_selection import cross_val_predict
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method="decision_function")
# 2. Compute precision and recall for every possible threshold
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
# 3. Plot precision and recall against the threshold, or precision against recall

# np.argmax on the boolean array returns the first position where precision
# reaches 0.90
# Recall at that position
recall_90_precision = recalls[np.argmax(precisions >= 0.90)]
# Threshold at that position
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

### Plot precision and recall as functions of the threshold
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw the precision and recall curves against the decision threshold.

    The last element of *precisions* and *recalls* is left out so that both
    have the same length as *thresholds*.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_name, linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    plt.axis([-50000, 50000, 0, 1])
# Draw the curves, then mark the 90%-precision operating point with dotted
# guide lines and red dots (the lines tagged "Not shown" are not in the book).
plt.figure(figsize=(8, 4))                                                                  # Not shown
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# vertical guide at the threshold, horizontal guides at precision 0.9 and at the recall
plt.plot([threshold_90_precision, threshold_90_precision], [0., 0.9], "r:")                 # Not shown
plt.plot([-50000, threshold_90_precision], [0.9, 0.9], "r:")                                # Not shown
plt.plot([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision], "r:")# Not shown
# red dots on both curves at the chosen threshold
plt.plot([threshold_90_precision], [0.9], "ro")                                             # Not shown
plt.plot([threshold_90_precision], [recall_90_precision], "ro")                             # Not shown
save_fig("precision_recall_vs_threshold_plot")                                              # Not shown
plt.show()

### Plot precision as a function of recall
def plot_precision_vs_recall(precisions, recalls):
    """Draw precision (y axis) against recall (x axis) on the unit square."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    for set_axis_label, text in ((plt.xlabel, "Recall"), (plt.ylabel, "Precision")):
        set_axis_label(text, fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.grid(True)

# Draw the precision/recall curve and mark the 90%-precision point with
# dotted guide lines and a red dot, then save and show the figure.
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], "r:")
plt.plot([0.0, recall_90_precision], [0.9, 0.9], "r:")
plt.plot([recall_90_precision], [0.9], "ro")
save_fig("precision_vs_recall_plot")
plt.show()

In [None]:
# ROC curve

# 1. First way to compare binary classifiers
## Compute the TPR and FPR for the various thresholds
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

## Plot the TPR as a function of the FPR
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve together with the dashed diagonal of the unit square."""
    unit = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(unit, unit, 'k--')  # dashed diagonal
    plt.axis(unit + unit)
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)
# Draw the ROC curve and mark the point whose recall matches the
# 90%-precision operating point (lines tagged "Not shown" are not in the book).
plt.figure(figsize=(8, 6))                                    # Not shown
plot_roc_curve(fpr, tpr)
# first position where the TPR reaches recall_90_precision
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]           # Not shown
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")   # Not shown
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")  # Not shown
plt.plot([fpr_90], [recall_90_precision], "ro")               # Not shown
save_fig("roc_curve_plot")                                    # Not shown
plt.show()

# 2. Another way to compare classifiers
## Measure the area under the curve (AUC)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)

## Compute the ROC curve and the ROC AUC score of a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")
# predict_proba returns one column per class; the probability of the positive
# class (second column) serves as the score
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
roc_auc_score(y_train_5, y_scores_forest)

#### 5.3. 多元分类

#### 5.4. 误差分析

#### 5.5. 多标签分类

#### 5.6. 多输出分类