### 1.2 估计器(sklearn机器学习算法的实现)
在sklearn中，估计器(estimator)是一个重要的角色，是一类实现了算法的API

#### 1、用于分类的估计器：
sklearn.neighbors k-近邻算法
sklearn.naive_bayes 贝叶斯
sklearn.linear_model.LogisticRegression 逻辑回归
sklearn.tree 决策树
sklearn.ensemble 随机森林
#### 2、用于回归的估计器：
sklearn.linear_model.LinearRegression 线性回归
sklearn.linear_model.Ridge 岭回归
#### 3、用于无监督学习的估计器
sklearn.cluster.KMeans 聚类

### 案例1：鸢尾花种类预测

In [47]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def knn_iris():
    """Classify the iris dataset with a k-nearest-neighbours model (k=16).

    The data is split 80/20 with a fixed seed, the features are
    standardised, and the fitted classifier is evaluated on the held-out
    part. The predictions, their element-wise comparison with the true
    labels, and the accuracy are printed; nothing is returned.
    """
    # 1) Load the data and hold out 20% of it for testing.
    dataset = load_iris()
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.2,
        random_state=22,
    )

    # 2) Feature engineering: the scaler is fitted on the training part only
    #    and then applied to both parts.
    scaler = StandardScaler().fit(features_train)
    features_train = scaler.transform(features_train)
    features_test = scaler.transform(features_test)

    # 3) Fit the KNN estimator.
    knn = KNeighborsClassifier(n_neighbors=16).fit(features_train, labels_train)

    # 4) Evaluation, method 1: compare predictions with the true labels.
    predicted = knn.predict(features_test)
    print("y_predict:\n", predicted)
    print("y_predict == y_test\n", predicted == labels_test)

    # 4) Evaluation, method 2: accuracy on the test part.
    print("score:", knn.score(features_test, labels_test))
    
    
#对鸢尾花案例加上K值调优
def knn_iris_gscv():
    """Classify the iris dataset with KNN, choosing k by grid search.

    Same pipeline as the plain KNN example (80/20 split with a fixed seed,
    standardised features), but ``n_neighbors`` is selected from a list of
    candidates with 10-fold cross-validation. The predictions, the test
    accuracy and the grid-search results are printed; nothing is returned.
    """
    # 1) Load the data and hold out 20% of it for testing.
    dataset = load_iris()
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.2,
        random_state=22,
    )

    # 2) Feature engineering: fit the scaler on the training part only.
    scaler = StandardScaler().fit(features_train)
    features_train = scaler.transform(features_train)
    features_test = scaler.transform(features_test)

    # 3) KNN wrapped in a grid search with cross-validation over k.
    candidate_k = {"n_neighbors": [1, 3, 5, 7, 9, 11, 13]}
    search = GridSearchCV(KNeighborsClassifier(), param_grid=candidate_k, cv=10)
    search.fit(features_train, labels_train)

    # 4) Evaluation, method 1: compare predictions with the true labels.
    predicted = search.predict(features_test)
    print("y_predict:\n", predicted)
    print("y_predict == y_test\n", predicted == labels_test)

    # 4) Evaluation, method 2: accuracy on the test part.
    print("score:", search.score(features_test, labels_test))

    # Grid-search results, in order: best parameters, best cross-validated
    # score, best estimator, and the full cross-validation table.
    for attribute in ("best_params_", "best_score_", "best_estimator_", "cv_results_"):
        print(f"{attribute}:", getattr(search, attribute))
    
    

In [48]:
knn_iris()

y_predict:
 [0 2 1 2 1 1 1 2 1 0 2 1 2 2 0 2 1 1 1 1 0 2 0 1 2 0 2 2 2 2]
y_predict == y_test
 [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True]
score: 0.9666666666666667


In [49]:
knn_iris_gscv()

y_predict:
 [0 2 1 2 1 1 1 1 1 0 2 1 2 2 0 2 1 1 1 1 0 2 0 1 2 0 2 2 2 2]
y_predict == y_test
 [ True  True  True  True  True  True  True False  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True]
score: 0.9333333333333333
best_params_: {'n_neighbors': 5}
best_score_: 0.9666666666666666
best_estimator_: KNeighborsClassifier()
cv_results_: {'mean_fit_time': array([0.00070319, 0.00029962, 0.0007025 , 0.00050523, 0.00049438,
       0.0005981 , 0.00060277]), 'std_fit_time': array([0.00046046, 0.00045768, 0.00064673, 0.00050543, 0.00049453,
       0.00048875, 0.00049229]), 'mean_score_time': array([0.00109503, 0.0011003 , 0.00099375, 0.00118659, 0.00110552,
       0.0009963 , 0.00089726]), 'std_score_time': array([2.99924005e-04, 3.14113089e-04, 1.36463972e-05, 3.93445971e-04,
       2.96504393e-04, 3.84260933e-06, 5.36969256e-04]), 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11, 13],
             m

### Facebook签到位置预测K值调优

In [40]:
import pandas as pd

In [None]:
# 1) Load the data.
data = pd.read_csv("./dataset/FBlocation/train.csv")
# 2) Basic preprocessing.
# 2.1) Shrink the data set. Alternative kept for reference: restrict the
#      frame to a spatial window instead of a row count:
# data = data.query("x > 2 & x < 2.5 & y > 1 & y < 1.5")
# Keep the first 100000 rows. NumPy-style 2-D indexing (data[:, :100000]) is
# rejected by DataFrame.__getitem__, so use a positional row slice. The copy
# makes the later column assignments act on an independent frame rather than
# on a slice of the one that was read.
data = data.iloc[:100000].copy()

In [None]:
 # 2.2) Process the time feature.
# Interpret the numeric "time" column as a count of seconds and convert it
# to datetimes.
time_value = pd.to_datetime(data["time"], unit="s")
# Bare expression: shown as the cell output in the notebook.
time_value.values

In [None]:
# Wrap the converted timestamps in a DatetimeIndex; the following cells read
# its day / weekday / hour components.
date = pd.DatetimeIndex(time_value)
# Bare expression: shown as the cell output in the notebook.
date

In [None]:
# Add the day-of-month component of each timestamp as a feature column.
data["day"] = date.day

In [None]:
# Add the day-of-week component of each timestamp as a feature column.
data["weekday"] = date.weekday

In [None]:
# Add the hour component of each timestamp as a feature column.
data["hour"] = date.hour

In [None]:
# 2.3) Filter out places with few check-ins.
# Count the rows per place_id; the "row_id" column of the grouped counts is
# used as the number of check-ins at that place.
place_count = data.groupby("place_id").count()["row_id"]
# Preview the places that have more than 3 check-ins.
place_count[place_count > 3].head()

In [None]:
# Keep only the rows whose place_id has more than 3 check-ins.
data_final = data[data["place_id"].isin(place_count[place_count > 3].index.values)]

In [None]:
# Bare expression: displays the filtered frame as the cell output.
data_final

In [None]:
# Select the feature columns and the target column.
# Features: position, reported accuracy and the derived time components.
x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
# Target: the place to predict.
y = data_final["place_id"]

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and test parts with the default test size. No
# random_state is passed, so the split is not reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# 3) Feature engineering: standardise the features. The scaler is fitted on
#    the training part and the same transformation is applied to the test part.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4) KNN estimator.
estimator = KNeighborsClassifier()

# Wrap it in a grid search over n_neighbors with 5-fold cross-validation.
param_dict = {"n_neighbors":[3,5,7,9]}
estimator = GridSearchCV(estimator,  param_grid = param_dict, cv=5)

estimator.fit(x_train, y_train)

# 5) Model evaluation.
# Method 1: compare the predictions with the true values directly.
y_predict =  estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("y_predict == y_test\n", y_predict == y_test)

# Method 2: compute the accuracy.
score = estimator.score(x_test, y_test)
print("score:", score)

# Best parameters: best_params_
print("best_params_:", estimator.best_params_)
# Best cross-validated score: best_score_
print("best_score_:", estimator.best_score_)
# Best estimator: best_estimator_
print("best_estimator_:", estimator.best_estimator_)
# Cross-validation results: cv_results_
print("cv_results_:", estimator.cv_results_)

### 朴素贝叶斯

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [36]:
def nb_news():
    """Classify 20 newsgroups posts with a multinomial naive Bayes model.

    Fetches the 20 newsgroups data (scikit-learn downloads it on first use,
    so this needs network access unless the data is already cached), holds
    out 30% for testing, builds TF-IDF features and fits ``MultinomialNB``.
    The vocabulary, the first 100 predicted and true labels, and the test
    accuracy are printed.

    :return: None
    """
    # Fetch the news data (20 categories).
    news = fetch_20newsgroups()

    # Split into training and test parts.
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3)

    # Text data: extract TF-IDF features.
    tf = TfidfVectorizer()

    x_train = tf.fit_transform(x_train)
    # Print the vocabulary: every distinct term found in the training part.
    # get_feature_names() was superseded by get_feature_names_out() in
    # scikit-learn 1.0 and removed in 1.2, so prefer the new name and fall
    # back to the old one on older installations.
    if hasattr(tf, "get_feature_names_out"):
        print(tf.get_feature_names_out())
    else:
        print(tf.get_feature_names())

    # The test part must reuse the vocabulary and IDF weights learned from
    # the training part, so call transform() here, not fit_transform().
    x_test = tf.transform(x_test)

    # Fit the estimator; alpha=1.0 is the additive smoothing parameter.
    mlb = MultinomialNB(alpha=1.0)
    mlb.fit(x_train, y_train)

    # Predict the category of every test document.
    y_predict = mlb.predict(x_test)

    print("预测每篇文章的类别：",y_predict[:100])
    print("真实类别为：", y_test[:100])

    print("预测准确率为：", mlb.score(x_test, y_test))

    return None

In [37]:
nb_news()

HTTPError: HTTP Error 403: Forbidden

### 决策树
class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=None)

决策树分类器

    criterion:默认是'gini'系数，也可以选择信息增益的熵'entropy'
    max_depth:树的深度大小
    random_state:随机数种子
其中会有些超参数：max_depth:树的深度大小
其它超参数我们会结合随机森林讲解

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz


In [73]:
def decisionTree_iris():
    """Classify the iris dataset with a decision tree and export the tree.

    The data is split 80/20 with a fixed seed and a tree using the entropy
    criterion is fitted on the training part. The predictions, their
    element-wise comparison with the true labels, and the accuracy are
    printed, and the fitted tree is written to ``iris_tree.dot``.

    :return: None
    """
    # Load the data and hold out 20% of it for testing.
    dataset = load_iris()
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.2,
        random_state=22,
    )

    # Decision tree estimator that splits on information gain (entropy).
    tree = DecisionTreeClassifier(criterion="entropy").fit(features_train, labels_train)

    # Evaluation, method 1: compare predictions with the true labels.
    predicted = tree.predict(features_test)
    print("y_predict:\n", predicted)
    print("y_predict == y_test\n", predicted == labels_test)

    # Evaluation, method 2: accuracy on the test part.
    print("score:", tree.score(features_test, labels_test))

    # Visualisation: write the tree in Graphviz DOT format. The file's
    # contents can be pasted into http://webgraphviz.com/ to render it.
    export_graphviz(
        tree,
        out_file="iris_tree.dot",
        feature_names=dataset.feature_names,
    )
    

In [74]:
decisionTree_iris()

y_predict:
 [0 2 1 2 1 1 1 1 1 0 2 1 2 2 0 2 1 1 1 1 0 2 0 1 2 0 1 2 2 1]
y_predict == y_test
 [ True  True  True  True  True  True  True False  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True False  True  True False]
score: 0.8666666666666667


### 案例：泰坦尼克号乘客生存预测 (决策树)

In [136]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [137]:
# Load the data.
# NOTE(review): `path` is not used anywhere in the visible cells; the data
# is read from the local CSV files below.
path = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"

train = pd.read_csv("dataset/taitanic/train.csv")
test = pd.read_csv("dataset/taitanic/test.csv")


# Plan for the following cells:
# - data processing: handle missing values, convert features to dicts

# - prepare the feature values and the target values
# - split the data set
# - feature engineering: dict feature extraction
# - decision tree estimator workflow
# - model evaluation

In [138]:
train.info()  # Show an overview of the data set (columns, non-null counts, dtypes).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [139]:
train.describe()  # Summary statistics of the numeric columns.

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [140]:
train.describe(include=['O'])  # Summary of the non-numeric (object) columns.

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [141]:
# Data processing: fill in missing values before the features are converted
# to dicts.
#
# Each column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that chained in-place form acts on an intermediate
# object, triggers a warning on recent pandas and does not update the frame
# when Copy-on-Write is enabled.

## Training set
# Columns with missing values: Age, Cabin, Embarked.
# Age: fill the gaps with the mean age.
train["Age"] = train["Age"].fillna(train["Age"].mean())

# Cabin: too many values are missing in both the training and the test set
# to fill them in.
# Embarked is the port of embarkation. "S" is by far the most frequent port
# (see train["Embarked"].value_counts()), so the missing ports are filled
# with "S".
train["Embarked"] = train["Embarked"].fillna("S")


## Test set: Age, Fare and Cabin have missing values.
# NOTE(review): the test set is filled with its own means rather than the
# training-set means; kept as in the original.
# Age: fill the gaps with the mean age.
test["Age"] = test["Age"].fillna(test["Age"].mean())
# Fare: fill the gaps with the mean fare.
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())
# Cabin: too many values are missing in both sets to fill them in.


In [142]:
# Feature selection: the columns used as model input.
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
train_features = train[features]
# Target: the survival label of the training set.
train_labels = train['Survived']
test_features = test[features]

In [143]:
# Feature engineering: dict feature extraction.
# Some features are strings, which the model cannot use directly, so they
# are converted to numeric columns.

# Sex has two values (male / female); it becomes the indicator columns
# Sex=male and Sex=female, each holding 0 or 1.
# Embarked has three values (S / C / Q); it becomes Embarked=S, Embarked=C
# and Embarked=Q, each holding 0 or 1.
# DictVectorizer from sklearn.feature_extraction does this one-hot encoding
# for string-valued entries and passes numeric entries through.

dv = DictVectorizer(sparse=False)
# orient="records" yields one {column: value} dict per row. The spelling
# "record" was a deprecated abbreviation that newer pandas rejects.
train_features = dv.fit_transform(train_features.to_dict(orient="records"))

# Bare expression: shows the generated column names as the cell output.
dv.feature_names_

  train_features = dv.fit_transform(train_features.to_dict(orient="record"))


['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [144]:
# Decision tree estimator: split on information gain (entropy) and fit on
# the vectorised training features.
clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(train_features, train_labels)

DecisionTreeClassifier(criterion='entropy')

In [145]:
# Model evaluation: encode the test features with the vectorizer fitted on
# the training data, then predict.
# orient="records" yields one {column: value} dict per row. The spelling
# "record" was a deprecated abbreviation that newer pandas rejects.
test_features = dv.transform(test_features.to_dict(orient="records"))
pred_labels = clf.predict(test_features)
# Bare expression: shows the predicted labels as the cell output.
pred_labels

  test_features = dv.transform(test_features.to_dict(orient="record"))


array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [146]:
# For evaluation, the decision tree's score() method returns the accuracy
# directly. The test set has no ground-truth survival labels, so the model
# can only be scored on the training data here.
# NOTE(review): this is the accuracy on the same data the tree was fitted
# on, so it is an optimistic estimate.

acc_decision_tree = round(clf.score(train_features, train_labels), 6)
acc_decision_tree

0.982043

In [147]:
# Estimate the decision tree's accuracy with K-fold cross-validation.
import numpy as np
from sklearn.model_selection import cross_val_score

# Mean accuracy over 10 folds of the training data.
np.mean(cross_val_score(clf, train_features, train_labels, cv = 10))

0.7756304619225968

### 随机森林
class sklearn.ensemble.RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, bootstrap=True, random_state=None, min_samples_split=2)

#### 随机森林分类器
    n_estimators：integer，optional（default = 100，0.22 版本之前为 10）森林里的树木数量120,200,300,500,800,1200
    criterion：string，可选（default ="gini"）分割特征的测量方法
    max_depth：integer或None，可选（默认=无）树的最大深度 5,8,15,25,30
    max_features="auto",每个决策树的最大特征数量
        If "auto", then max_features=sqrt(n_features).
        If "sqrt", then max_features=sqrt(n_features) (same as "auto").
        If "log2", then max_features=log2(n_features).
        If None, then max_features=n_features.
    bootstrap：boolean，optional（default = True）是否在构建树时使用放回抽样
    min_samples_split:节点划分最少样本数
    min_samples_leaf:叶子节点的最小样本数
    
超参数：n_estimators, max_depth, min_samples_split, min_samples_leaf

### 泰坦尼克号乘客生存预测 (随机森林) (训练集和测试集用的是决策树的)

In [150]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

estimator = RandomForestClassifier()
# Wrap it in a grid search over the number of trees and the maximum depth,
# with 3-fold cross-validation.

param_dict = {"n_estimators":[120,200,300,500,800,1200],
              "max_depth": [5, 8, 15, 25, 30]}
estimator = GridSearchCV(estimator,  param_grid = param_dict, cv=3)

estimator.fit(train_features, train_labels)

# Predict on the test features prepared in the decision-tree section.
pred_labels = estimator.predict(test_features)
print("预测结果：", pred_labels)

# 5) Model evaluation.

from sklearn.model_selection import cross_val_score

# NOTE(review): `estimator` is the GridSearchCV object, so the whole grid
# search is re-run inside each of the 10 folds; expect this to be slow.
np.mean(cross_val_score(estimator, train_features, train_labels, cv = 10))

# # 5）模型评估
# #法1:直接比对真实值和预测值
# y_predict =  estimator.predict(x_test)
# print("y_predict:\n", y_predict)
# print("y_predict == y_test\n", y_predict == y_test)
 
# #法2:计算准确率 
# score = estimator.score(x_test, y_test)
# print("score:", score)

预测结果： [0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 0 1 0 1 0 0 1 0 0 1]


0.826079900124844