### 1. 乳腺癌案例

In [19]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.metrics import accuracy_score  # 准确率

import warnings
# Suppress all warnings from this point on; this includes any warnings
# emitted by the libraries used in the cells below.
warnings.filterwarnings("ignore")

#### 1.1 加载数据

In [7]:
from sklearn.model_selection import train_test_split

# Load the breast-cancer data as a (features, labels) pair and hold out 20%
# of the rows for testing.  The split is seeded (same seed as the digits
# section of this notebook) so that re-running the notebook evaluates every
# model on the same partition instead of a fresh random one each time.
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1024
)

In [8]:
# Show the shapes of the four split arrays as one tuple.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((455, 30), (114, 30), (455,), (114,))

#### 1.2 决策树

In [17]:
# Fit a fresh, unseeded decision tree 100 times on the same train/test split
# and record each run's test accuracy.  Because exactly 100 accuracy
# fractions are added up, the total reads directly as the mean accuracy in
# percent.
run_scores = []
for i in range(100):
    model = DecisionTreeClassifier().fit(X_train, y_train)
    y_ = model.predict(X_test)  # predictions for the held-out samples
    run_scores.append(accuracy_score(y_test, y_))
acore = sum(run_scores)
acore

93.14035087719286

#### 1.3 随机森林

In [13]:
# Same protocol as a single-model evaluation repeated 100 times: train a new
# default random forest on the fixed split, score it on the test rows, and
# add the accuracies together (100 runs, so the total is the mean accuracy
# in percent).
run_scores = []
for i in range(100):
    model = RandomForestClassifier().fit(X_train, y_train)
    y_ = model.predict(X_test)  # predictions for the held-out samples
    run_scores.append(accuracy_score(y_test, y_))
acore = sum(run_scores)
acore

97.30701754385959

#### 1.4 AdaBoost算法

In [16]:
# Train a default AdaBoost classifier 100 times on the fixed split and add
# up the test accuracies (100 runs, so the total is the mean accuracy in
# percent).
# NOTE(review): the recorded total looks like 100 times a single accuracy
# value, which suggests every run gave the same predictions - confirm
# whether repeating this loop adds any information.
run_scores = []
for i in range(100):
    model = AdaBoostClassifier().fit(X_train, y_train)
    y_ = model.predict(X_test)  # predictions for the held-out samples
    run_scores.append(accuracy_score(y_test, y_))
acore = sum(run_scores)
acore

95.6140350877192

In [21]:
# Single run of a default XGBoost classifier: fit on the training rows,
# predict the test rows, and display the accuracy as a fraction.
model = XGBClassifier()
model.fit(X_train, y_train)
y_ = model.predict(X_test)  # predictions for the held-out samples
accuracy_score(y_true=y_test, y_pred=y_)

0.9736842105263158

### 2.手写数字多分类案例

In [57]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### 2.1 加载数据 

In [58]:
# Load the digits data as a (features, labels) pair and make a seeded 80/20
# train/test split.
X, y = datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1024,
)
# Print the four shapes on one line, each followed by a single space.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, end=" ")

(1437, 64) (360, 64) (1437,) (360,) 

#### 2.2 决策树

In [59]:
# Baseline on the digits split: one decision tree with default settings,
# scored by accuracy on the held-out rows.
model = DecisionTreeClassifier().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.8222222222222222

#### 2.2 随机森林

In [60]:
# Default random forest on the digits split, scored by test accuracy.
model = RandomForestClassifier().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.9805555555555555

#### 2.3 AdaBoost

In [61]:
# Default AdaBoost on the digits split, scored by test accuracy.
model = AdaBoostClassifier().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.24166666666666667

In [62]:
# Display the dimensions of the full feature matrix before any feature selection.
X.shape

(1797, 64)

#### 2.4 剔除冗余特征

In [63]:
# Collect the indices of the feature columns that take more than 5 distinct
# values; columns with 5 or fewer distinct values are dropped by the later
# `X[:, index]` selection.  The column count is read from X itself rather
# than hard-coded, so the filter keeps working if the feature matrix changes.
index = [col for col in range(X.shape[1]) if len(np.unique(X[:, col])) > 5]
len(index)

54

In [68]:
# Redo the seeded 80/20 split using only the columns listed in `index`,
# then train and score a default AdaBoost classifier on that reduced data.
X_train, X_test, y_train, y_test = train_test_split(
    X[:, index],
    y,
    test_size=0.2,
    random_state=1024,
)
model = AdaBoostClassifier().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.24166666666666667

#### 2.5 PCA降维

In [120]:
from sklearn.decomposition import PCA

In [121]:
# Reload the digits data and project it with PCA.  A fractional
# n_components asks PCA to keep enough components to reach that share of
# explained variance (0.95 here); whitening is left off.
# NOTE(review): the projection is fitted on the full data set, before the
# train/test split made in a later cell, so test rows influence the
# components - confirm this is acceptable for the comparison.
X, y = datasets.load_digits(return_X_y=True)
pca = PCA(n_components=0.95, whiten=False)
X3 = pca.fit_transform(X)
X3.shape

(1797, 29)

In [122]:
# Seeded 80/20 split of the PCA-projected matrix X3.
X_train, X_test, y_train, y_test = train_test_split(
    X3,
    y,
    test_size=0.2,
    random_state=1024,
)

In [123]:
# Default AdaBoost trained and scored on the current split (the
# PCA-projected data when the cells are run in order).
model = AdaBoostClassifier().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.19444444444444445

In [124]:
# Default random forest trained and scored on the current split (the
# PCA-projected data when the cells are run in order).
model = RandomForestClassifier().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.9777777777777777

In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
# Default logistic regression trained and scored on the current split (the
# PCA-projected data when the cells are run in order).
model = LogisticRegression().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.95

In [127]:
# Comparison run without PCA: split the un-projected feature matrix X with
# the same test fraction and seed, then fit and score a default logistic
# regression on it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1024,
)
model = LogisticRegression().fit(X_train, y_train)
y_ = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)

0.9694444444444444