In [2]:
#データのロード
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

#入力データ: x(30次元)
#正解データ: y
x = cancer.data
y = cancer.target

In [3]:
#サンプルデータの分割
random_seed =123

#データ分割のパラメータ(訓練データ:90%, 検証データ:10%)
test_size = 0.1

#データ分割
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size,
                                                    random_state=random_seed, stratify=y)
#分割後サイズ確認
print(x.shape)
print(x_train.shape)
print(x_test.shape)

(569, 30)
(512, 30)
(57, 30)


In [7]:
#複数アルゴリズムで精度を比較
#結果が同じになるよう random_stateは同一にする
random_seed=123

#線形回帰
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=random_seed)

#サポートベクターマシン(カーネル)
from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', random_state=random_seed)

#決定木
from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=random_seed)

#ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=random_seed)

#XGBoost
from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=random_seed)

#アルゴリズムのリスト作成
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [8]:
#複数アルゴリズムで精度比較
for algorithm in algorithms:
    
    #訓練データで学習
    algorithm.fit(x_train, y_train)
    
    #検証データで精度鑑定
    score = algorithm.score(x_test, y_test)
    
    #アルゴリズム名取得
    name = algorithm.__class__.__name__
    
    #精度とアルゴリズム名表示
    print(f'score: {score:.4f} {name}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score: 0.9649 LogisticRegression
score: 0.8947 SVC
score: 0.9474 DecisionTreeClassifier
score: 0.9298 RandomForestClassifier
score: 0.9825 XGBClassifier


In [9]:
#デフォルトのパラメータ値の確認
algorithm = SVC(kernel='rbf', random_state=random_seed)
print(algorithm)

SVC(random_state=123)


In [10]:
#gammaの最適化
gammas = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

for gamma in gammas:
    algorithm = SVC(kernel='rbf', gamma = gamma, random_state=random_seed)
    algorithm.fit(x_train, y_train)
    score = algorithm.score(x_test, y_test)
    print(f'score: {score:.4f} gamma: {gamma}')

score: 0.6316 gamma: 1
score: 0.6316 gamma: 0.1
score: 0.6316 gamma: 0.01
score: 0.9474 gamma: 0.001
score: 0.9474 gamma: 0.0001
score: 0.9474 gamma: 1e-05


In [11]:
#Cの最適化
#gammaは先ほど調べた最適値0.001を採用
Cs = [1, 10, 100, 1000, 10000]
for C in Cs:
    algorithm = SVC(kernel='rbf', gamma=0.001, C=C, 
                    random_state=random_seed)
    algorithm.fit(x_train, y_train)
    score = algorithm.score(x_test, y_test)
    print(f'score: {score:.4f} C: {C}')

score: 0.9474 C: 1
score: 0.9298 C: 10
score: 0.9298 C: 100
score: 0.9298 C: 1000
score: 0.9298 C: 10000


In [14]:
#特定のアルゴリズムに対して交差検定を実施

#アルゴリズムの定義
algorithm = SVC(kernel='rbf', random_state=random_seed, gamma=0.001, C=1)

#分割時に正解データの分布が偏らないようにStratifiedKFoldを使用
from sklearn.model_selection import StratifiedKFold
stratifiedkFold = StratifiedKFold(n_splits=3)

#交差検定の実施(分割数＝3)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(algorithm, x_train, y_train, cv=stratifiedkFold)

#平均値の計算
mean = scores.mean()

#結果確認
print(f'平均スコア: {mean:.4f} 個別スコア: {scores}')

平均スコア: 0.9141 個別スコア: [0.88888889 0.91812865 0.93529412]


In [16]:
#候補アルゴリズムのリスト作成

#線形回帰
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=random_seed)

#サポートベクターマシン(カーネル)
from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', random_state=random_seed, gamma=0.001, C=1)

#決定木
from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=random_seed)

#ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=random_seed)

#XGBoost
from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=random_seed)

#アルゴリズムのリスト作成
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [18]:
#複数アルゴリズムで精度を比較

#分割時に正解データの分布が偏らないようにStratifiedKFoldを利用
from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import cross_val_score
for algorithm in algorithms:
    #交差検定法の実行
    scores = cross_val_score(algorithm, x_train, y_train,
                             cv=stratifiedkfold)
    score = scores.mean()
    name = algorithm.__class__.__name__
    print(f'平均スコア: {score:.4f} 個別スコア: {scores} {name}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

平均スコア: 0.9453 個別スコア: [0.93567251 0.94736842 0.95294118] LogisticRegression
平均スコア: 0.9141 個別スコア: [0.88888889 0.91812865 0.93529412] SVC
平均スコア: 0.9062 個別スコア: [0.87134503 0.94152047 0.90588235] DecisionTreeClassifier
平均スコア: 0.9629 個別スコア: [0.96491228 0.95906433 0.96470588] RandomForestClassifier
平均スコア: 0.9570 個別スコア: [0.94736842 0.96491228 0.95882353] XGBClassifier
