In [10]:
# dataframe and array
import pandas as pd
import numpy as np

# plot
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, precision_recall_curve,accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# for hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, any_classifier

# for sampling
from imblearn.over_sampling import SMOTE


### model selection 

In [2]:
# load the testing dataset
from sklearn.datasets import load_iris
iris = load_iris()

In [3]:
# cross_val_score
clf = RandomForestClassifier()
cross_val_score(clf, iris.data, iris.target, cv=5) #5-fold cv

# 5折fold
# 默认输出score为准确率

array([ 0.96666667,  0.96666667,  0.93333333,  0.93333333,  1.        ])

In [None]:
# 可以用其他分数 如f1， auc
cross_val_score(clf, iris.data, iris.target, cv=5, scoring='f1')

In [12]:
# 扩展同时支持多个分数输出

from multiscorer import MultiScorer  #https://github.com/StKyr/multiscorer/
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [18]:
models = [GaussianNB(), DecisionTreeClassifier(), SVC()]
names = ["Naive Bayes", "Decision Tree", "SVM"]

scorer = MultiScorer({
    'Accuracy' : (accuracy_score, {}),
    'Precision' : (precision_score, {'pos_label': 3, 'average':'macro'}),
    'Recall' : (recall_score, {'pos_label': 3, 'average':'macro'})
})

for model, name in zip(models, names):
    print name
    start = time.time()

    cross_val_score(model, iris.data, iris.target,scoring=scorer, cv=10)
    results = scorer.get_results()

    for metric_name in results.keys():
        average_score = np.average(results[metric_name])
        scores = np.array(results[metric_name])
        print("%s: %0.5f (+/- %0.3f)" % (metric_name, scores.mean(), scores.std() * 2))
#         print('%s : %f' % (metric_name, average_score))

    print 'time', time.time() - start, '\n\n'

Naive Bayes
Recall: 0.95333 (+/- 0.085)
Precision: 0.96270 (+/- 0.065)
Accuracy: 0.95333 (+/- 0.085)
time 0.0720000267029 


Decision Tree
Recall: 0.95667 (+/- 0.087)
Precision: 0.96357 (+/- 0.075)
Accuracy: 0.95667 (+/- 0.087)
time 0.0639998912811 


SVM
Recall: 0.96444 (+/- 0.082)
Precision: 0.97016 (+/- 0.070)
Accuracy: 0.96444 (+/- 0.082)
time 0.0640001296997 




### sampling

In [21]:
# 基本split
print 'before split'
print iris.data.shape, iris.target.shape
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)

print 'after split'
print 'train dataset:'
print X_train.shape, y_train.shape
print 'test dataset:'
print X_test.shape, y_test.shape

before split
(150L, 4L) (150L,)
after split
train dataset:
(90L, 4L) (90L,)
test dataset:
(60L, 4L) (60L,)
