In [1]:
# 資料處理使用
import pandas as pd
import numpy as np

# 模型驗證
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# 模型演算法
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
import os

In [3]:
os.chdir('C:/Anaconda3/envs/Trainpython/Lib/site-packages/')

In [4]:
from xgboost.sklearn import XGBClassifier

# 資料讀取

In [5]:
# 資料讀入dataframe
iris = pd.read_csv('C:/Users/hsu/Desktop/DSLab/testpy/iris.csv')
df = iris.copy()

In [15]:
pd.DataFrame(st.fit_transform(X), columns=X.columns)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
5,-0.537178,1.939791,-1.169714,-1.052180
6,-1.506521,0.788808,-1.340227,-1.183812
7,-1.021849,0.788808,-1.283389,-1.315444
8,-1.748856,-0.362176,-1.340227,-1.315444
9,-1.143017,0.098217,-1.283389,-1.447076


In [6]:
# 資料切分
y = df['Species']
X = df.drop(['Species'], axis = 1)

# 資料標準化
st = StandardScaler()
X_s = pd.DataFrame(st.fit_transform(X), columns=X.columns)

# 資料切分
X_train, X_test, y_train, y_test = train_test_split(X_s, y, test_size = 0.3, random_state = 123)

# 建立模型_Decision

In [22]:
# DecisionTree建模
declf = DecisionTreeClassifier()
declf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [39]:
# DecisionTree預測
dey_pred = declf.predict(X_test)
# print('Pred = {}; Len = {}'.format(dey_pred, len(dey_pred)))
# pd.DataFrame(dey_pred, columns=['Pred'])

In [67]:
# DecisionTree performs評估
print(confusion_matrix(y_test, dey_pred))
print(accuracy_score(y_test, dey_pred))
print(precision_score(y_test, dey_pred, average='macro'))
print(recall_score(y_test, dey_pred, average='macro'))
# pd.DataFrame({'A':y_test,'Pred':dey_pred})

[[18  0  0]
 [ 0 10  0]
 [ 0  2 15]]
0.9555555555555556
0.9444444444444445
0.9607843137254902


In [86]:
# 方法一：手動交互驗證
method = ['gini', 'entropy']
eva = pd.DataFrame()
for i in method:
    dt = DecisionTreeClassifier(criterion=i)
    test = cross_val_score(dt, X_train, y_train, cv = 10, scoring='accuracy')
    eva = pd.concat([eva,pd.DataFrame({str(i):test})], axis = 1)
print(eva.gini.mean())
print(eva.entropy.mean())

0.9600000000000002
0.9600000000000002


In [105]:
# 方法二：GridSearchCV
dt = DecisionTreeClassifier()
parameters = {'criterion':['entropy', 'gini']}
grid_search = GridSearchCV(estimator=dt, param_grid=parameters, cv=10, scoring='accuracy')
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'entropy'}
0.9619047619047619


# 建立模型_Logistic

In [5]:
# Logistic建模
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
# Logistic預測
lry_pred = lr.predict(X_test)

In [10]:
# Logistic模型評估
print(confusion_matrix(y_test, lry_pred))
print(accuracy_score(y_test, lry_pred))
print(f1_score(y_test, lry_pred, average='macro'))
print(precision_score(y_test, lry_pred, average='macro'))
print(recall_score(y_test, lry_pred, average='macro'))

[[18  0  0]
 [ 0 10  0]
 [ 0  2 15]]
0.9555555555555556
0.9488636363636364
0.9444444444444445
0.9607843137254902


In [17]:
# GridSearchCV
lr = LogisticRegression()
parameters = {'penalty':['l1','l2']}
grid_search = GridSearchCV(estimator=lr,
                           param_grid=parameters,
                           cv=10,
                           scoring='accuracy')
grid_search = grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)



{'penalty': 'l1'}
0.9333333333333333


# 建立模型_SVM

In [28]:
# SVM建模
svmclf = SVC(kernel='linear')
svmclf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
# SVM預測
svmy_pred = svmclf.predict(X_test)

In [30]:
# 模型評估
print(confusion_matrix(y_test, svmy_pred))
print(accuracy_score(y_test, svmy_pred))
print(f1_score(y_test, svmy_pred, average='macro'))
print(precision_score(y_test, svmy_pred, average = 'macro'))
print(recall_score(y_test, svmy_pred, average='macro'))

[[18  0  0]
 [ 0 10  0]
 [ 0  1 16]]
0.9777777777777777
0.9740259740259741
0.9696969696969697
0.9803921568627452


In [27]:
# GridSearchCV
svmclf = SVC()
parameters = {'kernel':['rbf', 'poly', 'sigmoid', 'linear']}
grid_search = GridSearchCV(estimator=svmclf,
                           param_grid=parameters,
                           cv = 10,
                           scoring='accuracy')
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)



{'kernel': 'linear'}
0.9809523809523809




# 建立模型_NB

In [31]:
# NB建立模型
nbclf = GaussianNB()
nbclf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
# NB預測
nby_pred = nbclf.predict(X_test)

In [34]:
# 模型評估
print(confusion_matrix(y_test, nby_pred))
print(accuracy_score(y_test, nby_pred))
print(f1_score(y_test, nby_pred, average='macro'))
print(precision_score(y_test, nby_pred, average = 'macro'))
print(recall_score(y_test, nby_pred, average = 'macro'))

[[18  0  0]
 [ 0 10  0]
 [ 0  2 15]]
0.9555555555555556
0.9488636363636364
0.9444444444444445
0.9607843137254902


# 建立模型_KNN

In [35]:
# KNN建立模型
knnclf = KNeighborsClassifier()
knnclf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [36]:
# KNN預測
knny_pred = knnclf.predict(X_test)

In [39]:
# 模型評估
print(confusion_matrix(y_test, knny_pred))
print(accuracy_score(y_test, knny_pred))
print(f1_score(y_test, knny_pred, average = 'macro'))
print(precision_score(y_test, knny_pred, average = 'macro'))
print(recall_score(y_test, knny_pred, average = 'macro'))

[[18  0  0]
 [ 0  9  1]
 [ 0  2 15]]
0.9333333333333333
0.9220779220779222
0.9185606060606061
0.9274509803921568


In [45]:
# GridSearchCV
knnclf = KNeighborsClassifier()
parameters = {'n_neighbors':[5, 10,15,20],
              'weights':['uniform','distance']}
grid_search = GridSearchCV(estimator=knnclf,
                           param_grid=parameters,
                           cv = 10,
                           scoring='accuracy')
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'n_neighbors': 5, 'weights': 'uniform'}
0.9714285714285714




# 建立模型_MLP

In [66]:
# MLP建立模型
mlp = MLPClassifier(hidden_layer_sizes=(50, 50, 50), activation='tanh', solver='adam')
mlp.fit(X_train, y_train)



MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [67]:
# MLP預測
mlpy_pred = mlp.predict(X_test)

In [69]:
# 模型評估
print(confusion_matrix(y_test, mlpy_pred))
print(accuracy_score(y_test, mlpy_pred))
print(f1_score(y_test, mlpy_pred, average='macro'))
print(precision_score(y_test, mlpy_pred, average = 'macro'))
print(recall_score(y_test, mlpy_pred, average = 'macro'))

[[18  0  0]
 [ 0 10  0]
 [ 0  2 15]]
0.9555555555555556
0.9488636363636364
0.9444444444444445
0.9607843137254902


In [60]:
# GridSearchCV
mlpclf = MLPClassifier()
parameters = {'hidden_layer_sizes':[(50,50),(60,60),(50,50,50)],
              'activation':['relu', 'logistic', 'tanh'],
              'solver':['sgd','adam']}
grid_search = GridSearchCV(estimator=mlpclf,
                           param_grid=parameters,
                           cv = 10,
                           scoring='accuracy')
grid_search = grid_search.fit(X_train, y_train)











In [63]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'activation': 'tanh', 'hidden_layer_sizes': (50, 50, 50), 'solver': 'adam'}
0.9714285714285714


# 建立模型_RF

In [75]:
# RF建模型
rfclf = RandomForestClassifier(n_estimators=50, criterion='entropy')
rfclf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [76]:
# RF預測
rfy_pred = rfclf.predict(X_test)

In [77]:
# 模型評估
print(confusion_matrix(y_test, rfy_pred))
print(accuracy_score(y_test, rfy_pred))
print(f1_score(y_test, rfy_pred, average='macro'))
print(precision_score(y_test, rfy_pred,average='macro'))
print(recall_score(y_test, rfy_pred,average='macro'))

[[18  0  0]
 [ 0 10  0]
 [ 0  2 15]]
0.9555555555555556
0.9488636363636364
0.9444444444444445
0.9607843137254902


In [74]:
# GridSearchCV
rfclf = RandomForestClassifier()
parameters = {'n_estimators':[50,100,150,200],
              'criterion':['entropy', 'gini']}
grid_search = GridSearchCV(estimator=rfclf,
                           param_grid=parameters,
                           cv = 10, 
                           scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'entropy', 'n_estimators': 50}
0.9619047619047619




# 建立模型_XGB

In [21]:
# XGB建立模型
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1,
       tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
# XGB預測
xgby_pred = xgb.predict(X_test)

In [23]:
# 模型評估
print(confusion_matrix(y_test, xgby_pred))
print(accuracy_score(y_test, xgby_pred))
print(f1_score(y_test, xgby_pred, average='macro'))
print(precision_score(y_test, xgby_pred,average='macro'))
print(recall_score(y_test, xgby_pred,average='macro'))

[[18  0  0]
 [ 0 10  0]
 [ 0  3 14]]
0.9333333333333333
0.9242636746143057
0.923076923076923
0.9411764705882352


In [34]:
# GridSearchCV
xgb = XGBClassifier()
parameters = {'n_estimators':[50,100,150,200],
              'learning_rate':[i/100 for i in range(1,11)]}
grid_search = GridSearchCV(estimator=xgb,
                           param_grid=parameters,
                           cv = 10,
                           scoring='accuracy')
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'learning_rate': 0.03, 'n_estimators': 150}
0.9428571428571428




# 模型存、取

In [78]:
import pickle

In [82]:
with open('C:/Users/hsu/Desktop/DSLab/rf.pkl', mode='wb') as mode:
    pickle.dump(obj=rfclf, file=mode)

In [83]:
with open('C:/Users/hsu/Desktop/DSLab/rf.pkl', mode='rb') as re:
    model = pickle.load(file=re)

# 筆記區

In [46]:
# 筆記區
df.iloc[4:14, 2:4]
# df.iloc[X_test.index, 3:5] # 可把指數放入dataframe引數中查找

Unnamed: 0,Petal.Length,Petal.Width
4,1.4,0.2
6,1.4,0.3
8,1.4,0.2
10,1.5,0.2
12,1.4,0.1


In [None]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV

classifier = KerasClassifier(
    build_fn = buildClassifierWithDropout, 
    epochs = 10,
    verbose = 0)

parameters = {'batch_size': [10, 15],
        'optimizer': ['adam', 'rmsprop']}

grid_search = GridSearchCV(estimator = classifier,
               param_grid = parameters,
               scoring = 'accuracy',
               cv = 2)

grid_search     = grid_search.fit(train_x, train_y)
best_parameters = grid_search.best_params_
best_accuracy   = grid_search.best_score_

In [None]:
# 建置隨機種子

In [1]:
import numpy as np
import pandas as pd

In [24]:
iris = pd.read_csv('C:/Users/hsu/Desktop/DSLab/testpy/iris.csv')
df = iris.copy()

In [10]:
# 隨機種子
rds = np.random.choice(range(1,len(df.index)), 80, replace = False) 

In [18]:
# 隨機種子放入資料引數
df.iloc[rds, 1:6].head()

Unnamed: 0,Sepal.Width,Petal.Length,Petal.Width,Species
59,2.7,3.9,1.4,versicolor
27,3.5,1.5,0.2,setosa
142,2.7,5.1,1.9,virginica
14,4.0,1.2,0.2,setosa
51,3.2,4.5,1.5,versicolor


In [34]:
# 資料擷取
df2 = df[(df['Species'] == 'setosa')]

In [37]:
df3 = df.iloc[rds]

In [42]:
df4 = pd.concat([df2, df3], axis = 0)

In [43]:
df4.shape

(130, 5)