# 一、导入数据

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the workbook from the relative path; column 0 is used as the row index.
data = pd.read_excel("input/data_.xlsx",index_col=0)

In [None]:
# Keep `data` as loaded and do all further work on a copy.
data_ = data.copy()
data_

In [None]:
# Summary of the copied frame (pandas prints dtypes and non-null counts).
data_.info()

# 二、设置变量

In [None]:
features = data_.drop(columns=['Class','Thickness_class','Cell Size'])
features

In [None]:
target = data_.Class.replace(to_replace = [2, 4], value = [0,1])
target

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
transfer = StandardScaler()

In [None]:
features_ = transfer.fit_transform(features)

In [None]:
features_

# 三、拆分数据集

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 145 rows (an absolute count, not a fraction) as the final test set;
# random_state pins the shuffle so the split is repeatable.
train_features, test_features, train_target, test_target = train_test_split(
    features, target,
    test_size=145, random_state=99)

# Standardise the predictors that the models actually see. Previously the
# scaled matrix was computed but the unscaled frame was split and modelled,
# which hurts scale-sensitive estimators (RBF SVC, L1-penalised logistic
# regression). The scaler is fitted on the training rows only, so no
# test-set statistics leak into training; the same transform is then applied
# to the held-out rows. Results are wrapped back into DataFrames so index
# and column labels survive.
split_scaler = StandardScaler().fit(train_features)
train_features = pd.DataFrame(
    split_scaler.transform(train_features),
    index=train_features.index, columns=train_features.columns)
test_features = pd.DataFrame(
    split_scaler.transform(test_features),
    index=test_features.index, columns=test_features.columns)

In [None]:
# Sanity check: one shape per line, in the order
# train X, train y, test X, test y.
print(
    train_features.shape,
    train_target.shape,
    test_features.shape,
    test_target.shape,
    sep='\n',
)

# 四、交叉验证

In [None]:
from sklearn.model_selection import KFold,ShuffleSplit,cross_val_score

In [None]:
# 10-fold splitter with default settings (no shuffle requested).
# NOTE(review): `kf` is not referenced by any later visible cell; the
# cross-validation and search calls all pass `ss` instead.
kf = KFold(n_splits=10)

In [None]:
# Ten random resamples, each holding out 20% for validation; seeded so the
# same resamples are reused by every cross_val_score / search call.
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=99)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline classifiers, otherwise left at library defaults.
log_clf = LogisticRegression()
svc_clf = SVC()
# Seeded: a decision tree breaks ties between equally good splits at random,
# so without a fixed seed its cross-validation scores change between runs.
# 99 matches the random_state used for the train/test split and ShuffleSplit.
tree_clf = DecisionTreeClassifier(random_state=99)

In [None]:
log_score = cross_val_score(log_clf, train_features, train_target, cv=ss)
log_score

In [None]:
svc_score = cross_val_score(svc_clf, train_features, train_target, cv=ss)
svc_score

In [None]:
tree_score = cross_val_score(tree_clf, train_features, train_target, cv=ss)
tree_score

In [None]:
print(log_score.mean())
print(svc_score.mean())
print(tree_score.mean())

# 五、网格搜索

In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [None]:
param_grid = {'C': [0.1, 1, 10],'gamma': [0.1, 0.5, 1, 5]}

In [None]:
grid_research = GridSearchCV(svc_clf, param_grid, cv=ss)

In [None]:
grid_research.fit(train_features, train_target)

In [None]:
grid_research.best_params_

In [None]:
grid_research.best_score_

In [None]:
grid_research.best_estimator_

In [None]:
param_distributions={'C': np.logspace(-3,3,200),'gamma': np.logspace(-3,3,200), 'kernel': ['linear', 'rbf']}

In [None]:
random_search =RandomizedSearchCV(svc_clf, param_distributions, cv=ss, n_iter=100)

In [None]:
random_search.fit(train_features, train_target)

In [None]:
random_search.best_params_

In [None]:
random_search.best_score_

In [None]:
random_search.best_estimator_

In [None]:
random_search.best_estimator_.score(test_features, test_target)

# 六、知识拓展

In [None]:
Cs = np.logspace(-3,2,200)

In [None]:
acc_=[]
coef_ = []
for C_ in Cs:
    log_l1 = LogisticRegression(penalty='l1',C = C_, solver='liblinear')
    log_l1.fit(train_features, train_target)
    coef_.append(log_l1.coef_)
    score = cross_val_score(log_l1, train_features, train_target, cv=ss)
    acc_.append(score.mean())

In [None]:
import matplotlib.pyplot as plt
# Mean cross-validated accuracy against C, with C on a logarithmic axis.
plt.semilogx(Cs, acc_)
plt.xlabel('C')
plt.ylabel('acc')
plt.show()

In [None]:
# Stack the per-C coefficient arrays into one row per C value. The row count
# comes from Cs and the column count is inferred, so the cell keeps working
# if the grid length or the number of features changes (it was previously
# hard-coded to 200 x 13).
coef_ = np.asarray(coef_).reshape(len(Cs), -1)
# One line per feature: coefficient value along the regularisation path.
plt.plot(Cs, coef_)
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('Coefficients')
plt.show()

In [None]:
np.argmax(acc_)

In [None]:
Cs[158]