In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score,recall_score
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.svm import SVC
import seaborn as sns
from scipy.stats import kurtosis, skew
import numpy as np
# import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'svg'
%matplotlib qt5

In [3]:
cancer_data = load_breast_cancer()
Scaler = MinMaxScaler()
X = cancer_data.data
y = cancer_data.target
X = Scaler.fit_transform(X) 


X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)

print(cancer_data["target_names"])

['malignant' 'benign']


In [4]:
def box_plot_filter(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    filter = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))
    df[filter] = np.nan
    # print(df_i.isnull().sum())
    df.interpolate(method='nearest', inplace=True)
    df.fillna(method='bfill', inplace=True)
    df.fillna(method='ffill', inplace=True)
    return df

In [5]:
dfs = []
for i in range(2):
    mask = (y == i)
    x_i = X[mask]
    # print(x_i)

    df_i = pd.DataFrame(x_i, columns=cancer_data.feature_names)
    plt.figure(figsize=(10, 4))
    plt.boxplot(df_i.values, labels=cancer_data.feature_names)
    plt.show()
    plt.xticks([])
    
    df_i = box_plot_filter(df_i)
    df_i = box_plot_filter(df_i)   
    plt.figure(figsize=(10, 4))
    plt.boxplot(df_i, labels=cancer_data.feature_names)
    plt.show()
    plt.xticks([])
    dfs.append(df_i)

df = pd.concat(dfs, axis=0)
X = np.array(df)
X = Scaler.fit_transform(X)

## knn
### K值选取实验

### 准确率

In [6]:
knn_models = [KNeighborsClassifier(n_neighbors=k) for k in range(1,10)]
knn_fit = [model.fit(X_train,y_train) for model in knn_models]
knn_acc = [model.score(X_test,y_test) for model in knn_models]
plt.figure(figsize=(8,4))
plt.plot(range(1,10),knn_acc,'bo-') 
plt.rcParams['font.size'] = 14
print(knn_acc)

[0.9590643274853801, 0.9415204678362573, 0.9707602339181286, 0.9649122807017544, 0.9649122807017544, 0.9649122807017544, 0.9649122807017544, 0.9590643274853801, 0.9649122807017544]


### 五折交叉验证

In [7]:
knn_cross_val_score = [cross_val_score(model,X_train,y_train).mean() for model in knn_models]
plt.figure(figsize=(8, 4))
plt.plot(range(1, 10), knn_cross_val_score, 'bo-')

[<matplotlib.lines.Line2D at 0x191335c1630>]

### F1 score

In [8]:
knn_pre = [ model.predict(X_test) for model in knn_models]
knn_f1 = [f1_score(y_test,y_pred) for y_pred in knn_pre]
plt.figure(figsize=(8, 4))
plt.plot(range(1, 10), knn_f1, 'bo-')

[<matplotlib.lines.Line2D at 0x191336327d0>]

In [9]:
# 设置为监督学习 对分类结果进行可视化
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d  as Axes3D
import numpy as np
np.random.seed(42)

fig = plt.figure(figsize=(18,18))
# kmeans = KMeans(n_clusters=2)

# print(output)
for i in range(1,10):
    output = knn_models[i-1].predict(X_test)
    # plt.clf()
    ax = fig.add_subplot(3,3,i, projection="3d", elev=48, azim=134)
    # ax.set_position([0, 0, 0.95, 1])
    # plt.cla()
    pca = PCA(n_components=3)
    X_pca = pca.fit_transform(X_test)
    for name, label in [("malignant", 0), ("benign", 1)]:
        ax.text3D(
            X_pca[y_test == label, 0].mean(),
            X_pca[y_test == label, 1].mean() + 1.5,
            X_pca[y_test == label, 2].mean(),
            name,
            horizontalalignment="center",
            bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
            fontsize  = 12
        )
    y = output
    # Reorder the labels to have colors matching the cluster results
    # y = np.choose(y, [1, 2, 0]).astype(float)
    ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=y,
                edgecolor="k",s = 80,alpha = 0.6)
    # ax.xaxis.set_ticklabels([])
    # ax.yaxis.set_ticklabels([])
    # ax.zaxis.set_ticklabels([]) 
    ax.set_xlabel('Feature1')
    ax.set_ylabel('Feature2')
    ax.set_zlabel('Feature3')
    
    ax.view_init(elev=18, azim=107)

plt.tight_layout()
plt.show()




## 决策树

In [10]:

tree_clf = DecisionTreeClassifier(random_state=42)

# 定义参数网格
param_grid = {
    'max_depth': [None, 2, 3, 4, 5],  # 预剪枝参数
    'ccp_alpha': [0.0, 0.01, 0.02, 0.03, 0.04]   # 后剪枝参数
}
grid_search = GridSearchCV(tree_clf,param_grid,cv = 5,scoring='accuracy')
grid_search.fit(X_train,y_train)

In [11]:
results = grid_search.cv_results_
mean_test_scores = results['mean_test_score']

params_grid = list(param_grid.values())
grid_shape = [len(param) for param in params_grid]

scores_matrix = mean_test_scores.reshape(grid_shape)
plt.figure(figsize=(12, 6))
plt.title('Grid Search Scores')


param_names = list(param_grid.keys())
plt.xlabel(param_names[0])
plt.ylabel(param_names[1])

sns.heatmap(scores_matrix, cmap='PuRd')
# plt.colorbar()


# plt.xticks(params_grid[0])
# plt.yticks( params_grid[1])

plt.show()

In [12]:
print(grid_search.best_params_)

{'ccp_alpha': 0.01, 'max_depth': 4}


In [13]:
best_tree_clf = grid_search.best_estimator_
y_pre = best_tree_clf.predict(X_test)
acc = accuracy_score(y_test, y_pre)
print(f"accuracy_score is {acc}")
recall = recall_score(y_test, y_pre)
print(f"recall_score is {recall}")
f1 = f1_score(y_test, y_pre)
print(f"f1_score is {f1}")  

accuracy_score is 0.9649122807017544
recall_score is 0.9722222222222222
f1_score is 0.9722222222222222


In [14]:
plt.figure(figsize=(20, 10))
plot_tree(best_tree_clf, filled=True,  rounded=True,
          feature_names=list(cancer_data["feature_names"]),class_names=list(cancer_data["target_names"]))
plt.show()

## 逻辑回归

In [15]:
from sklearn.linear_model import LogisticRegression
log_res = LogisticRegression(random_state=42)
log_res.fit(X_train, y_train)

y_pre = log_res.predict(X_test)
acc = accuracy_score(y_test,y_pre)
print(f"accuracy_score is {acc}")

recall = recall_score(y_test,y_pre)
print(f"recall_score is {recall}")

f1 = f1_score(y_test, y_pre)
print(f"f1_score is {f1}")

accuracy_score is 0.9766081871345029
recall_score is 1.0
f1_score is 0.9818181818181818


## SVM

In [16]:
# RBF核
svm_clf = SVC(kernel='rbf', gamma='auto', C=1)
svm_clf.fit(X_train, y_train)

y_pre_rbf = svm_clf.predict(X_test)
acc = accuracy_score(y_test, y_pre_rbf)
print(f"accuracy_score is {acc}")

recall = recall_score(y_test, y_pre_rbf)
print(f"recall_score is {recall}")
f1 = f1_score(y_test, y_pre_rbf)
print(f"f1_score is {f1}")

accuracy_score is 0.9590643274853801
recall_score is 1.0
f1_score is 0.968609865470852


In [17]:
# 无核
svm_clf_none = SVC(kernel='linear')
svm_clf_none.fit(X_train, y_train)

y_pre_none = svm_clf_none.predict(X_test)
acc = accuracy_score(y_test, y_pre_none)
print(f"accuracy_score is {acc}")

recall = recall_score(y_test, y_pre_none)
print(f"recall_score is {recall}")

f1 = f1_score(y_test, y_pre_none)
print(f"f1_score is {f1}")

accuracy_score is 0.9883040935672515
recall_score is 1.0
f1_score is 0.9908256880733944


******

### 鸢尾花数据集

- 数据预处理
- 剔除异常数据
- 归一化

In [18]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14
np.random.seed(20)
iris_data = load_iris()

X = iris_data.data
y = iris_data.target

dfs = []
for i in range(3):
    mask = (y == i)
    x_i = X[mask]
    # print(x_i)

    df_i = pd.DataFrame(x_i, columns=iris_data.feature_names)
    plt.figure(figsize=(10, 4))
    plt.boxplot(df_i.values, labels=iris_data.feature_names)
    plt.show()
    Q1 = df_i.quantile(0.25)
    Q3 = df_i.quantile(0.75)
    IQR = Q3 - Q1
    # 定义过滤器
    filter = (df_i < (Q1 - 1.5 * IQR)) | (df_i > (Q3 + 1.5 * IQR))
    # print(df_i[filter])
    df_i[filter] = np.nan

    print(df_i)
    df_i.interpolate(method='nearest', inplace=True)
    plt.figure(figsize=(10, 4))
    plt.boxplot(df_i.values, labels=iris_data.feature_names)
    plt.show()
    dfs.append(df_i)

df = pd.concat(dfs, axis=0) 
X = np.array(df)
    
print(X)


    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                 5.1               3.5                1.4               0.2
1                 4.9               3.0                1.4               0.2
2                 4.7               3.2                1.3               0.2
3                 4.6               3.1                1.5               0.2
4                 5.0               3.6                1.4               0.2
5                 5.4               3.9                1.7               0.4
6                 4.6               3.4                1.4               0.3
7                 5.0               3.4                1.5               0.2
8                 4.4               2.9                1.4               0.2
9                 4.9               3.1                1.5               0.1
10                5.4               3.7                1.5               0.2
11                4.8               3.4                1.6               0.2

In [19]:
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=200)



In [20]:
knn_models = [KNeighborsClassifier(n_neighbors=k) for k in range(1, 10)]
knn_fit = [model.fit(X_train, y_train) for model in knn_models]
knn_acc = [model.score(X_test, y_test) for model in knn_models]

In [21]:
knn_cross_val_score = [cross_val_score(
    model, X_train, y_train).mean() for model in knn_models]
plt.figure(figsize=(8, 4))
plt.plot(range(1, 10), knn_cross_val_score, 'bo-')

[<matplotlib.lines.Line2D at 0x191335f7190>]

### 选定k值为3

In [22]:
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
output = knn_clf.predict(X_test)
acc = accuracy_score(y_test, output)
recall = recall_score(y_test, output, average='macro')
f1 = f1_score(y_test, output, average='macro')


print(f"accuracy_score is {acc}")
print(f"recall_score is {recall}")
print(f"f1_score is {f1}")

accuracy_score is 1.0
recall_score is 1.0
f1_score is 1.0


In [23]:
# 设置为监督学习 对分类结果进行可视化
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d as Axes3D
import numpy as np
np.random.seed(42)

fig = plt.figure(figsize=(5, 5))
# kmeans = KMeans(n_clusters=2)


# plt.clf()
ax = fig.add_subplot(1, 1, 1, projection="3d", elev=48, azim=134)
# ax.set_position([0, 0, 0.95, 1])
# plt.cla()
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_test)
for name, label in [("setosa", 0), ("versicolor", 1),( "virginica", 2)]:
    ax.text3D(
        X_pca[y_test == label, 0].mean(),
        X_pca[y_test == label, 1].mean() + 1.5,
        X_pca[y_test == label, 2].mean(),
        name,
        horizontalalignment="center",
        bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
        fontsize=12
    )
    y = output
        
    ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=y,
                edgecolor="k", s=80, alpha=0.6)

    ax.set_xlabel('Feature1')
    ax.set_ylabel('Feature2')
    ax.set_zlabel('Feature3')

    ax.view_init(elev=18, azim=107)

    plt.tight_layout()
    plt.show()

### 相关评价指标

In [24]:
acc = accuracy_score(y_test, output)
recall = recall_score(y_test, output, average='macro')
f1 = f1_score(y_test, output, average='macro')
print(f"accuracy_score is {acc}")
print(f"recall_score is {recall}")
print(f"f1_score is {f1}")

accuracy_score is 1.0
recall_score is 1.0
f1_score is 1.0


In [25]:

tree_clf = DecisionTreeClassifier(random_state=42)

# 定义参数网格
param_grid = {
    'max_depth': [None, 2, 3, 4, 5],  # 预剪枝参数
    'ccp_alpha': [0.0, 0.01, 0.02, 0.03, 0.04]  # 后剪枝参数
}
grid_search = GridSearchCV(tree_clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [26]:
print(grid_search.best_params_)

{'ccp_alpha': 0.0, 'max_depth': None}


In [27]:
best_tree_clf = grid_search.best_estimator_
output = best_tree_clf.predict(X_test)
plt.figure(figsize=(20, 10))
plot_tree(best_tree_clf, filled=True,  rounded=True,
          feature_names=list(iris_data["feature_names"]), class_names=list(iris_data["target_names"]))
plt.show()

In [28]:
acc = accuracy_score(y_test, output)
recall = recall_score(y_test, output, average='macro')
f1 = f1_score(y_test, output, average='macro')
print(f"accuracy_score is {acc}")
print(f"recall_score is {recall}")
print(f"f1_score is {f1}")

accuracy_score is 1.0
recall_score is 1.0
f1_score is 1.0


### 逻辑回归

In [29]:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris




param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

grid_search = GridSearchCV(LogisticRegression(
    random_state=42), param_grid, cv=5, verbose=3, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
best_log_res = grid_search.best_estimator_




Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Best Parameters:  {'C': 0.23357214690901212, 'penalty': 'l1', 'solver': 'saga'}


In [30]:

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_test)
model = LogisticRegression(random_state=42)
model.fit(X_pca, y_test)

conf_mat = confusion_matrix(y_test, output)


x_min, x_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
y_min, y_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)


plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, levels=np.arange(
    len(np.unique(y)) + 1) - 0.5, cmap='rainbow')
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='rainbow', edgecolor='k')
plt.title('Logistic Regression Decision Boundaries on Iris Dataset')
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.show()

In [31]:
output = best_log_res.predict(X_test)
acc = accuracy_score(y_test, output)
recall = recall_score(y_test, output, average='macro')
f1 = f1_score(y_test, output, average='macro')

print(f"accuracy_score is {acc}")
print(f"recall_score is {recall}")
print(f"f1_score is {f1}")

accuracy_score is 1.0
recall_score is 1.0
f1_score is 1.0


### SVM

In [32]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV


param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}


grid_search = GridSearchCV(svm.SVC(), param_grid,
                           refit=True, verbose=3, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)


best_svm = grid_search.best_estimator_

output = best_svm.predict(X_test)

acc = accuracy_score(y_test, output)
recall = recall_score(y_test, output, average='macro')
f1 = f1_score(y_test, output, average='macro')

print(f"accuracy_score is {acc}")
print(f"recall_score is {recall}")
print(f"f1_score is {f1}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters:  {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
accuracy_score is 1.0
recall_score is 1.0
f1_score is 1.0


In [33]:
from sklearn.svm import SVC


svm_model = SVC(kernel='rbf', C=1.0,gamma=1,)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)



model = SVC(kernel='linear', C=1.0)
model.fit(X_pca, y_test)


conf_mat_svm = confusion_matrix(y_test, y_pred_svm)

plt.figure(figsize=(10, 6))

Z_svm = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z_svm = Z_svm.reshape(xx.shape[0], xx.shape[1], 3)  
contour = plt.contour(xx, yy, Z_svm[:, :, 1], levels=[
                      0], linewidths=2, colors='k')
plt.contourf(xx, yy, Z_svm[:, :, 1], levels=np.linspace(
    Z_svm[:, :, 1].min(), Z_svm[:, :, 1].max(), 7), alpha=0.3)

plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_test, cmap='rainbow', edgecolor='k')

plt.title('SVM Decision Boundaries with Support Vectors on Iris Dataset')
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.show()

conf_mat_svm

array([[18,  0,  0],
       [ 0, 16,  0],
       [ 0,  0, 11]], dtype=int64)

In [34]:

print(f"accuracy_score is {acc}")
print(f"recall_score is {recall}")
print(f"f1_score is {f1}")


accuracy_score is 1.0
recall_score is 1.0
f1_score is 1.0
