In [161]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

In [162]:
data = pd.read_excel('ЛР2. Исходные данные.xlsx')
n = len(data.index) # n - кол-во объектов
m = len(data.columns) # m - кол-во признаков
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

data.head()

Unnamed: 0,х1,х2,х3,х4,х5,х6,x7,x8,x9,Ybin
0,7.0,15.9,8.2,5.1,13.8,229,172,20.0,2104,1
1,7.2,18.2,7.4,6.1,14.3,146,167,29.1,2489,0
2,7.9,19.7,6.4,4.7,19.8,174,144,22.8,2428,1
3,7.7,20.8,6.9,5.2,17.1,128,111,42.7,2494,0
4,9.2,15.9,7.8,5.3,16.7,169,148,22.7,2094,1


In [163]:
# Разделение данных для kNN

knn_X_train, knn_X_test, knn_Y_train, knn_Y_test = train_test_split(X, Y, test_size=0.15, random_state=23, stratify=Y)
knn_train_len = len(knn_X_train)
knn_test_len = len(knn_X_test)

knn_x0_test = np.ones((knn_test_len,1)) # доп. единчиный вектор-столбец
knn_X_test.insert(0, "x0", knn_x0_test)
knn_test_len += 1

In [164]:
# Нормирование данных для kNN (Z-нормализация)

knn_mean = np.mean(knn_X_train, axis=0)
knn_stddev = np.std(knn_X_train, axis=0)
knn_norm_X_train = (knn_X_train - knn_mean) / knn_stddev

knn_x0_train = np.ones((knn_train_len,1)) # доп. единчиный вектор-столбец
knn_norm_X_train.insert(0, "x0", knn_x0_train)
knn_train_len += 1

knn_norm_X_train.head()

Unnamed: 0,x0,х1,х2,х3,х4,х5,х6,x7,x8,x9
43,1.0,0.049,-0.224538,0.083263,0.128575,0.219011,1.2795,3.416575,-0.698988,0.729028
41,1.0,4.39914,-0.612527,-2.036476,-2.487061,2.248021,-1.310799,-1.498338,3.619061,1.186235
22,1.0,-0.462781,0.085853,0.24632,0.128575,-1.216142,0.352109,1.173358,-0.963358,-0.670398
30,1.0,0.134297,0.279847,2.692173,0.308964,0.070547,0.144246,-0.011262,0.023625,-0.586988
6,1.0,-1.01721,1.443815,-1.384249,0.218769,0.169523,-0.447366,-0.691788,0.138185,0.057118


In [165]:
# модель kNN

knn = KNeighborsClassifier()
knn_param_grid = {'n_neighbors': np.arange(1,17, step = 2)} # параметр числа соседей
knn_model = GridSearchCV(knn, knn_param_grid)
knn_model.fit(knn_norm_X_train, knn_Y_train)

print("Наилучший параметр k:", knn_model.best_params_['n_neighbors'])

knn_Y_pred = knn_model.predict(knn_X_test)

knn_Y_proba = knn_model.predict_proba(knn_X_test)[:, 1]
knn_threshold = np.mean(knn_Y_proba) # порог отсечения
knn_Y_pred_threshold = (knn_Y_proba >= knn_threshold).astype(int)

print('Accuracy = ', accuracy_score(knn_Y_test, knn_Y_pred))
print('ROC-AUC = ', roc_auc_score(knn_Y_test, knn_Y_pred_threshold))
print('Precision = ', precision_score(knn_Y_test, knn_Y_pred_threshold))
print('Recall = ', recall_score(knn_Y_test, knn_Y_pred_threshold))
print('F1-мера = ', f1_score(knn_Y_test, knn_Y_pred_threshold))


Наилучший параметр k: 7
Accuracy =  0.4444444444444444
ROC-AUC =  0.33333333333333337
Precision =  0.0
Recall =  0.0
F1-мера =  0.0


In [166]:
# Разделение данных для логистической регрессии

log_reg_X_train, log_reg_X_test, log_reg_Y_train, log_reg_Y_test = train_test_split(X, Y, test_size=0.15, random_state=13, stratify=Y)
log_reg_train_len = len(log_reg_X_train)
log_reg_test_len = len(log_reg_X_test)

log_reg_x0_test = np.ones((log_reg_test_len,1)) # доп. единчиный вектор-столбец
log_reg_X_test.insert(0, "x0", log_reg_x0_test)
log_reg_test_len += 1

In [167]:
# Нормирование данных для логистической регрессии (Z-нормализация)

log_reg_mean = np.mean(log_reg_X_train, axis=0)
log_reg_stddev = np.std(log_reg_X_train, axis=0)
log_reg_norm_X_train = (log_reg_X_train - log_reg_mean) / log_reg_stddev

log_reg_x0_train = np.ones((log_reg_train_len,1)) # доп. единчиный вектор-столбец
log_reg_norm_X_train.insert(0, "x0", log_reg_x0_train)
log_reg_train_len += 1

log_reg_norm_X_train.head()

Unnamed: 0,x0,х1,х2,х3,х4,х5,х6,x7,x8,x9
20,1.0,-0.265347,0.081304,0.591916,0.410374,0.158869,-0.31418,0.191031,0.008099,-0.68964
11,1.0,-0.400537,0.567646,0.758503,-0.174098,0.108246,0.843662,0.219808,-0.842312,-0.896925
26,1.0,4.026943,-2.385149,-0.407607,-2.762473,-0.220801,-1.186527,-2.139916,0.680329,-1.871928
39,1.0,-0.299144,0.150781,-0.24102,0.07639,-0.473914,0.161646,0.824128,-0.421156,0.145638
46,1.0,1.830102,-1.586157,1.424851,0.07639,0.260114,0.145785,1.370893,-0.315867,-0.628223


In [168]:
# модель логистической регрессии

log_reg = LogisticRegression()
log_reg_param_grid = {'C': [x for x in range(1,100, 5)]} # параметр регуляризации
log_reg_model = GridSearchCV(log_reg, log_reg_param_grid)
log_reg_model.fit(log_reg_norm_X_train, log_reg_Y_train)

print("Наилучший параметр С:", log_reg_model.best_params_['C'])

log_reg_Y_pred = log_reg_model.predict(log_reg_X_test)

log_reg_Y_proba = log_reg_model.predict_proba(log_reg_X_test)[:, 1]
log_reg_threshold = np.mean(log_reg_Y_proba)
log_reg_Y_pred_threshold = (log_reg_Y_proba >= log_reg_threshold).astype(int)

print('\nAccuracy = ', accuracy_score(log_reg_Y_test, log_reg_Y_pred))
print('ROC-AUC = ', roc_auc_score(log_reg_Y_test, log_reg_Y_pred_threshold))
print('Precision = ', precision_score(log_reg_Y_test, log_reg_Y_pred_threshold))
print('Recall = ', recall_score(log_reg_Y_test, log_reg_Y_pred_threshold))
print('F1-мера = ', f1_score(log_reg_Y_test, log_reg_Y_pred_threshold))

print('\n', log_reg_model.best_estimator_.coef_)

Наилучший параметр С: 26

Accuracy =  0.6666666666666666
ROC-AUC =  0.6666666666666667
Precision =  0.5
Recall =  0.6666666666666666
F1-мера =  0.5714285714285715

 [[-1.78969472e-04 -1.00306085e-01 -3.20993967e-01  1.06767214e+00
  -2.87137629e+00  9.00727762e-01 -8.76998698e-01 -1.22251097e+00
  -2.04709319e+00  2.32002750e-01]]


In [169]:
# Разделение данных для дерева решений

dec_tree_X_train, dec_tree_X_test, dec_tree_Y_train, dec_tree_Y_test = train_test_split(X, Y, test_size=0.15, random_state=32, stratify=Y)
dec_tree_train_len = len(dec_tree_X_train)
dec_tree_test_len = len(dec_tree_X_test)

dec_tree_x0_test = np.ones((dec_tree_test_len,1)) # доп. единчиный вектор-столбец
dec_tree_X_test.insert(0, "x0", dec_tree_x0_test)
dec_tree_test_len += 1

In [170]:
# Нормирование данных для дерева решений (Z-нормализация)

dec_tree_mean = np.mean(dec_tree_X_train, axis=0)
dec_tree_stddev = np.std(dec_tree_X_train, axis=0)
dec_tree_norm_X_train = (dec_tree_X_train - dec_tree_mean) / dec_tree_stddev

dec_tree_x0_train = np.ones((dec_tree_train_len,1)) # доп. единчиный вектор-столбец
dec_tree_norm_X_train.insert(0, "x0", dec_tree_x0_train)
dec_tree_train_len += 1

dec_tree_norm_X_train.head()

Unnamed: 0,x0,х1,х2,х3,х4,х5,х6,x7,x8,x9
45,1.0,0.846984,-0.654564,-0.410083,-0.518827,0.467819,-1.01856,-1.102506,2.881407,1.255017
53,1.0,-0.547914,-1.334244,0.110833,2.026393,-1.221557,0.613631,0.722921,-0.611363,1.672282
11,1.0,-0.404848,0.490162,0.805388,-0.190411,-0.018517,0.803789,0.090106,-0.769747,-0.913589
18,1.0,0.203185,-0.618791,-0.236444,-0.60093,-0.120903,0.486859,1.647804,-0.819762,-0.512485
7,1.0,-0.690981,0.633252,-0.583722,0.302212,-0.351272,0.534398,-0.055928,-0.444644,-0.453715


In [171]:
# модель дерева решений
dec_tree = DecisionTreeClassifier()
dec_tree_param_grid = {'max_depth': [_ for _ in range(1, 15, 2)]} # глубина дерева
dec_tree_model = GridSearchCV(dec_tree, dec_tree_param_grid)
dec_tree_model.fit(dec_tree_norm_X_train, dec_tree_Y_train)

print("Наилучший параметр max_depth:", dec_tree_model.best_params_['max_depth'])

dec_tree_Y_pred = dec_tree_model.predict(dec_tree_X_test)

dec_tree_Y_proba = dec_tree_model.predict_proba(dec_tree_X_test)[:, 1]
dec_tree_threshold = np.mean(dec_tree_Y_proba)
dec_tree_Y_pred_threshold = (dec_tree_Y_proba >= dec_tree_threshold).astype(int)

print('Accuracy = ', accuracy_score(dec_tree_Y_test, dec_tree_Y_pred))
print('ROC-AUC = ', roc_auc_score(dec_tree_Y_test, dec_tree_Y_pred_threshold))
print('Precision = ', precision_score(dec_tree_Y_test, dec_tree_Y_pred_threshold))
print('Recall = ', recall_score(dec_tree_Y_test, dec_tree_Y_pred_threshold))
print('F1-мера = ', f1_score(dec_tree_Y_test, dec_tree_Y_pred_threshold))

print('\n', dec_tree_model.best_estimator_.feature_importances_)

Наилучший параметр max_depth: 1
Accuracy =  0.3333333333333333
ROC-AUC =  0.5
Precision =  0.3333333333333333
Recall =  1.0
F1-мера =  0.5

 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
