In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn import preprocessing as prproc
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import svm
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [2]:
columns = ["Preg", "Plas", "Pres", "Skin", "Insu", "Mass", "Pedi", "Age", "Class"]
data = pd.read_csv('pima.dat', sep = ",", header=None, skiprows=13, names = columns)
data.head()

Unnamed: 0,Preg,Plas,Pres,Skin,Insu,Mass,Pedi,Age,Class
0,6,148,72,35,0,33.6,0.627,50,positive
1,1,85,66,29,0,26.6,0.351,31,negative
2,8,183,64,0,0,23.3,0.672,32,positive
3,1,89,66,23,94,28.1,0.167,21,negative
4,0,137,40,35,168,43.1,2.288,33,positive


In [3]:
scalerStd = prproc.StandardScaler()
for i in data.columns[:-1]:
    x = scalerStd.fit_transform(data[[i]])
    data[i] = x[0:]
data.head()

Unnamed: 0,Preg,Plas,Pres,Skin,Insu,Mass,Pedi,Age,Class
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995,positive
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672,negative
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584,positive
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549,negative
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496,positive


In [5]:
data_train, data_test_val = train_test_split(data, test_size=0.50, stratify = data['Class'])
data_test, data_val = train_test_split(data_test_val, test_size=0.40, stratify = data_test_val['Class'])

data_train = data_train.reset_index()
data_test = data_test.reset_index()
data_val = data_val.reset_index()

x_train = data_train[["Preg", "Plas", "Pres", "Skin", "Insu", "Mass", "Pedi", "Age"]]
y_train = data_train['Class']

x_test = data_test[["Preg", "Plas", "Pres", "Skin", "Insu", "Mass", "Pedi", "Age"]]
y_test = data_test['Class']

x_val = data_val[["Preg", "Plas", "Pres", "Skin", "Insu", "Mass", "Pedi", "Age"]]
y_val = data_val['Class']

In [6]:
# Нахождение лучших параметров для метода векторов
gammas = ['scale', 'auto', 1, 10, 0.1, 20, 0.01]
coef0s = [0, 1, 2, 5]
degrees = [2, 3, 4, 5]
Cs = [0.001, 0.01, 0.1, 0.5, 1]

best = [0]
params_scv=0
for krl in ['linear', 'sigmoid', 'rbf']:
    for gamma in gammas:
        for coef0 in coef0s:
            for degree in degrees:
                for C in Cs:
                    clf = svm.SVC(kernel = krl, gamma = gamma, coef0 = coef0, degree = degree, C = C, probability=True)

                    clf.fit(x_train, y_train)
                    
                    roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:,1], multi_class='ovr', average='macro')

                    if roc_auc > best[0]:
                        best = [roc_auc, gamma, coef0, degree, C]
                        params_scv = clf.get_params()
    print(krl, best)

linear [0.8035, 'auto', 5, 2, 0.5]
sigmoid [0.8035, 'auto', 5, 2, 0.5]
rbf [0.8065833333333333, 0.1, 5, 4, 1]


In [7]:
# Нахождение лучших параметров для дерева
criterions = ['gini', 'entropy']
alphas = [0.005, 0.01, 0.015, 0.02, 0.25, 0.03, 0.035, 0.2, 0.8]

best = [0]
param_tree = 0
for criterion in criterions:
    for alpha in alphas:
        clf = tree.DecisionTreeClassifier(criterion = criterion, ccp_alpha = alpha)
        clf.fit(x_train, y_train)
        roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:,1], multi_class='ovr', average='macro')
                    
        if roc_auc > best[0]:
            best = [roc_auc, criterion, alpha]
            param_tree = clf.get_params()

print(best)

[0.7799166666666666, 'entropy', 0.015]


In [9]:
# Нахождение лучших параметров для случайного леса
criterions = ['gini', 'entropy']
array_max_features = [2, 3, 4, 5, 6]
array_number_of_trees = [100, 200, 300, 400, 500, 600, 700, 800, 900]
best = [0]
param_random_forest = 0
for crit in criterions:
    for max_features in array_max_features:
        for number_of_trees in array_number_of_trees:
            clf = RandomForestClassifier(n_estimators = number_of_trees, max_features = max_features, criterion = crit)
            clf.fit(x_train, y_train)
            roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:,1], multi_class='ovr', average='macro')
            
            if roc_auc > best[0]:
                best = [roc_auc, number_of_trees, max_features, crit]
                param_random_forest = clf.get_params()
print(best)

[0.8037916666666667, 200, 2, 'entropy']


In [13]:
# обогащение обучающей выборки
table_stat = {}
for i in range(0, data_train['Class'].size):
    if data_train['Class'][i] in table_stat:
        table_stat[data_train['Class'][i]].append(data_train.iloc[i])
    else:
        table_stat[data_train['Class'][i]] = []
        table_stat[data_train['Class'][i]].append(data_train.iloc[i])
maximum = 0
for j in table_stat:
    if len(table_stat[j]) > maximum:
        maximum = len(table_stat[j])

random.seed(22222)

for cl in table_stat:
    while len(table_stat[cl]) < maximum:
        i = random.randint(0, len(table_stat[cl]) - 1)
        table_stat[cl].append(table_stat[cl][i])
        data_train = data_train.append(table_stat[cl][i])
        
x_train_enrichment = data_train[["Preg", "Plas", "Pres", "Skin", "Insu", "Mass", "Pedi", "Age"]]
y_train_enrichment = data_train['Class']


In [14]:
# Нахождение лучших параметров для метода векторов после обогащения
gammas = ['scale', 'auto', 1, 10, 0.1, 20, 0.01]
coef0s = [0, 1, 2, 5]
degrees = [2, 3, 4, 5]
Cs = [0.001, 0.01, 0.1, 0.5, 1]

best = [0]
params_scv_enrichment = 0
for krl in ['linear', 'sigmoid', 'rbf']:
    for gamma in gammas:
        for coef0 in coef0s:
            for degree in degrees:
                for C in Cs:
                    clf = svm.SVC(kernel = krl, gamma = gamma, coef0 = coef0, degree = degree, C = C, probability=True)

                    clf.fit(x_train_enrichment, y_train_enrichment)
                    
                    roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:,1], multi_class='ovr', average='macro')

                    if roc_auc > best[0]:
                        best = [roc_auc, gamma, coef0, degree, C]
                        params_scv_enrichment = clf.get_params()
    print(krl, best)

linear [0.8045416666666666, 0.01, 0, 5, 0.01]
sigmoid [0.8047500000000001, 0.01, 0, 2, 1]
rbf [0.8178333333333334, 0.1, 0, 2, 1]


In [15]:
# Нахождение лучших параметров для дерева после обогащения
criterions = ['gini', 'entropy']
alphas = [0.005, 0.01, 0.015, 0.02, 0.25, 0.03, 0.035, 0.2, 0.8]

best = [0]
param_tree_enrichment = 0
for criterion in criterions:
    for alpha in alphas:
        clf = tree.DecisionTreeClassifier(criterion = criterion, ccp_alpha = alpha)
        clf.fit(x_train_enrichment, y_train_enrichment)
        roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:,1], multi_class='ovr', average='macro')
                    
        if roc_auc > best[0]:
            best = [roc_auc, criterion, alpha]
            param_tree_enrichment = clf.get_params()

print(best)

[0.7781250000000001, 'entropy', 0.015]


In [16]:
# Нахождение лучших параметров для случайного леса после обогащения
criterions = ['gini', 'entropy']
array_max_features = [2, 3, 4, 5, 6]
array_number_of_trees = [100, 200, 300, 400, 500, 600, 700, 800, 900]
best = [0]
param_random_forest_enrichment = 0
for crit in criterions:
    for max_features in array_max_features:
        for number_of_trees in array_number_of_trees:
            clf = RandomForestClassifier(n_estimators = number_of_trees, max_features = max_features, criterion = crit)
            clf.fit(x_train_enrichment, y_train_enrichment)
            roc_auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:,1], multi_class='ovr', average='macro')
            
            if roc_auc > best[0]:
                best = [roc_auc, number_of_trees, max_features, crit]
                param_random_forest_enrichment = clf.get_params()
print(best)

[0.794625, 600, 2, 'entropy']


In [17]:
# Создание 3 классификаторов с лучшими параметрами для рассмотрения с валидационной выборкой
clf1 = svm.SVC(**params_scv)
clf1.fit(x_train, y_train)
clf2 = tree.DecisionTreeClassifier(**param_tree)
clf2.fit(x_train, y_train)
clf3 = RandomForestClassifier(**param_random_forest)
clf3.fit(x_train, y_train)

# Создание 3 классификаторов после обогащения с лучшими параметрами для рассмотрения 
# с валидационной выборкой
clf4 = svm.SVC(**params_scv_enrichment)
clf4.fit(x_train_enrichment, y_train_enrichment)
clf5 = tree.DecisionTreeClassifier(**param_tree_enrichment)
clf5.fit(x_train_enrichment, y_train_enrichment)
clf6 = RandomForestClassifier(**param_random_forest_enrichment)
clf6.fit(x_train_enrichment, y_train_enrichment)

# ROC-AUC для всех методов
print("CSV без обогащения:")
print(roc_auc_score(y_val, clf1.predict_proba(x_val)[:,1], multi_class='ovr', average='macro'))
print("Деревья без обогащения:")
print(roc_auc_score(y_val, clf2.predict_proba(x_val)[:,1], multi_class='ovr', average='macro'))
print("Лес без обогащения:")
print(roc_auc_score(y_val, clf3.predict_proba(x_val)[:,1], multi_class='ovr', average='macro'))
print("CSV с обогащением:")
print(roc_auc_score(y_val, clf4.predict_proba(x_val)[:,1], multi_class='ovr', average='macro'))
print("Деревья с обогащением:")
print(roc_auc_score(y_val, clf5.predict_proba(x_val)[:,1], multi_class='ovr', average='macro'))
print("Лес с обогащением:")
print(roc_auc_score(y_val, clf6.predict_proba(x_val)[:,1], multi_class='ovr', average='macro'))

CSV без обогащения:
0.8483333333333334
Деревья без обогащения:
0.7922222222222222
Лес без обогащения:
0.835
CSV с обогащением:
0.8501851851851853
Деревья с обогащением:
0.8039814814814814
Лес с обогащением:
0.8484259259259259
