In [1]:
import numpy as np
import pandas as pd
from random import randint, choice
from collections import Counter

import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC 
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
from sklearn.metrics import auc,roc_auc_score, matthews_corrcoef, f1_score, roc_auc_score,classification_report, confusion_matrix,accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the prepared dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle("data/FTv.pkl")

# Work on a shuffled deep copy so `df` keeps the rows in their stored order.
# The shuffle is seeded: without random_state every Restart & Run All produced
# a different row order and therefore different splits and metrics downstream.
df1 = df.copy(deep=True)
df1 = df1.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# KNN hyperparameter tuning: sweep n_neighbors over 5 shuffled CV folds
# and plot train/test accuracy and test MCC for each fold.

# Columns holding one number per row; each value is cast to int.
numeric_cols = ['weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3']
# Columns appended unchanged after the numeric ones
# (presumably per-row vectors -- TODO confirm against the pickle contents).
vector_cols = ['com', 'diagnos', 'fir', 'sec', 'thr']

# Assemble one flat feature row per sample, then slice rows per fold.
# int() has to be applied element by element: calling it on a multi-row
# Series slice (e.g. int(df1['weeks'][train_index])) raises TypeError.
X_all = np.array([
    np.hstack([int(df1[col][i]) for col in numeric_cols]
              + [df1[col][i] for col in vector_cols])
    for i in range(len(df1))
])
Y_all = np.array(df1['risk_st'])

cv = KFold(n_splits=5, random_state=42, shuffle=True)
classifiers = []

# Named k_values (not `neighbors`) so the `neighbors` module imported from
# sklearn at the top of the notebook is not shadowed.
k_values = np.arange(10, 110, 10)

for train_index, test_index in cv.split(X_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    Y_train, Y_test = Y_all[train_index], Y_all[test_index]

    train_accuracy = np.empty(len(k_values))
    test_accuracy = np.empty(len(k_values))
    test_mcc = np.empty(len(k_values))
    # Loop over the candidate values of K
    for i, k in enumerate(k_values):
        knn = KNeighborsClassifier(n_neighbors=int(k))
        knn.fit(X_train, Y_train)
        y_pred = knn.predict(X_test)

        mcc = matthews_corrcoef(Y_test, y_pred)
        print(mcc)
        test_mcc[i] = mcc
        train_accuracy[i] = knn.score(X_train, Y_train)
        test_accuracy[i] = knn.score(X_test, Y_test)

    # One figure per fold
    plt.plot(k_values, test_accuracy, label = 'Testing dataset Accuracy')
    plt.plot(k_values, train_accuracy, label = 'Training dataset Accuracy')
    plt.plot(k_values, test_mcc, label = 'Matthews_corrcoef')

    plt.legend()
    plt.xlabel('n_neighbors')
    plt.ylabel('Accuracy')
    plt.show()

In [None]:
# Quality check with the selected hyperparameters (KNN)

# Scalar columns are cast to int; the remaining columns are appended as-is.
scalar_cols = ('weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3')
vector_cols = ('com', 'diagnos', 'fir', 'sec', 'thr')

feature_rows, labels = [], []
for i in range(len(df1)):
    scalars = [int(df1[name][i]) for name in scalar_cols]
    vectors = [df1[name][i] for name in vector_cols]
    labels.append(df1['risk_st'][i])
    feature_rows.append(np.hstack(scalars + vectors))
X = np.array(feature_rows)
Y = np.array(labels)

# Single hold-out split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1)

knn = KNeighborsClassifier(n_neighbors=90)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)

# Report MCC, the per-class report and the confusion matrix, in that order
mcc = matthews_corrcoef(Y_test, y_pred)
report = classification_report(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)
for item in (mcc, report, cm):
    print(item)

In [None]:
# SVM hyperparameter tuning: compare SVC(kernel='linear') and LinearSVC
# for several values of C over 5 shuffled CV folds.

# Columns holding one number per row; each value is cast to int.
numeric_cols = ['weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3']
# Columns appended unchanged after the numeric ones
# (presumably per-row vectors -- TODO confirm against the pickle contents).
vector_cols = ['com', 'diagnos', 'fir', 'sec', 'thr']

# Assemble one flat feature row per sample, then slice rows per fold.
# int() has to be applied element by element: calling it on a multi-row
# Series slice raises TypeError.
X_all = np.array([
    np.hstack([int(df1[col][i]) for col in numeric_cols]
              + [df1[col][i] for col in vector_cols])
    for i in range(len(df1))
])
Y_all = np.array(df1['risk_st'])

cv = KFold(n_splits=5, random_state=42, shuffle=True)
classifiers = []

# Candidate regularisation strengths: 0.25, 0.5, 0.75, 1.0
c_values = np.arange(0.25, 1.25, 0.25)

for train_index, test_index in cv.split(X_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    Y_train, Y_test = Y_all[train_index], Y_all[test_index]

    # Iterate over the values themselves; enumerate() would hand an
    # (index, value) tuple to C.
    for c_value in c_values:
        svm_model_linear = SVC(kernel = 'linear', C = float(c_value)).fit(X_train, Y_train)
        svm_predictions = svm_model_linear.predict(X_test)
        mcc = matthews_corrcoef(Y_test, svm_predictions)
        print(mcc)

        # Same fold data as above (the lowercase x_train/y_train/x_test names
        # were never defined).
        svm_model_linear = LinearSVC(C = float(c_value)).fit(X_train, Y_train)
        svm_predictions = svm_model_linear.predict(X_test)
        mcc = matthews_corrcoef(Y_test, svm_predictions)
        print(mcc)
        print()

In [None]:
# Quality check with the selected hyperparameters (linear SVM)

# Scalar columns are cast to int; the remaining columns are appended as-is.
scalar_cols = ('weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3')
vector_cols = ('com', 'diagnos', 'fir', 'sec', 'thr')

feature_rows, labels = [], []
for i in range(len(df1)):
    scalars = [int(df1[name][i]) for name in scalar_cols]
    vectors = [df1[name][i] for name in vector_cols]
    labels.append(df1['risk_st'][i])
    feature_rows.append(np.hstack(scalars + vectors))
X = np.array(feature_rows)
Y = np.array(labels)

# Single hold-out split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1)

# Alternative left disabled by the author: SVC(kernel = 'linear', C = 0.5)
svm_model_linear = LinearSVC(C = 0.25)
svm_model_linear.fit(X_train, Y_train)
y_pred = svm_model_linear.predict(X_test)

# Report MCC, the per-class report and the confusion matrix, in that order
mcc = matthews_corrcoef(Y_test, y_pred)
report = classification_report(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)
for item in (mcc, report, cm):
    print(item)

In [None]:
# Random forest hyperparameter tuning: exhaustive grid over n_estimators,
# min_samples_leaf, min_samples_split and max_depth on 5 shuffled CV folds.

# Columns holding one number per row; each value is cast to int.
numeric_cols = ['weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3']
# Columns appended unchanged after the numeric ones
# (presumably per-row vectors -- TODO confirm against the pickle contents).
vector_cols = ['com', 'diagnos', 'fir', 'sec', 'thr']

# Assemble one flat feature row per sample, then slice rows per fold.
# int() has to be applied element by element: calling it on a multi-row
# Series slice raises TypeError.
X_all = np.array([
    np.hstack([int(df1[col][i]) for col in numeric_cols]
              + [df1[col][i] for col in vector_cols])
    for i in range(len(df1))
])
Y_all = np.array(df1['risk_st'])

cv = KFold(n_splits=5, random_state=42, shuffle=True)
classifiers = []

# Grids are loop-invariant, so they are built once up front.
n_estimators_grid = np.arange(100, 600, 100)
min_samples_leaf_grid = np.arange(1, 4, 1)
min_samples_split_grid = np.arange(2, 5, 1)
max_depth_grid = np.arange(5, 25, 5)

for train_index, test_index in cv.split(X_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    Y_train, Y_test = Y_all[train_index], Y_all[test_index]

    # Iterate over the grid values themselves; enumerate() would hand
    # (index, value) tuples to the estimator.
    for n_est in n_estimators_grid:
        for leaf in min_samples_leaf_grid:
            for split in min_samples_split_grid:
                for depth in max_depth_grid:
                    # Fixed seed so differences between grid points are not
                    # just forest-to-forest randomness.
                    rfc = RandomForestClassifier(
                        n_estimators=int(n_est),
                        min_samples_leaf=int(leaf),
                        min_samples_split=int(split),
                        max_depth=int(depth),
                        random_state=42,
                    )
                    rfc.fit(X_train, Y_train)
                    y_pred = rfc.predict(X_test)
                    mcc = matthews_corrcoef(Y_test, y_pred)
                    print(n_est, leaf, split, depth, mcc)
                    print()

In [None]:
# Quality check with the selected hyperparameters (random forest)

# Scalar columns are cast to int; the remaining columns are appended as-is.
scalar_cols = ('weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3')
vector_cols = ('com', 'diagnos', 'fir', 'sec', 'thr')

X = []
Y = []
for i in range(len(df1)):
    Y.append(df1['risk_st'][i])
    X.append(np.hstack([int(df1[name][i]) for name in scalar_cols]
                       + [df1[name][i] for name in vector_cols]))
X = np.array(X)
Y = np.array(Y)

# Single hold-out split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1)

# random_state pins the bootstrap/feature sampling so the reported metrics
# are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=400, min_samples_leaf=1, max_depth=18,
                             min_samples_split=2, random_state=42)
rfc.fit(X_train, Y_train)
y_pred = rfc.predict(X_test)

mcc = matthews_corrcoef(Y_test, y_pred)
print(mcc)
print(classification_report(Y_test, y_pred))
cm = confusion_matrix(Y_test, y_pred)
print(cm)

In [None]:
# Logistic regression hyperparameter tuning: compare the lbfgs and newton-cg
# solvers for several values of C over 5 shuffled CV folds.

# Columns holding one number per row; each value is cast to int.
numeric_cols = ['weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3']
# Columns appended unchanged after the numeric ones
# (presumably per-row vectors -- TODO confirm against the pickle contents).
vector_cols = ['com', 'diagnos', 'fir', 'sec', 'thr']

# Assemble one flat feature row per sample, then slice rows per fold.
# int() has to be applied element by element: calling it on a multi-row
# Series slice raises TypeError.
X_all = np.array([
    np.hstack([int(df1[col][i]) for col in numeric_cols]
              + [df1[col][i] for col in vector_cols])
    for i in range(len(df1))
])
Y_all = np.array(df1['risk_st'])

cv = KFold(n_splits=5, random_state=42, shuffle=True)
classifiers = []

# Candidate inverse regularisation strengths: 0.25, 0.5, 0.75, 1.0
c_values = np.arange(0.25, 1.25, 0.25)

for train_index, test_index in cv.split(X_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    Y_train, Y_test = Y_all[train_index], Y_all[test_index]

    # Iterate over the values themselves; enumerate() would hand an
    # (index, value) tuple to C.
    for c_value in c_values:
        classifier = LogisticRegression(C = float(c_value), solver='lbfgs')
        classifier.fit(X_train, Y_train)
        y_pred = classifier.predict(X_test)
        mcc = matthews_corrcoef(Y_test, y_pred)
        print(mcc)

        classifier = LogisticRegression(C = float(c_value), solver='newton-cg')
        classifier.fit(X_train, Y_train)
        y_pred = classifier.predict(X_test)
        mcc = matthews_corrcoef(Y_test, y_pred)
        print(mcc)
        print()

In [None]:
# Quality check with the selected hyperparameters (logistic regression)

# Scalar columns are cast to int; the remaining columns are appended as-is.
scalar_cols = ('weeks', 'trim1', 'trim2', 'trim3', 'age1', 'age2', 'age3')
vector_cols = ('com', 'diagnos', 'fir', 'sec', 'thr')

feature_rows, labels = [], []
for i in range(len(df1)):
    scalars = [int(df1[name][i]) for name in scalar_cols]
    vectors = [df1[name][i] for name in vector_cols]
    labels.append(df1['risk_st'][i])
    feature_rows.append(np.hstack(scalars + vectors))
X = np.array(feature_rows)
Y = np.array(labels)

# Single hold-out split with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1)

classifier = LogisticRegression(C = 0.75, solver='newton-cg')
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_test)

# Report MCC, the per-class report and the confusion matrix, in that order
mcc = matthews_corrcoef(Y_test, y_pred)
report = classification_report(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)
for item in (mcc, report, cm):
    print(item)