In [1]:
!pip install xgboost scikit-learn



In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
df = pd.read_csv('parkinsons.data')
df.head()


Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [4]:
# создаем numpy.ndarray с данными
data = df.loc[:, df.columns != 'status'].values[:, 1:]
# создаем numpy.ndarray со статусами (наличие или отсутствие диагноза б.Паркинсона)
statuses = df.loc[:, 'status'].values

# Смотрим распределение данных
msg = f"Количество данных с наличием диагноза б.Паркинсона: "\
      f"{statuses[statuses==1].shape[0]}\n"\
      f"Количество данных с отсутствием диагноза б.Паркинсона: "\
      f"{statuses[statuses==0].shape[0]}"
print(msg)


Количество данных с наличием диагноза б.Паркинсона: 147
Количество данных с отсутствием диагноза б.Паркинсона: 48


In [5]:
# Нормализуем данные в пределах от -1 до 1
scaler = MinMaxScaler(
    (-1, 1)
)
transformed_data = scaler.fit_transform(data)


In [6]:
# Разделяем выборку
x_train, x_test, y_train, y_test = train_test_split(
    transformed_data,  # x
    statuses,  # y
    test_size=0.2,  # 20% на тестовую выборку
    random_state=2,  # управляет перетасовкой, применяемой к данным перед применением разделения
    shuffle=True,  # перетасовка данных перед разделением
)


In [7]:
# Инициализируем модель
model = XGBClassifier()
# Обучаем модель
model.fit(x_train, y_train)


In [8]:
# используем обученную модель на тестовых данных
y_pred = model.predict(x_test)
# выводим точность модели
print(accuracy_score(y_test, y_pred) * 100)


87.17948717948718


In [9]:
# Посмотрим с какими значениями random_state точность будет выше
for i in range(21):
    x_train, x_test, y_train, y_test = train_test_split(
        transformed_data,
        statuses,
        test_size=0.2,
        random_state=i,
        shuffle=True,
    )
    model = XGBClassifier()
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    accuracy = round((accuracy_score(y_test, y_pred) * 100), 2)

    print(f"При random_state {i} - точность модели {accuracy} %")

При random_state 0 - точность модели 94.87 %
При random_state 1 - точность модели 92.31 %
При random_state 2 - точность модели 87.18 %
При random_state 3 - точность модели 97.44 %
При random_state 4 - точность модели 84.62 %
При random_state 5 - точность модели 89.74 %
При random_state 6 - точность модели 92.31 %
При random_state 7 - точность модели 94.87 %
При random_state 8 - точность модели 89.74 %
При random_state 9 - точность модели 89.74 %
При random_state 10 - точность модели 100.0 %
При random_state 11 - точность модели 94.87 %
При random_state 12 - точность модели 87.18 %
При random_state 13 - точность модели 89.74 %
При random_state 14 - точность модели 92.31 %
При random_state 15 - точность модели 94.87 %
При random_state 16 - точность модели 82.05 %
При random_state 17 - точность модели 94.87 %
При random_state 18 - точность модели 89.74 %
При random_state 19 - точность модели 89.74 %
При random_state 20 - точность модели 92.31 %
