In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import recall_score, precision_score, f1_score

## Import data

In [20]:
train = pd.read_csv('Train_data.csv')
test= pd.read_csv('test_data.csv')

print(f'train data dims : {train.shape}')
print(f'test data dims : {test.shape}')

train data dims : (2351, 25)
test data dims : (486, 25)


## EDA

In [21]:
print(train.isnull().sum())
print(test.isnull().sum())

Glucose                                      0
Cholesterol                                  0
Hemoglobin                                   0
Platelets                                    0
White Blood Cells                            0
Red Blood Cells                              0
Hematocrit                                   0
Mean Corpuscular Volume                      0
Mean Corpuscular Hemoglobin                  0
Mean Corpuscular Hemoglobin Concentration    0
Insulin                                      0
BMI                                          0
Systolic Blood Pressure                      0
Diastolic Blood Pressure                     0
Triglycerides                                0
HbA1c                                        0
LDL Cholesterol                              0
HDL Cholesterol                              0
ALT                                          0
AST                                          0
Heart Rate                                   0
Creatinine   

## Preprocessing

In [4]:
train.columns

Index(['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets',
       'White Blood Cells', 'Red Blood Cells', 'Hematocrit',
       'Mean Corpuscular Volume', 'Mean Corpuscular Hemoglobin',
       'Mean Corpuscular Hemoglobin Concentration', 'Insulin', 'BMI',
       'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Triglycerides',
       'HbA1c', 'LDL Cholesterol', 'HDL Cholesterol', 'ALT', 'AST',
       'Heart Rate', 'Creatinine', 'Troponin', 'C-reactive Protein',
       'Disease'],
      dtype='object')

In [5]:
test['Disease'].unique().tolist()

['Thalasse', 'Diabetes', 'Heart Di', 'Anemia', 'Thromboc', 'Healthy']

In [6]:
train['Disease'].unique().tolist()

['Healthy', 'Diabetes', 'Thalasse', 'Anemia', 'Thromboc']

In [7]:
print(f'train data dims : {train.shape}')
print(f'test data dims : {test.shape}')

train data dims : (2351, 25)
test data dims : (486, 25)


In [8]:
disease = ['Diabetes', 'Thalasse', 'Anemia', 'Thromboc']
non_disease = ['Healthy']

In [9]:
train['Disease'] = np.where(train['Disease'].isin(non_disease), 0,1)
test['Disease'] = np.where(test['Disease'].isin(non_disease), 0,1)

In [10]:
y_train = train.pop('Disease')
y_test = test.pop('Disease')

In [11]:
scaler = StandardScaler()
x_train = scaler.fit_transform(train)
x_test = scaler.fit_transform(test)

## Model

In [12]:
model = ExtraTreesClassifier(n_estimators=288, random_state = 0)
#the best n_estimators finded manually, GridSearchCV was inefficient in this case

In [13]:
model.fit(x_train, y_train)
model.score(x_train, y_train)

1.0

In [14]:
predictions = model.predict(x_test)

## Evaluation metrics

In [16]:
print(f'accuracy : {round(accuracy_score(predictions, y_test)*100, 2)} %')
print(f'precision : {round(precision_score(predictions, y_test)*100, 2)} %')
print(f'recall : {round(recall_score(predictions, y_test)*100, 2)} %')
print(f'f1_score : {round(f1_score(predictions, y_test)*100, 2)} %')

accuracy : 97.94 %
precision : 98.96 %
recall : 98.96 %
f1_score : 98.96 %
