In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read Hungarian_80%.csv into `dt` and preview its first five rows.
dt = pd.read_csv(filepath_or_buffer="Hungarian_80%.csv")
dt.head(n=5)

Unnamed: 0,age,resting_blood_pressure,cholesterol,fasting_blood_sugar,max_heart_rate_achieved,exercise_induced_angina,st_depression,sex_male,chest_pain_type_atypical angina,chest_pain_type_non-anginal pain,chest_pain_type_typical angina,rest_ecg_left ventricular hypertrophy,rest_ecg_normal,st_slope_flat,st_slope_normal,st_slope_upsloping,target
0,65,140,252,0,135,0,0.3,1,0,0,1,0,1,0,0,1,0
1,50,115,0,0,120,1,0.5,1,0,0,0,0,1,1,0,0,1
2,49,130,0,0,145,0,3.0,1,0,0,1,0,0,1,0,0,1
3,59,140,274,0,154,1,2.0,1,0,0,0,0,1,1,0,0,0
4,62,120,0,1,134,0,-0.8,1,0,0,1,1,0,1,0,0,1


In [3]:
# (rows, columns) of the loaded frame, as a quick sanity check on the read.
dt.shape

(232, 17)

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
# Feature matrix: every column except the label.
# The preview of `dt` shows an "Unnamed: 0" column holding 0, 1, 2, ... which
# looks like a leftover row index from a previous CSV export. A row index is
# not a predictor (and can leak the label if the file is ordered), so it is
# removed as well. errors="ignore" is applied only to that optional column so
# a missing "target" column still raises.
features = dt.drop(columns="target").drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Label vector: the "target" column of the loaded frame.
target = dt.loc[:, "target"]
# Preview the frame again (first five rows).
dt.head(n=5)

Unnamed: 0,age,resting_blood_pressure,cholesterol,fasting_blood_sugar,max_heart_rate_achieved,exercise_induced_angina,st_depression,sex_male,chest_pain_type_atypical angina,chest_pain_type_non-anginal pain,chest_pain_type_typical angina,rest_ecg_left ventricular hypertrophy,rest_ecg_normal,st_slope_flat,st_slope_normal,st_slope_upsloping,target
0,65,140,252,0,135,0,0.3,1,0,0,1,0,1,0,0,1,0
1,50,115,0,0,120,1,0.5,1,0,0,0,0,1,1,0,0,1
2,49,130,0,0,145,0,3.0,1,0,0,1,0,0,1,0,0,1
3,59,140,274,0,154,1,2.0,1,0,0,0,0,1,1,0,0,0
4,62,120,0,1,134,0,-0.8,1,0,0,1,1,0,1,0,0,1


In [7]:
# Stratified hold-out split: 25% of the rows go to the test set, class
# proportions follow `target`, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    stratify=target,
    random_state=5,
)
print('-----------remodeled-Training Set------------------')
# Training feature shape, then training label shape, one per line.
print(X_train.shape, y_train.shape, sep='\n')

-----------remodeled-Training Set------------------
(174, 16)
(174,)


In [8]:
# joblib helpers: dump() saves a model to disk, load() reads one back
from joblib import dump, load

### Loading the initial federated model

In [9]:
# Load the initial federated model from its joblib file.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
initial_fed_model = load("initial_fed_model.joblib")
# Show the estimator's repr (its non-default hyperparameters).
initial_fed_model

RandomForestClassifier(criterion='entropy', max_depth=4, max_features='log2',
                       n_estimators=11, random_state=1)

In [10]:
# model1 is another name for the loaded estimator (no copy is made), so
# fitting it also updates initial_fed_model.
model1 = initial_fed_model
# fit() returns the estimator itself, so training and prediction can be chained.
pred1 = model1.fit(X_train, y_train).predict(X_test)

# Score the hold-out predictions.
acc1 = accuracy_score(y_test, pred1)
precision_1 = precision_score(y_test, pred1)
recall_1 = recall_score(y_test, pred1)
f1_score_1 = f1_score(y_test, pred1)

# Report every metric as a percentage.
print('Test Accuracy_1:', acc1*100)
print('Precision_1:', precision_1*100)
print('Recall_1:', recall_1*100)
print('f1_score_1:', f1_score_1*100)

Test Accuracy_1: 86.20689655172413
Precision_1: 85.71428571428571
Recall_1: 100.0
f1_score_1: 92.3076923076923


### Further tuning on existing dataset

In [11]:
# Define the grid of hyperparameters 'params_rf'
params_rf_1 = {'n_estimators': list(range(20,100,5)), 'max_depth': list(range(2,9)),
             'max_features': ['log2','sqrt'], 'criterion': ['gini', 'entropy']}

model1=RandomForestClassifier(random_state=1)

# Instantiate a 5-fold CV grid search object 'grid_rf'
grid_rf_1 = GridSearchCV(estimator=model1, param_grid=params_rf_1, scoring='accuracy', cv=5, n_jobs=-1)

grid_rf_1.fit(X_train, y_train)

# Extract best model from 'grid_rf'
best_model1 = grid_rf_1.best_estimator_

# Extract best hyperparameters from 'grid_rf'
best_hyperparams_1 = grid_rf_1.best_params_
print('Best hyerparameters', best_hyperparams_1)

# Evaluate test set accuracy
pred2 = best_model1.predict(X_test)

test_acc2 = accuracy_score(y_test, pred2)
print('Test Accuracy_2:', test_acc2*100)

precision_2 = precision_score(y_test, pred2)
print('Precision_2:', precision_2*100)

recall_2 = recall_score(y_test, pred2)
print('Recall_2:', recall_2*100)

f1_score_2 = f1_score(y_test, pred2)
print('f1_score_2:', f1_score_2*100)

Best hyerparameters {'criterion': 'gini', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 35}
Test Accuracy_2: 89.65517241379311
Precision_2: 90.38461538461539
Recall_2: 97.91666666666666
f1_score_2: 94.0


### Saving the tuned model for federated server

In [12]:
# Persist the tuned estimator to client_hungarian_1.joblib; dump() returns the
# list of file names it wrote, which the notebook displays.
dump(best_model1, "client_hungarian_1.joblib")

['client_hungarian_1.joblib']

### Loading the tuned federated model

In [13]:
# loading the model from saved file
fed_model_2 = load("fed_model_2.joblib")
fed_model_2

RandomForestClassifier(criterion='entropy', max_depth=7, max_features='log2',
                       n_estimators=30, random_state=1)

In [14]:
# model3 is another name for the loaded estimator (no copy is made), so
# fitting it also updates fed_model_2.
model3 = fed_model_2
model3.fit(X_train,y_train)
pred3 = model3.predict(X_test)

# Score the hold-out predictions and report each metric as a percentage.
acc3 = accuracy_score(y_test,pred3)
print('Test Accuracy_3:', acc3*100)

precision_3 = precision_score(y_test, pred3)
print('Precision_3:', precision_3*100)

recall_3 = recall_score(y_test, pred3)
# Fixed: this line used to print recall_1 (the first model's recall) under the
# Recall_3 label.
print('Recall_3:', recall_3*100)

f1_score_3 = f1_score(y_test, pred3)
print('f1_score_3:', f1_score_3*100)

Test Accuracy_3: 91.37931034482759
Precision_3: 90.56603773584906
Recall_3: 100.0
f1_score_3: 95.04950495049505


### Dataset with newly added samples (Phase 2)

In [15]:
# Read Hungary_Full.csv into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Hungary_Full.csv")
df.head(n=5)

Unnamed: 0,age,resting_blood_pressure,cholesterol,fasting_blood_sugar,max_heart_rate_achieved,exercise_induced_angina,st_depression,target,sex_male,chest_pain_type_atypical angina,chest_pain_type_non-anginal pain,chest_pain_type_typical angina,rest_ecg_left ventricular hypertrophy,rest_ecg_normal,st_slope_flat,st_slope_normal,st_slope_upsloping
0,53,125,0,1,120,0,1.5,1,1,0,0,0,0,1,0,0,1
1,62,120,0,1,123,1,1.7,1,0,0,0,0,0,0,0,0,0
2,51,95,0,1,126,0,2.2,1,1,0,0,0,0,1,1,0,0
3,51,120,0,1,127,1,1.5,1,0,0,0,0,0,1,0,0,1
4,55,115,0,1,155,0,0.1,1,1,0,0,0,0,1,1,0,0


In [16]:
# (rows, columns) of the phase-2 frame, as a quick sanity check on the read.
df.shape

(291, 17)

In [17]:
# Feature matrix for the phase-2 data: every column except the label.
# The preview of `df` shows an "Unnamed: 0" column holding 0, 1, 2, ... which
# looks like a leftover row index from a previous CSV export. A row index is
# not a predictor (and can leak the label if the file is ordered), so it is
# removed as well. errors="ignore" is applied only to that optional column so
# a missing "target" column still raises.
X = df.drop(columns="target").drop(columns="Unnamed: 0", errors="ignore")

In [18]:
# Label vector for the phase-2 data: the "target" column of `df`.
y = df.loc[:, "target"]

In [19]:
# Stratified hold-out split of the phase-2 data: 25% of the rows go to the
# test set, class proportions follow `y`, and the fixed seed makes the split
# repeatable.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=5,
)
print('-----------remodeled-Training Set------------------')
# Training feature shape, then training label shape, one per line.
print(X_train2.shape, y_train2.shape, sep='\n')

-----------remodeled-Training Set------------------
(218, 16)
(218,)


### Using the federated model's parameters to evaluate the dataset with new samples

In [27]:
# Read the third federated model back from its joblib file and print its repr.
fed_model_3 = load("fed_model_3.joblib")
print(fed_model_3)

# model4 is another name for the loaded estimator (no copy is made).
model4 = fed_model_3
# fit() returns the estimator itself, so training and prediction can be chained.
pred4 = model4.fit(X_train2, y_train2).predict(X_test2)

# Score the phase-2 hold-out predictions.
acc4 = accuracy_score(y_test2, pred4)
precision_4 = precision_score(y_test2, pred4)
recall_4 = recall_score(y_test2, pred4)
f1_score_4 = f1_score(y_test2, pred4)

# Report every metric as a percentage.
print('Test Accuracy_4:', acc4*100)
print('Precision_4:', precision_4*100)
print('Recall_4:', recall_4*100)
print('f1_score_4:', f1_score_4*100)

RandomForestClassifier(criterion='entropy', max_depth=7, max_features='log2',
                       n_estimators=35, random_state=1)
Test Accuracy_4: 90.41095890410958
Precision_4: 89.55223880597015
Recall_4: 100.0
f1_score_4: 94.48818897637796
