In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import seaborn as sns

In [16]:
dataset = pd.read_csv('/content/sample_data/heart failure classification dataset.csv')
dataset.head(10)

Unnamed: 0.1,Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,time,DEATH_EVENT
0,0.0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,,4.0,1.0
1,1.0,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,6.0,1.0
2,2.0,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,7.0,1.0
3,3.0,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,7.0,1.0
4,4.0,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,8.0,1.0
5,5.0,90.0,1.0,47.0,0.0,40.0,1.0,204000.0,2.1,132.0,8.0,1.0
6,6.0,75.0,1.0,246.0,0.0,15.0,0.0,127000.0,1.2,137.0,10.0,1.0
7,7.0,60.0,1.0,315.0,1.0,60.0,0.0,454000.0,1.1,131.0,10.0,1.0
8,8.0,65.0,0.0,157.0,0.0,65.0,0.0,263358.03,1.5,138.0,10.0,1.0
9,9.0,80.0,1.0,123.0,0.0,35.0,1.0,388000.0,9.4,133.0,10.0,1.0


In [17]:
dataset.dtypes

Unnamed: 0                  float64
age                         float64
anaemia                     float64
creatinine_phosphokinase    float64
diabetes                    float64
ejection_fraction           float64
high_blood_pressure         float64
platelets                   float64
serum_creatinine            float64
serum_sodium                float64
time                        float64
DEATH_EVENT                 float64
dtype: object

In [18]:
dataset.isnull().sum()

Unnamed: 0                  0
age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                6
time                        7
DEATH_EVENT                 0
dtype: int64

In [19]:
dataset[dataset.isnull().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,time,DEATH_EVENT
0,0.0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,,4.0,1.0
47,47.0,60.0,0.0,582.0,1.0,38.0,1.0,451000.0,0.6,,40.0,1.0
83,83.0,79.0,1.0,55.0,0.0,50.0,1.0,172000.0,1.8,133.0,,0.0
105,105.0,72.0,1.0,328.0,0.0,30.0,1.0,621000.0,1.7,138.0,,1.0
124,124.0,60.0,0.0,582.0,0.0,40.0,0.0,217000.0,3.7,,96.0,1.0
147,147.0,64.0,0.0,1610.0,0.0,60.0,0.0,242000.0,1.0,137.0,,0.0
190,190.0,80.0,0.0,582.0,1.0,35.0,0.0,350000.0,2.1,134.0,,0.0
230,230.0,60.0,0.0,166.0,0.0,30.0,0.0,62000.0,1.7,,207.0,1.0
232,232.0,40.0,1.0,129.0,0.0,35.0,0.0,255000.0,0.9,137.0,,0.0
275,275.0,45.0,0.0,582.0,0.0,38.0,1.0,422000.0,0.8,,245.0,0.0


In [20]:
dataset.shape

(299, 12)

In [21]:
dataset = dataset.dropna(axis=0)
dataset.isnull().sum()

Unnamed: 0                  0
age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
time                        0
DEATH_EVENT                 0
dtype: int64

In [22]:
dataset['anaemia'].value_counts()

0.0    160
1.0    126
Name: anaemia, dtype: int64

In [23]:
dataset['anaemia'] = dataset['anaemia'].map({'F':0,'M':1})

In [26]:
column = list(dataset.columns.values)
for column_name in column:
  if (dataset[column_name] > 1).any() or (dataset[column_name] < 0).any():
    min =dataset[column_name].min()
    max =dataset[column_name].max()
    dataset[column_name] = (dataset[column_name]- min)/(max-min)

In [27]:
y = dataset['anaemia']
y

1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
       ..
292   NaN
293   NaN
294   NaN
296   NaN
297   NaN
Name: anaemia, Length: 286, dtype: float64

In [28]:
x = dataset.drop(['anaemia'], axis = 1)
x.head(10)

Unnamed: 0.1,Unnamed: 0,age,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,time,DEATH_EVENT
1,0.0,0.272727,1.0,0.0,0.363636,0.0,0.288833,0.067416,0.657143,0.0,1.0
2,0.003378,0.454545,0.015693,0.0,0.090909,0.0,0.16596,0.089888,0.457143,0.00365,1.0
3,0.006757,0.181818,0.011227,0.0,0.090909,0.0,0.224148,0.157303,0.685714,0.00365,1.0
4,0.010135,0.454545,0.017479,1.0,0.090909,0.0,0.365984,0.247191,0.085714,0.007299,1.0
5,0.013514,0.909091,0.003062,0.0,0.393939,1.0,0.216875,0.179775,0.542857,0.007299,1.0
6,0.016892,0.636364,0.028451,0.0,0.015152,0.0,0.12353,0.078652,0.685714,0.014599,1.0
7,0.02027,0.363636,0.037254,1.0,0.69697,0.0,0.519942,0.067416,0.514286,0.014599,1.0
8,0.023649,0.454545,0.017096,0.0,0.772727,0.0,0.288833,0.11236,0.714286,0.014599,1.0
9,0.027027,0.727273,0.012758,0.0,0.318182,1.0,0.439932,1.0,0.571429,0.014599,1.0
10,0.030405,0.636364,0.0074,0.0,0.363636,1.0,0.415687,0.393258,0.514286,0.014599,1.0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
print(f'The shape of X_train is {X_train.shape}')
print(f'The shape of X_test is {X_test.shape}')
print(f'The shape of y_train is {y_train.shape}')
print(f'The shape of y_test is {y_test.shape}')

The shape of X_train is (228, 11)
The shape of X_test is (58, 11)
The shape of y_train is (228,)
The shape of y_test is (58,)


In [34]:
from sklearn.svm import SVC
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
print("Training accuracy of the model is {:.2f}".format(svc.score(X_train, y_train)))
print("Testing accuracy of the model is {:.2f}".format(svc.score(X_test, y_test)))
predictions = svc.predict(X_test)
print(predictions)

ValueError: ignored

In [33]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=50)
rfc.fit(X_train, y_train)
print("The Training accuracy of the model is {:.2f}".format(rfc.score(X_train, y_train)))
print("The Testing accuracy of the model is {:.2f}".format(rfc.score(X_test, y_test)))
predictions = rfc.predict(X_test)

ValueError: ignored

In [35]:
from sklearn.neural_network import MLPClassifier
nnc=MLPClassifier(hidden_layer_sizes=(7), activation="relu", max_iter=10000)
nnc.fit(X_train, y_train)
print("The Training accuracy of the model is {:.2f}".format(nnc.score(X_train, y_train)))
print("The Testing accuracy of the model is {:.2f}".format(nnc.score(X_test, y_test)))
predictions = nnc.predict(x_test)
print(predictions)

ValueError: ignored

In [38]:
# evaluate pca with logistic regression algorithm for classification
from numpy import mean
from numpy import std
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# define dataset
dataset = pd.read_csv('/content/sample_data/heart failure classification dataset.csv')
X, y = dataset(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# define the pipeline
steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
# evaluate model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

TypeError: ignored

In [39]:
from sklearn.svm import SVC
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
print("Training accuracy of the model is {:.2f}".format(svc.score(X_train, y_train)))
print("Testing accuracy of the model is {:.2f}".format(svc.score(X_test, y_test)))
predictions = svc.predict(X_test)
print(predictions)

ValueError: ignored

In [40]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=50)
rfc.fit(X_train, y_train)
print("The Training accuracy of the model is {:.2f}".format(rfc.score(X_train, y_train)))
print("The Testing accuracy of the model is {:.2f}".format(rfc.score(X_test, y_test)))
predictions = rfc.predict(X_test)

ValueError: ignored

In [41]:
from sklearn.neural_network import MLPClassifier
nnc=MLPClassifier(hidden_layer_sizes=(7), activation="relu", max_iter=10000)
nnc.fit(X_train, y_train)
print("The Training accuracy of the model is {:.2f}".format(nnc.score(X_train, y_train)))
print("The Testing accuracy of the model is {:.2f}".format(nnc.score(X_test, y_test)))
predictions = nnc.predict(x_test)
print(predictions)

ValueError: ignored