In [None]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.3 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 


hf = pd.read_csv("/content/sample_data/hf.csv")

# --- Data pre-processing before classification ---
# Keep only the rows that have a serum_sodium value.
hf_subset = hf[hf['serum_sodium'].notnull()]
# Drop the 'time' column and reset the index to 0..n-1. Without the reset, any
# filtered-out row leaves a gap in the index, and label-based alignment with
# frames that carry a default RangeIndex would silently misalign rows.
hf = hf_subset.drop(['time'], axis=1).reset_index(drop=True)

from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()

# Encode 'sex' as integer codes.
hf['sex'] = encode.fit_transform(hf['sex'])
# 'smoking' is removed by the drop below, so it is not encoded first.
# Drop the features judged to have low correlation.
hf = hf.drop(
    ['diabetes', 'smoking', 'ejection_fraction', 'creatinine_phosphokinase', 'serum_sodium'], axis=1)
 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Separate the label from the predictors. Leaving DEATH_EVENT inside the feature
# matrix hands every model the answer and makes the baseline accuracy meaningless.
hf_features = hf.drop(columns=['DEATH_EVENT'])
hf_target = hf['DEATH_EVENT']

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    hf_features, hf_target, test_size=0.2, stratify=hf_target, random_state=1)

# Support vector classifier with a linear kernel, scored on the held-out split
from sklearn.svm import SVC
svc = SVC(kernel="linear").fit(x_train, y_train)  # fit() returns the estimator itself
svc_pred = svc.predict(x_test)
svc_acc = accuracy_score(y_true=y_test, y_pred=svc_pred)


# Random Forest
from sklearn.ensemble import RandomForestClassifier

# The forest is built with randomness; a fixed seed (same value as the
# train/test split) makes the reported accuracy reproducible between runs.
rfc = RandomForestClassifier(n_estimators=50, random_state=1)
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
rfc_acc = accuracy_score(y_test, rfc_pred)


# MLP
from sklearn.neural_network import MLPClassifier

# One hidden layer of 7 units, written as a real tuple -- `(7)` is just the int 7.
# Weight initialisation is random, so the seed is fixed to keep the reported
# accuracy reproducible between runs.
mlp = MLPClassifier(hidden_layer_sizes=(7,), activation="relu", max_iter=10000, random_state=1)

mlp.fit(x_train, y_train)
mlp_pred = mlp.predict(x_test)
mlp_acc = accuracy_score(y_test, mlp_pred)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
hf_s = hf.drop(['DEATH_EVENT'], axis=1)
# fit_transform returns a bare array; put the original feature names back so the
# scaled frame does not end up with anonymous integer column labels.
hf_scaled = pd.DataFrame(scaler.fit_transform(hf_s), columns=hf_s.columns)
# Attach the label by position, not by index label: hf_scaled has a fresh
# 0..n-1 index, while hf may have gaps in its index from the earlier null
# filtering, in which case a label-aligned assign would produce NaN targets.
hf_scaled = hf_scaled.assign(target=hf["DEATH_EVENT"].to_numpy())
hf_scaled


In [None]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

# Project only the standardized predictors. Fitting on `hf` would include the
# DEATH_EVENT label in the components (leaking it into the "After PCA" models),
# and because PCA is variance-based, unscaled columns would dominate the result.
scaled_features = hf_scaled.drop(columns="target")
principal_components = pca.fit_transform(scaled_features)

# Share of the variance retained by the three components, to check how much
# information the dimension reduction discards.
explained_variance_kept = pca.explained_variance_ratio_.sum()
print(f"Explained variance kept by 3 components: {explained_variance_kept:.3f}")

# Split of the scaled data; the label column is excluded from the features.
# NOTE(review): this rebinds x_train/x_test/y_train/y_test from the baseline cell,
# and nothing visible in this notebook reads them afterwards -- confirm it is needed.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_features, hf_scaled['target'], test_size=0.2, random_state=42)

principal_df = pd.DataFrame(data=principal_components, columns=["principle component 1", "principle component 2", "principle component 3"])
# Both frames carry a default 0..n-1 index, so the column-wise concat pairs rows by position.
main_df = pd.concat([principal_df, hf_scaled[["target"]]], axis=1)


In [None]:
# Display the combined frame: the three principal-component columns plus the target column
main_df

In [None]:
# Predictors are the principal-component columns; the label is the target column.
X = main_df.drop(columns="target")
y = main_df["target"]

# Use the same split configuration as the baseline models (80/20, stratified on
# the label, random_state=1) so the "Before PCA" and "After PCA" accuracies are
# measured under matching conditions and not on differently drawn test sets.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)




In [None]:
# Refit each classifier on the PCA-reduced training split and score it on the
# PCA-reduced test split. fit() returns the estimator, so fit and predict chain.

# svc
svc_pred_pca = svc.fit(x_train_pca, y_train_pca).predict(x_test_pca)
svc_acc_pca = accuracy_score(y_true=y_test_pca, y_pred=svc_pred_pca)

# Random forest
rfc_pred_pca = rfc.fit(x_train_pca, y_train_pca).predict(x_test_pca)
rfc_acc_pca = accuracy_score(y_true=y_test_pca, y_pred=rfc_pred_pca)

# MLP
mlp_pred_pca = mlp.fit(x_train_pca, y_train_pca).predict(x_test_pca)
mlp_acc_pca = accuracy_score(y_true=y_test_pca, y_pred=mlp_pred_pca)

import matplotlib.pyplot as plt


def plot_accuracy_comparison(title, acc_before, acc_after):
    """Show a two-bar chart of one model's accuracy before and after PCA.

    Parameters
    ----------
    title : str
        Title drawn above the chart.
    acc_before : float
        Accuracy of the model trained on the original features.
    acc_after : float
        Accuracy of the model trained on the principal components.
    """
    fig, ax = plt.subplots()
    ax.bar(['Before PCA', 'After PCA'], [acc_before, acc_after])
    ax.set_title(title)
    ax.set_ylabel('Accuracy')
    # accuracy_score returns a fraction, so a fixed 0-1 axis keeps the three
    # charts on the same scale and avoids exaggerating small differences.
    ax.set_ylim(0, 1)
    plt.show()


# One chart per model, in the same order as the original cells.
plot_accuracy_comparison('SVM Comparison', svc_acc, svc_acc_pca)
plot_accuracy_comparison('RFC Comparison', rfc_acc, rfc_acc_pca)
plot_accuracy_comparison('MLP Comparison', mlp_acc, mlp_acc_pca)

