# Cardiovascular Disease Diagnosis

### Load Dataset


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Read the semicolon-separated training file into a DataFrame.
df = pd.read_csv("cardio_train.csv", sep=";")
# Peek at the first few rows to check the parse.
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Visualization

In [None]:
# Number of rows for each value of 'cardio', the column used as the prediction target.
df['cardio'].value_counts()

In [None]:
# Divide 'age' by 365 and round to whole numbers to obtain an age in years
# (assumes 'age' is recorded in days -- TODO confirm against the dataset docs).
age_in_years = df['age'].div(365).round()
# Store the rounded values downcast to an integer dtype.
df['years'] = pd.to_numeric(age_in_years, downcast='integer')

In [None]:
# Count of records at each value of 'years', with separate bars per 'cardio' value.
plt.figure(figsize=(16, 12))
sns.countplot(data=df, hue='cardio', x='years')

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df,
# with each cell labelled to two decimal places.
plt.figure(figsize=(16, 12))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, fmt='.2f', annot=True)

### Preprocess

In [None]:
# Remove the 'id' column and the original 'age' column from the frame.
df = df.drop(columns=['id', 'age'])

In [None]:
# The target is the 'cardio' column; every other column of df is a feature.
y = df['cardio']
X = df.drop(columns=['cardio'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Training

In [None]:
# Random forest with 10 trees, entropy as the split criterion and a fixed seed.
model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=1,
)
# Train on the scaled training features and their labels.
model.fit(X_train, y_train)

### Evaluation

In [None]:
# Score the fitted model on the held-out test split
# (for scikit-learn classifiers, score() reports mean accuracy).
model.score(X_test, y_test)

In [None]:
# Predict on the test split and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Row 0 holds the actual-negative counts, row 1 the actual-positive counts.
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]

In [None]:
# Rates derived from the confusion-matrix counts.
sen = TP / (TP + FN)    # share of actual positives predicted positive
spec = TN / (FP + TN)   # share of actual negatives predicted negative
ppv = TP / (TP + FP)    # share of positive predictions that are correct
npv = TN / (TN + FN)    # share of negative predictions that are correct

# Print each metric with its label, in the same order as computed above.
for label, value in (
    ("sensitivity: ", sen),
    ("specificity: ", spec),
    ("positive predictive value: ", ppv),
    ("negative predictive value: ", npv),
):
    print(label, value)

In [None]:
# Attribution note, kept as a bare string literal rather than a comment.
'''
Inspiration
1. https://www.youtube.com/watch?v=kySc5Wg1Gxw
'''