# Importing Dependencies

In [1]:
import sys # Not Required
import warnings
import os
import pickle
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
import os

# Ignoring Warnings
warnings.filterwarnings(action='ignore')

# Fixing matplotlib inline and label sizes
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# File Locations for Images, Dataset, Pickles and Models

In [2]:
# Root Directory: the parent of the notebook's current working directory.
# os.path.dirname understands the platform's own separator; splitting on '\\'
# by hand only worked on Windows and produced an empty root everywhere else.
PROJECT_ROOT_DIR = os.path.dirname(os.getcwd())

# Images Directory (figures written by save_fig end up here)
IMAGES_DIR = 'images'
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, IMAGES_DIR)


def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file extension and as the ``format`` given to savefig.
    resolution : int
        Passed to savefig as ``dpi``.
    """
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, fig_id + '.' + fig_extension)
    print('Saving figure', path)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
    
# Dataset Directory 
DATASET_NAME = 'cardiovascular-disease-dataset.csv'
DATASET_DIR = 'datasets'
DATASET_PATH = os.path.join(PROJECT_ROOT_DIR, DATASET_DIR)


def load_dataset(path=DATASET_PATH, filename=DATASET_NAME, sep=';'):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory containing the file (defaults to DATASET_PATH).
    filename : str
        File name inside ``path`` (defaults to DATASET_NAME).
    sep : str
        Field separator handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    dataset_location = os.path.join(path, filename)
    # sep is passed by keyword: every read_csv argument after the file path is
    # keyword-only in pandas 2.x, so the positional form raises TypeError there.
    return pd.read_csv(dataset_location, sep=sep)

# Pickle and Model Directory
PM_DIR = 'Pickles_And_Models'
PM_PATH = os.path.join(PROJECT_ROOT_DIR, PM_DIR)

def save_object(object_ , pickle_name, pm_path = PM_PATH):
    """Pickle ``object_`` to ``pm_path/pickle_name`` and print the path written.

    Parameters
    ----------
    object_ : any picklable object
    pickle_name : str
        File name of the pickle inside ``pm_path``.
    pm_path : str
        Target directory (defaults to PM_PATH).
    """
    path = os.path.join(pm_path, pickle_name)
    # The context manager closes (and therefore flushes) the file even when
    # pickle.dump raises; the bare open() call never closed the handle.
    with open(path, 'wb') as pickle_file:
        pickle.dump(object_, pickle_file)
    print('Saving Pickle', path)
    
def load_object(pickle_name, pm_path = PM_PATH):
    """Unpickle and return the object stored at ``pm_path/pickle_name``.

    Parameters
    ----------
    pickle_name : str
        File name of the pickle inside ``pm_path``.
    pm_path : str
        Directory to read from (defaults to PM_PATH).

    Returns
    -------
    The unpickled object.
    """
    path = os.path.join(pm_path, pickle_name)
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    # The context manager closes the handle, which the bare open() never did.
    with open(path, 'rb') as pickle_file:
        object_ = pickle.load(pickle_file)
    print('Loaded Pickle', path)
    return object_

# Load Dataset

In [3]:
# Read the CSV with load_dataset's default path, file name and ';' separator
cardio = load_dataset()
# Cell output: the column labels of the loaded frame
cardio.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [4]:
# Use the 'id' column as the row index. The guard plus the non-inplace call
# lets this cell be re-run safely: the in-place version raised KeyError on a
# second run because 'id' was no longer a column.
if 'id' in cardio.columns:
    cardio = cardio.set_index('id')
cardio.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


# Data Insights

In [5]:
# Column dtypes and non-null counts; the output below reports 70000 non-null
# entries for each of the 12 columns
cardio.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.9 MB


In [6]:
# Summary statistics for every column. In the output below ap_hi / ap_lo have
# negative minima (-150, -70) and maxima of 16020 / 11000.
# NOTE(review): these look like data-entry errors; no cleaning step for them
# is visible in the cells shown here - confirm whether that is intended.
cardio.describe(include='all')

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [None]:
# "datasets" folder inside the parent of the current working directory.
# The trailing '' makes os.path.join end the result with a path separator, so
# a file name can still be appended with plain string concatenation.
# Built with os.path so it works with any platform's separator, not only '\\'.
datasets_folder = os.path.join(os.path.dirname(os.getcwd()), 'datasets', '')
datasets_folder

In [None]:
# Load the ';'-separated training CSV and index the rows by their 'id' column
df = (
    pd.read_csv(datasets_folder + 'cardio_train.csv', sep=';')
    .set_index('id')
)

In [None]:
# Cell output: the loaded frame
df

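We will binarize the **gender** column: the raw code 1 is kept as 1 and every other code becomes 0. The intended reading is 0 -> Women and 1 -> Men; verify this against the dataset's own gender coding (if the raw code 1 stands for Women, the cell below produces the opposite: 1 -> Women and 0 -> Men).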

In [None]:
# Binary recode: rows whose gender equals 1 stay 1, every other value becomes 0
df['gender'] = (df['gender'] == 1).astype('int64')

In [None]:
# Indicator columns for cholesterol levels 2 and 3 (any other value is 0 in
# both), after which the original column is removed from the frame in place
for level in (2, 3):
    df[f'chol_{level}'] = (df['cholesterol'] == level).astype('int64')
del df['cholesterol']

In [None]:
# Annotated heatmap of the pairwise column correlations on a 10x8 figure
fig, heatMP = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
sns.heatmap(
    df.corr(),
    annot=True,
    fmt='1.2f',
    ax=heatMP,
);

In [None]:
# Number of rows per value of the 'cardio' target column
sns.countplot(data=df, x='cardio')

In [None]:
# Row count for every (cardio, gender) combination
df.groupby(['cardio', 'gender'])['gender'].count()

In [None]:
# Frequency of each value in the recoded gender column
df['gender'].value_counts()

In [None]:
# Side-by-side pies of the 'cardio' distribution: gender == 1 on the left,
# gender == 0 on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))
df.loc[df['gender'] == 1, 'cardio'].value_counts().plot.pie(
    explode=[0, 0.2], autopct='%1.1f%%', ax=ax[0], shadow=True)
df.loc[df['gender'] == 0, 'cardio'].value_counts().plot.pie(
    explode=[0, 0.2], autopct='%1.1f%%', ax=ax[1], shadow=True)

# Train / Test Split

In [None]:
# (rows, columns) of the frame before it is split
print('Original shape :',df.shape)

In [None]:
# train_test_split is imported here because this is its first use; the
# notebook's own import of it sits in a later cell, so on a fresh kernel
# (Restart & Run All) this cell otherwise fails with NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
df_train , df_test = train_test_split(df,test_size=0.2,random_state=42)

In [None]:
# Report the (rows, columns) of both partitions
for label, frame in (('DF Train :', df_train), ('DF Test  :', df_test)):
    print(label, frame.shape)

In [None]:
# Cell output: the full frame
df

# Dataset Scaling using Sklearn StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# StandardScaler with its default settings
scaler = StandardScaler()

In [None]:
# Fit on the training features only (target column excluded)
scaler.fit(df_train.drop(columns='cardio'))

In [None]:
# Scale the training features with the fitted scaler. scaler.transform hands
# back a bare array, so the original row index is re-attached explicitly;
# without it the frame gets a fresh 0..n-1 index and is no longer
# label-aligned with df_train / df_train.cardio.
scaled_features = scaler.transform(df_train.drop('cardio',axis=1))
df_train_sc = pd.DataFrame(data=scaled_features,index=df_train.index,columns=df_train.columns.drop('cardio'))

In [None]:
# Scale the test features with the scaler that was fitted on the training
# data. The original row index is re-attached so the frame stays
# label-aligned with df_test / df_test.cardio.
scaled_features = scaler.transform(df_test.drop('cardio',axis=1))
df_test_sc = pd.DataFrame(data=scaled_features,index=df_test.index,columns=df_test.columns.drop('cardio'))

# Splitting the Training Data (Raw and Scaled) into Dev and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Full / Entire Data - Non Scaled Data
X_train_all , Y_train_all = df_train.drop('cardio',axis=1) , df_train.cardio
X_test_all , Y_test_all = df_test.drop('cardio',axis=1) , df_test.cardio

# Full / Entire Data - Scaled Data
X_train_all_sc , Y_train_all_sc = df_train_sc , df_train.cardio
# Test labels come from df_test; this line previously assigned df_train.cardio,
# pairing the scaled test features with the training labels.
X_test_all_sc , Y_test_all_sc = df_test_sc , df_test.cardio

# for Dev and Validation sets 
# Non Scaled Data
df_train_DEV , df_train_Validation = train_test_split(df_train,test_size=0.1,random_state=7)
X_train , Y_train = df_train_DEV.drop('cardio',axis=1) , df_train_DEV.cardio
X_test , Y_test = df_train_Validation.drop('cardio',axis=1) , df_train_Validation.cardio

# Scaled Data
# Features and labels are split in ONE call so their rows are paired by
# construction, instead of relying on two separate calls happening to share a
# seed. test_size and random_state are the same as for the non-scaled split.
X_train_sc , X_test_sc , Y_train_sc , Y_test_sc = train_test_split(
    df_train_sc, df_train.cardio, test_size=0.1, random_state=7)

# Machine Learning Algorithms

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings: train on the scaled dev split,
# then predict the scaled validation split
logreg = LogisticRegression().fit(X_train_sc, Y_train_sc)
pred_logreg = logreg.predict(X_test_sc)

In [None]:
# Confusion matrix, per-class report and accuracy, printed in that order
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test_sc, pred_logreg))

## Gaussian Naive Bayes 

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes with default settings: train on the scaled dev split,
# then predict the scaled validation split
gnb = GaussianNB().fit(X_train_sc, Y_train_sc)
pred_gnb = gnb.predict(X_test_sc)

In [None]:
# Confusion matrix, per-class report and accuracy, printed in that order
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test_sc, pred_gnb))

## K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#### Default Mode : K -> 5

In [None]:
# 5-nearest-neighbours classifier: train on the scaled dev split, then
# predict the scaled validation split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_sc, Y_train_sc)
pred_knn = knn.predict(X_test_sc)

In [None]:
# Confusion matrix, per-class report and accuracy, printed in that order
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test_sc, pred_knn))

### Iteration Mode : k -> List

In [None]:
# Validation accuracy for a range of neighbourhood sizes, keyed by k.
# knn and pred_knn are left holding the model / predictions of the last k.
Scores = {}
for k in (1, 5, 10, 25, 50, 75, 100, 250, 500, 1000):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_sc, Y_train_sc)
    pred_knn = knn.predict(X_test_sc)
    Scores[k] = accuracy_score(Y_test_sc, pred_knn)

In [None]:
# One-column table: index = k, 'Accuracy' = validation accuracy for that k
Score_table = pd.DataFrame(
    data=list(Scores.values()),
    index=list(Scores.keys()),
    columns=['Accuracy'],
)

In [None]:
# Restore matplotlib's default rc settings before drawing this chart
plt.rcdefaults()
fig, ax = plt.subplots()

# One horizontal bar per k, accuracy expressed in percent
ax.barh(range(len(Score_table)), Score_table['Accuracy'] * 100, align='center')
ax.set_yticks(range(len(Score_table)))
ax.set_yticklabels(Score_table.index)
ax.invert_yaxis()  # first k at the top, so labels read top-to-bottom
ax.set(xlabel='Accuracy ->', title='Which K nearest Neighbor is better?')

plt.show()

## SVM Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with explicit gamma and C; all other settings are
# left at their defaults
svc = SVC(gamma=0.1,C=100)

In [None]:
# Train on the scaled dev split.
# NOTE(review): this may take a long time on a training set of this size -
# consider timing the cell with %%time.
svc.fit(X_train_sc,Y_train_sc)

In [None]:
# Predict the scaled validation split, then print confusion matrix,
# per-class report and accuracy in that order
pred_svc = svc.predict(X_test_sc)
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(Y_test_sc, pred_svc))