In [132]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


import warnings
warnings.filterwarnings("ignore")

DATA PROCESSING 

In [133]:
df = pd.read_csv('cardio_train.csv',sep=';')
df.head() 

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [135]:
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [144]:
# Check for missing values
missing_data = df.isnull().sum()

# Print the results
print("Missing data:\n", missing_data)

Missing data:
 id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


In [136]:
# code in this cell taken from eda.ipynb
# remove outliers in ap_hi and ap_lo
cleaned_df = df[df['ap_hi'] < 250]  
cleaned_df = cleaned_df[cleaned_df['ap_lo'] < 250] 

print(cleaned_df.shape)

(69007, 13)


In [137]:

cont_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
ord_cols = ['cholesterol', 'gluc']

for col in ord_cols:
    dummies = pd.get_dummies(cleaned_df[col])
    dummies.columns = ['{0}_{1}'.format(col, ind) for ind in dummies.columns]
    cleaned_df = pd.concat([cleaned_df, dummies], axis=1)
    
cleaned_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,1,0,0,1,0,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,0,0,1,1,0,0
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,0,0,1,1,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,1,0,0,1,0,0
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,1,0,0,1,0,0


In [138]:
cleaned_df['gender'] -= 1  # convert gender values to be either 1 or 0
cleaned_df['age'] //= 356  # convert age values to be in years
cleaned_df = cleaned_df.drop(labels=['cholesterol', 'gluc', 'id'], axis=1)
cleaned_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,51,1,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0
1,56,0,156,85.0,140,90,0,0,1,1,0,0,1,1,0,0
2,52,0,165,64.0,130,70,0,0,0,1,0,0,1,1,0,0
3,49,1,169,82.0,150,100,0,0,1,1,1,0,0,1,0,0
4,49,0,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0


Using standardization and feature selection

In [139]:
from sklearn.preprocessing import StandardScaler

# Define which columns to standardize
cont_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Standardize the continuous variables
df_standardized = cleaned_df.copy()
scaler = StandardScaler()
df_standardized[cont_cols] = scaler.fit_transform(df_standardized[cont_cols])

# Concatenate the standardized data with selected columns from the original DataFrame
cols_to_concat = ['gender', 'smoke', 'alco', 'active', 'cardio', 'cholesterol_1', 'cholesterol_2', 'cholesterol_3', 'gluc_1', 'gluc_2', 'bmi', 'bp']
# View the first 10 rows of the standardized DataFrame
df_standardized.head(10)


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,-0.457433,1,0.443739,-0.845719,-0.911044,-0.134641,0,0,1,0,1,0,0,1,0,0
1,0.263116,0,-1.018977,0.759488,0.766154,0.876554,0,0,1,1,0,0,1,1,0,0
2,-0.313323,0,0.07806,-0.706136,0.207088,-1.145835,0,0,0,1,0,0,1,1,0,0
3,-0.745653,1,0.565632,0.550113,1.32522,1.887748,0,0,1,1,1,0,0,1,0,0
4,-0.745653,0,-1.018977,-1.264469,-1.47011,-2.157029,0,0,0,0,1,0,0,1,0,0
5,0.983664,0,-1.628442,-0.496761,-0.351978,-0.134641,0,0,0,0,0,1,0,0,1,0
6,1.127774,0,-0.897084,1.317821,0.207088,-0.134641,0,0,1,0,0,0,1,1,0,0
7,1.271884,1,1.662669,1.457404,0.207088,0.876554,0,0,1,1,0,0,1,0,0,1
8,-0.745653,0,-0.775191,-0.217595,-0.911044,-1.145835,0,0,1,0,1,0,0,1,0,0
9,0.119006,0,-0.043833,-0.42697,-0.911044,-2.157029,0,0,0,0,1,0,0,1,0,0


In [140]:
from sklearn.feature_selection import SelectKBest, f_classif

# Define X and y
X = df.drop(['cardio'], axis=1)
y = df['cardio']

# Select the top 10 features using the ANOVA F-value as the score function
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Get the selected feature names
selected_features = X.columns[selector.get_support()]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Print the shape of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
print("Selected features:", selected_features)


Training set shape: (56000, 10) (56000,)
Testing set shape: (14000, 10) (14000,)
Selected features: Index(['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'active'],
      dtype='object')


Logistic Regression 

In [141]:
# Define the hyperparameters for grid search
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Create a logistic regression classifier
clf = LogisticRegression()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Train the model using the best hyperparameters
clf = LogisticRegression(**best_params)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)
tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tp = cm[1, 1]

# Calculate sensitivity and specificity
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Print the results
print("Best hyperparameters: ", best_params)
print("Accuracy: {:.3f}".format(acc))
print("Sensitivity: {:.3f}".format(sensitivity))
print("Specificity: {:.3f}".format(specificity))


Best hyperparameters:  {'C': 0.1, 'penalty': 'l2'}
Accuracy: 0.697
Sensitivity: 0.664
Specificity: 0.730


Naive Bayes Classifier 

In [142]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their ranges to search
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# Create a classifier instance
clf = GaussianNB()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and their scores
print("Best parameters: ", grid_search.best_params_)
print("Best score: {:.3f}".format(grid_search.best_score_))

# Use the best hyperparameters to make predictions on the test set
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)
tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tp = cm[1, 1]

# Calculate sensitivity and specificity
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Print the results
print("Accuracy: {:.3f}".format(acc))
print("Sensitivity: {:.3f}".format(sensitivity))
print("Specificity: {:.3f}".format(specificity))

Best parameters:  {'var_smoothing': 1e-09}
Best score: 0.596
Accuracy: 0.593
Sensitivity: 0.314
Specificity: 0.874
