CardioVascular Disease Prediction Model


In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('cardio_train.csv', delimiter=';')

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the dataset (ensure you've uploaded the file in Colab)
df = pd.read_csv('cardio_train.csv', delimiter=';')


In [4]:
# Check for missing values
print(df.isnull().sum())

# Impute missing values with the mean (for numerical columns)
imputer = SimpleImputer(strategy='mean')
df[df.columns] = imputer.fit_transform(df[df.columns])


id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


In [5]:
# Assuming 'gender' is a categorical column (replace with actual column names)
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])  # Example for one column


In [6]:
# Separate features and target variable
X = df.drop('cardio', axis=1)  # Replace 'cardio' with the actual target column name
y = df['cardio']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [9]:
# Initialize the model
model = LogisticRegression()

# Train the model using the training data
model.fit(X_train, y_train)


In [10]:
# Use the trained model to make predictions on the test data
y_pred = model.predict(X_test)


In [11]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.72


In [12]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)


Confusion Matrix:
[[5361 1627]
 [2246 4766]]


In [13]:
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)


Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.77      0.73      6988
         1.0       0.75      0.68      0.71      7012

    accuracy                           0.72     14000
   macro avg       0.73      0.72      0.72     14000
weighted avg       0.73      0.72      0.72     14000



In [14]:
import joblib

# Save the model to a file
joblib.dump(model, 'cardio_model.pkl')


['cardio_model.pkl']

In [15]:
# Load the model from the file
loaded_model = joblib.load('cardio_model.pkl')

# Use the loaded model to make predictions
new_predictions = loaded_model.predict(X_test)


In [16]:
from sklearn.model_selection import GridSearchCV

# Example: Hyperparameter tuning for Logistic Regression
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

# Best parameters and best score
print(grid.best_params_)
print(grid.best_score_)


{'C': 100, 'solver': 'liblinear'}
0.7190357142857142


In [17]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train, y_train, cv=5)
print(f'Cross-Validation Scores: {scores}')
print(f'Average CV Score: {scores.mean()}')


Cross-Validation Scores: [0.71982143 0.72142857 0.71660714 0.71633929 0.71848214]
Average CV Score: 0.7185357142857143


In [19]:
# Calculate coefficients as a proxy for feature importance
importances = model.coef_[0]

for i, v in enumerate(importances):
    print(f'Feature: {i}, Score: {v}')

Feature: 0, Score: -0.0024542360727644813
Feature: 1, Score: 0.3667879801379663
Feature: 2, Score: 0.012450428508736578
Feature: 3, Score: -0.04408417392820927
Feature: 4, Score: 0.21906425641853222
Feature: 5, Score: 5.76342799173179
Feature: 6, Score: 0.055333967889812236
Feature: 7, Score: 0.35623274139300604
Feature: 8, Score: -0.06184434307059993
Feature: 9, Score: -0.04006041490594852
Feature: 10, Score: -0.04335976480424736
Feature: 11, Score: -0.07164165579540772


In [20]:
coeffs = model.coef_
print(coeffs)


[[-2.45423607e-03  3.66787980e-01  1.24504285e-02 -4.40841739e-02
   2.19064256e-01  5.76342799e+00  5.53339679e-02  3.56232741e-01
  -6.18443431e-02 -4.00604149e-02 -4.33597648e-02 -7.16416558e-02]]
