In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the semicolon-separated cardio dataset into a DataFrame
data = pd.read_csv('cardio_data.csv', sep=';')

In [3]:
data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [4]:
# Drop the 'id' column, which is treated as irrelevant for modelling.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError once 'id' has already been removed.
data = data.drop(columns=['id'], errors='ignore')

In [5]:
data

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [6]:
# Express age in whole years instead of days (365 days per year, rounded to
# the nearest integer). This overwrites the 'age' column, so the cell should
# be run only once per data load.
DAYS_PER_YEAR = 365
age_in_days = data['age']
data['age'] = (age_in_days / DAYS_PER_YEAR).round().astype(int)

In [7]:
data

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,52,1,165,64.0,130,70,3,1,0,0,0,1
3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,48,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,53,2,168,76.0,120,80,1,1,1,0,1,0
69996,62,1,158,126.0,140,90,2,2,0,0,1,1
69997,52,2,183,105.0,180,90,3,1,0,1,0,1
69998,61,1,163,72.0,135,80,1,2,0,0,0,1


In [8]:
# Expand each categorical column into one indicator column per category
categorical_cols = ['gender', 'cholesterol', 'gluc']
data = pd.get_dummies(data, columns=categorical_cols)

In [9]:
data

Unnamed: 0,age,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,50,168,62.0,110,80,0,0,1,0,0,1,1,0,0,1,0,0
1,55,156,85.0,140,90,0,0,1,1,1,0,0,0,1,1,0,0
2,52,165,64.0,130,70,0,0,0,1,1,0,0,0,1,1,0,0
3,48,169,82.0,150,100,0,0,1,1,0,1,1,0,0,1,0,0
4,48,156,56.0,100,60,0,0,0,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,53,168,76.0,120,80,1,0,1,0,0,1,1,0,0,1,0,0
69996,62,158,126.0,140,90,0,0,1,1,1,0,0,1,0,0,1,0
69997,52,183,105.0,180,90,0,1,0,1,0,1,0,0,1,1,0,0
69998,61,163,72.0,135,80,0,0,0,1,1,0,1,0,0,0,1,0


In [10]:
# Separate the target column ('cardio') from the predictor columns
target_col = 'cardio'
y = data[target_col]
X = data.drop(columns=[target_col])

In [11]:
X

Unnamed: 0,age,height,weight,ap_hi,ap_lo,smoke,alco,active,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,50,168,62.0,110,80,0,0,1,0,1,1,0,0,1,0,0
1,55,156,85.0,140,90,0,0,1,1,0,0,0,1,1,0,0
2,52,165,64.0,130,70,0,0,0,1,0,0,0,1,1,0,0
3,48,169,82.0,150,100,0,0,1,0,1,1,0,0,1,0,0
4,48,156,56.0,100,60,0,0,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,53,168,76.0,120,80,1,0,1,0,1,1,0,0,1,0,0
69996,62,158,126.0,140,90,0,0,1,1,0,0,1,0,0,1,0
69997,52,183,105.0,180,90,0,1,0,0,1,0,0,1,1,0,0
69998,61,163,72.0,135,80,0,0,0,1,0,1,0,0,0,1,0


In [12]:
y

0        0
1        1
2        1
3        1
4        0
        ..
69995    0
69996    1
69997    1
69998    1
69999    0
Name: cardio, Length: 70000, dtype: int64

In [13]:
# Normalize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Fit a logistic regression classifier (default hyperparameters) on the training split
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)

In [17]:
# Predict on the held-out test split and report the fraction of correct labels
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7214285714285714


In [18]:
import pickle

In [19]:
# Save the trained model
with open('cardio_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Save the fitted scaler as well: the model was trained on scaled features,
# so new inputs must go through the same scaler before prediction. Without
# this file, the model cannot be used correctly from a fresh session.
with open('cardio_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [20]:
# Restore the pickled classifier from disk so it can score new samples.
# Only unpickle files you trust; this one was written by this notebook.
with open('cardio_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [21]:
# A single hypothetical sample, with the same columns (in the same order)
# as the feature matrix X, including the one-hot indicator columns.
sample_record = {
    'age': 50,
    'height': 165,
    'weight': 75,
    'ap_hi': 120,
    'ap_lo': 80,
    'smoke': 0,
    'alco': 0,
    'active': 1,
    'gender_1': 0,
    'gender_2': 1,
    'cholesterol_1': 0,
    'cholesterol_2': 1,
    'cholesterol_3': 0,
    'gluc_1': 1,
    'gluc_2': 0,
    'gluc_3': 0,
}
# One record -> one-row DataFrame; column order follows the dict order
new_data = pd.DataFrame([sample_record])

In [22]:
new_data = scaler.transform(new_data)

In [23]:
prediction = model.predict(new_data)
print("Prediction:", prediction)

Prediction: [0]
