In [236]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

In [237]:
data = pd.read_csv("framingham.csv")

In [238]:
data.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [239]:
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [240]:
# Imputer for mean
mean_imputer = SimpleImputer(strategy='mean')

# Columns to impute with mean (normally distributed/symmetric)
mean_cols = ['heartRate']

# Impute the columns with mean
data[mean_cols] = mean_imputer.fit_transform(data[mean_cols])

In [241]:
# Imputer for median
median_imputer = SimpleImputer(strategy='median')

# Columns to impute with median (skewed distributions or categorical)
median_cols = ['education', 'cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'glucose']

# Impute the columns with median
data[median_cols] = median_imputer.fit_transform(data[median_cols])

In [242]:
# Checking if there are any missing values left
data.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [243]:
target = "TenYearCHD"
x = data.drop(target, axis=1)
y = data[target]

split data

In [244]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)

scaler với standard scaler

In [245]:
num_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

tạo bộ tiền xử lý dữ liệu, chỉ xử lý dữ liệu mang ý nghĩa là numerical, còn nominal features của bộ dữ liệu này không cần tiền xử lý

In [246]:
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ["age", "cigsPerDay", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose"])
])

xây dựng mô hình

In [247]:
logReg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LogisticRegression())
])
logReg.fit(x_train, y_train)

dự đoán kết quả và tính metric

In [248]:
y_predict = logReg.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      1084
           1       0.61      0.07      0.13       188

    accuracy                           0.86      1272
   macro avg       0.73      0.53      0.53      1272
weighted avg       0.82      0.86      0.80      1272



In [249]:
cm = np.array(confusion_matrix(y_test, y_predict, labels=[0, 1]))
confusion = pd.DataFrame(cm, index=["khong", "co"], columns=["khong", "co"])
confusion

Unnamed: 0,khong,co
khong,1075,9
co,174,14
