In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("cardio_train.csv")

In [3]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,988,22469,1,155,69.0,130,80,2,2,0,0,1,0
1,989,14648,1,163,71.0,110,70,1,1,0,0,1,1
2,990,21901,1,165,70.0,120,80,1,1,0,0,1,0
3,991,14549,2,165,85.0,120,80,1,1,1,1,1,0
4,992,23393,1,155,62.0,120,80,1,1,0,0,1,0


In [4]:
# Feature selection and target variable
X = df.drop(['id', 'cardio'], axis=1)
y = df['cardio']


In [17]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# List of models to train
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))

 # print(f"{name} accuracy: {accuracy_score(y_test, y_pred):.2f}")

Logistic Regression accuracy: 0.72
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      6935
           1       0.74      0.68      0.71      6926

    accuracy                           0.72     13861
   macro avg       0.72      0.72      0.72     13861
weighted avg       0.72      0.72      0.72     13861

Decision Tree accuracy: 0.63
              precision    recall  f1-score   support

           0       0.63      0.64      0.64      6935
           1       0.64      0.63      0.63      6926

    accuracy                           0.63     13861
   macro avg       0.63      0.63      0.63     13861
weighted avg       0.63      0.63      0.63     13861

Random Forest accuracy: 0.71
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      6935
           1       0.72      0.70      0.71      6926

    accuracy                           0.71     13861
   macro avg       0.71      0.71  

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [24]:
# Feature selection and target variable
X = df.drop(['id', 'cardio'], axis=1)
y = df['cardio']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter grids for each model
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    "Decision Tree": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7]
    },
    # "SVM": {
    #     'C': [0.1, 1, 10, 100],
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    # },
    # "KNN": {
    #     'n_neighbors': [3, 5, 7, 9],
    #     'weights': ['uniform', 'distance'],
    #     'metric': ['euclidean', 'manhattan']
    # }
}

# Model classes
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    # "SVM": SVC(),
    # "KNN": KNeighborsClassifier()
}

# Perform GridSearchCV for each model
best_estimators = {}
for name, model in models.items():
    print(f"Training {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# Evaluate the best models
for name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))

Training Logistic Regression...
Best parameters for Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Training Decision Tree...
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5}
Training Random Forest...




Best parameters for Random Forest: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Training Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}

Logistic Regression accuracy: 0.72
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      6935
           1       0.74      0.68      0.71      6926

    accuracy                           0.72     13861
   macro avg       0.72      0.72      0.72     13861
weighted avg       0.72      0.72      0.72     13861


Decision Tree accuracy: 0.73
              precision    recall  f1-score   support

           0       0.73      0.74      0.73      6935
           1       0.73      0.73      0.73      6926

    accuracy                           0.73     13861
   macro avg       0.73      0.73      0.73     13861
weighted avg       0.73      0.73      0.73     13861


Random Forest accuracy: 0.74
   

In [7]:
# Feature selection and target variable
X = df.drop(['id', 'cardio'], axis=1)
y = df['cardio']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter grids for each model
param_grids = {
    # "Logistic Regression": {
    #     'C': [0.01, 0.1, 1, 10, 100],
    #     'solver': ['lbfgs', 'liblinear']
    # },
    # "Decision Tree": {
    #     'criterion': ['gini', 'entropy'],
    #     'max_depth': [None, 10, 20, 30, 40, 50],
    #     'min_samples_split': [2, 5, 10]
    # },
    # "Random Forest": {
    #     'n_estimators': [50, 100, 200],
    #     'criterion': ['gini', 'entropy'],
    #     'max_depth': [None, 10, 20, 30],
    #     'min_samples_split': [2, 5, 10]
    # },
    # "Gradient Boosting": {
    #     'n_estimators': [50, 100, 200],
    #     'learning_rate': [0.01, 0.1, 0.2, 0.3],
    #     'max_depth': [3, 5, 7]
    # },
    "SVM": {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    },
    # "KNN": {
    #     'n_neighbors': [3, 5, 7, 9],
    #     'weights': ['uniform', 'distance'],
    #     'metric': ['euclidean', 'manhattan']
    # }
}

# Model classes
models = {
    # "Logistic Regression": LogisticRegression(),
    # "Decision Tree": DecisionTreeClassifier(),
    # "Random Forest": RandomForestClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(),
    # "KNN": KNeighborsClassifier()
}

# Perform GridSearchCV for each model
best_estimators = {}
for name, model in models.items():
    print(f"Training {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# Evaluate the best models
for name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))

Training SVM...


KeyboardInterrupt: 

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier


# Feature selection and target variable
X = df.drop(['id', 'cardio'], axis=1)
y = df['cardio']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handling imbalanced data using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter grids for each model
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    "Decision Tree": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7]
    },
    # "SVM": {
    #     'C': [0.1, 1, 10, 100],
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    # },
    "KNN": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    "XGBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7],
        'colsample_bytree': [0.3, 0.7]
    }
}

# Model classes
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    # "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier()
}

# Perform GridSearchCV for each model
best_estimators = {}
for name, model in models.items():
    print(f"Training {name}...")
    search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, n_jobs=-1, scoring='accuracy')
    search.fit(X_train, y_train)
    best_estimators[name] = search.best_estimator_
    print(f"Best parameters for {name}: {search.best_params_}")

# Evaluate the best models
for name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))

# Ensemble model: VotingClassifier
ensemble = VotingClassifier(estimators=[(name, model) for name, model in best_estimators.items()], voting='hard')
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
print(f"\nEnsemble accuracy: {accuracy_score(y_test, y_pred_ensemble):.2f}")
print(classification_report(y_test, y_pred_ensemble))


Training Logistic Regression...
Best parameters for Logistic Regression: {'C': 100, 'solver': 'lbfgs'}
Training Decision Tree...
Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2}
Training Random Forest...




Best parameters for Random Forest: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Training Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50}
Training KNN...
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Training XGBoost...
Best parameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}

Logistic Regression accuracy: 0.72
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      6935
           1       0.74      0.68      0.71      6926

    accuracy                           0.72     13861
   macro avg       0.72      0.72      0.72     13861
weighted avg       0.72      0.72      0.72     13861


Decision Tree accuracy: 0.73
              precision    recall  f1-score   support

           0       0.73      0.73      0.73      6935
           

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [18]:


# Feature selection and target variable
X = df.drop(['id', 'cardio'], axis=1)
y = df['cardio']


In [19]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handling imbalanced data using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)


In [20]:

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# Hyperparameter grids for each model
param_grids = {
#     "Logistic Regression": {
#         'C': [0.01, 0.1, 1, 10, 100],
#         'solver': ['lbfgs', 'liblinear']
#     },
#     "Decision Tree": {
#         'criterion': ['gini', 'entropy'],
#         'max_depth': [None, 10, 20, 30, 40, 50],
#         'min_samples_split': [2, 5, 10]
#     },
#     "Random Forest": {
#         'n_estimators': [50, 100, 200],
#         'criterion': ['gini', 'entropy'],
#         'max_depth': [None, 10, 20, 30],
#         'min_samples_split': [2, 5, 10]
#     },
#     "Gradient Boosting": {
#         'n_estimators': [50, 100, 200],
#         'learning_rate': [0.01, 0.1, 0.2, 0.3],
#         'max_depth': [3, 5, 7]
#     },
    "SVM": {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    },
    # "KNN": {
    #     'n_neighbors': [3, 5, 7, 9],
    #     'weights': ['uniform', 'distance'],
    #     'metric': ['euclidean', 'manhattan']
    # },
    # "XGBoost": {
    #     'n_estimators': [50, 100, 200],
    #     'learning_rate': [0.01, 0.1, 0.2, 0.3],
    #     'max_depth': [3, 5, 7],
    #     'colsample_bytree': [0.3, 0.7]
    # }
}

# Model classes
models = {
    # "Logistic Regression": LogisticRegression(),
    # "Decision Tree": DecisionTreeClassifier(),
    # "Random Forest": RandomForestClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(),
    # "KNN": KNeighborsClassifier(),
    # "XGBoost": XGBClassifier()
}

# Perform GridSearchCV for each model
best_estimators = {}
for name, model in models.items():
    print(f"Training {name}...")
    search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, n_jobs=-1, scoring='accuracy')
    search.fit(X_train, y_train)
    best_estimators[name] = search.best_estimator_
    print(f"Best parameters for {name}: {search.best_params_}")

# Evaluate the best models
for name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))

# Ensemble model: VotingClassifier
ensemble = VotingClassifier(estimators=[(name, model) for name, model in best_estimators.items()], voting='hard')
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
print(f"\nEnsemble accuracy: {accuracy_score(y_test, y_pred_ensemble):.2f}")
print(classification_report(y_test, y_pred_ensemble))


Training SVM...


