<a href="https://colab.research.google.com/github/Arrnv/major-ai/blob/main/major_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [12]:
df = pd.read_csv('/content/drive/My Drive/Major_dataset/cardio_data_processed.csv')
df.count()

id                     68205
age                    68205
gender                 68205
height                 68205
weight                 68205
ap_hi                  68205
ap_lo                  68205
cholesterol            68205
gluc                   68205
smoke                  68205
alco                   68205
active                 68205
cardio                 68205
age_years              68205
bmi                    68205
bp_category            68205
bp_category_encoded    68205
dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
df = df.drop(columns=['bp_category_encoded'])

In [6]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,Hypertension Stage 1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,Hypertension Stage 2
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,Hypertension Stage 1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,Hypertension Stage 2
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal


In [7]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.0-py3-none-any.whl (379 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m379.9/379.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.0


In [8]:
X = df.drop(['cardio'],axis=1)
y = df['cardio']

In [9]:
from sklearn.preprocessing import OneHotEncoder

# Define a function to preprocess data
def preprocess_data(df):
    # Perform label encoding for categorical features
    le = LabelEncoder()
    df['bp_category'] = le.fit_transform(df['bp_category'])

    # One-hot encode categorical features if needed
    # This step is only necessary if you prefer one-hot encoding over label encoding
    # If you prefer label encoding, you can remove this step
    ohe = OneHotEncoder()
    bp_category_encoded = ohe.fit_transform(df[['bp_category']]).toarray()
    bp_category_encoded_df = pd.DataFrame(bp_category_encoded, columns=[f'bp_category_{i}' for i in range(bp_category_encoded.shape[1])])

    # Concatenate the encoded features with the original DataFrame
    df = pd.concat([df.drop(columns=['bp_category']), bp_category_encoded_df], axis=1)

    X = df.drop(['cardio'], axis=1)
    y = df['cardio']
    return X, y


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin

# Define a custom transformer for preprocessing data
class CustomPreprocessor(TransformerMixin):
    def __init__(self):
        self.le = LabelEncoder()

    def fit(self, X, y=None):
        self.le.fit(X['bp_category'])
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy['bp_category'] = self.le.transform(X_copy['bp_category'])
        return X_copy

# Define XGBoost objective function for Optuna optimization
def objective(trial, X_train, X_test, y_train, y_test):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-4, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-4, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "eta": trial.suggest_float("eta", 1e-2, 0.5, log=True),
    }

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Preprocess data
X, y = preprocess_data(df)

# Split preprocessed data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Define the pipeline
pipeline = Pipeline([
    ('preprocess', CustomPreprocessor()),  # Use the custom preprocessor
    ('clf', xgb.XGBClassifier()),  # Final classifier
])

# Define Optuna study for hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test), n_trials=10)  # Decreased number of trials

# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Set the best hyperparameters to the classifier
pipeline.named_steps['clf'].set_params(**best_params)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)


[I 2024-03-29 08:08:38,410] A new study created in memory with name: no-name-9bb996fa-c94f-4170-bee0-49f99c441865
[I 2024-03-29 08:09:38,908] Trial 0 finished with value: 0.7293519695044473 and parameters: {'booster': 'dart', 'lambda': 0.049497563535252205, 'alpha': 0.3477116186505048, 'max_depth': 6, 'eta': 0.010981841842673751}. Best is trial 0 with value: 0.7293519695044473.
[I 2024-03-29 08:09:39,515] Trial 1 finished with value: 0.7278858371615677 and parameters: {'booster': 'gbtree', 'lambda': 0.03031790196726632, 'alpha': 0.427601584163803, 'max_depth': 3, 'eta': 0.46398094880864693}. Best is trial 0 with value: 0.7293519695044473.
[I 2024-03-29 08:10:43,667] Trial 2 finished with value: 0.730231648910175 and parameters: {'booster': 'dart', 'lambda': 0.40034403281956654, 'alpha': 0.7415250770608929, 'max_depth': 6, 'eta': 0.07425236117420708}. Best is trial 2 with value: 0.730231648910175.
[I 2024-03-29 08:10:44,097] Trial 3 finished with value: 0.7283745479425277 and parameters

Best Hyperparameters: {'booster': 'gbtree', 'lambda': 0.0034527850930582, 'alpha': 0.11377615304615821, 'max_depth': 6, 'eta': 0.0388523334687521}
Accuracy: 0.7310135861597107


In [14]:
import joblib

# Train your XGBoost model (already done in your code)
# Save the model
joblib.dump(pipeline, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [15]:
import os

# Get the current working directory
current_directory = os.getcwd()

# List files in the current directory
files_in_directory = os.listdir(current_directory)

# Print the current directory and list of files
print("Current directory:", current_directory)
print("Files in directory:", files_in_directory)

Current directory: /content
Files in directory: ['.config', 'drive', 'xgboost_model.pkl', 'sample_data']


# New Section

In [19]:
import joblib

# Save the model to Google Drive
joblib.dump(pipeline, '/content/drive/My Drive/Major_dataset/xgboost_model.pkl')

['/content/drive/My Drive/Major_dataset/xgboost_model.pkl']