In [None]:
! pip install kfp



In [None]:
!pip install google-cloud-pipeline-components



In [None]:
!pip install gcsfs



In [None]:
from google.cloud import aiplatform
# Set the default project and region used by later Vertex AI SDK calls in this notebook.
aiplatform.init(
    project='final-project-ise-543',
    location='us-central1',
)

from kfp.v2 import dsl
from kfp.v2.dsl import component, InputPath, OutputPath, pipeline

  from kfp.v2 import dsl


In [None]:
from kfp.v2.dsl import component, InputPath, OutputPath


@component(packages_to_install=["pandas", "fsspec", "gcsfs"])
def load_Dataset(input_dataset_path: str, output_dataset_path : OutputPath('Dataset') ):
    """Read a CSV, one-hot encode it, and write the result as a Dataset artifact.

    Args:
        input_dataset_path: Location of the source CSV, passed straight to
            ``pandas.read_csv``. ``fsspec``/``gcsfs`` are installed so that
            ``gs://`` URIs can be read, matching
            ``perform_initial_data_preparation``.
        output_dataset_path: Path where the encoded CSV is written.
    """
    # Imports live inside the function because the body runs in its own container.
    import pandas as pd

    df = pd.read_csv(input_dataset_path)
    # drop_first=True drops the first level of every encoded categorical column.
    df = pd.get_dummies(df, drop_first=True)
    df.to_csv(output_dataset_path, index=False)

  return component_factory.create_component_from_func(


In [None]:
from kfp.v2.dsl import InputPath, OutputPath, Dataset

@component(packages_to_install=["pandas", "numpy", "fsspec", "gcsfs"])
def perform_initial_data_preparation(input_dataset_path: str, output_dataset_path: OutputPath(Dataset)):
    """Derive engineered features from the raw CSV and one-hot encode it.

    Adds ``age_group`` and ``smoker_type`` (binned from ``age`` and
    ``cigsPerDay``), adds ``log_income``, ``log_sysBP`` and ``log_diaBP``,
    then one-hot encodes every categorical column and writes the result.

    Args:
        input_dataset_path: Location of the source CSV, passed to
            ``pandas.read_csv``.
        output_dataset_path: Path where the prepared CSV is written.
    """
    import pandas as pd
    import numpy as np

    data = pd.read_csv(input_dataset_path)

    # Bin age into left-closed intervals: [0, 35), [35, 55), [55, inf).
    # The last edge is open-ended so that ages >= 100 land in 'Senior' instead
    # of falling outside every bin (which, after drop_first encoding below,
    # would be indistinguishable from the dropped 'Young' baseline).
    age_bins = [0, 35, 55, float('inf')]
    age_labels = ['Young', 'Middle-aged', 'Senior']
    data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=False)

    # Bin cigarettes per day into right-closed intervals:
    # (-1, 0], (0, 10], (10, 20], (20, inf).
    # NOTE(review): rows with a missing cigsPerDay get no smoker_type here, so
    # after drop_first encoding they look like the 'Non-smoker' baseline.
    # Imputation of cigsPerDay happens in a later component - confirm this
    # ordering is intended.
    cig_bins = [-1, 0, 10, 20, float('inf')]
    cig_labels = ['Non-smoker', 'Light smoker', 'Moderate smoker', 'Heavy smoker']
    data['smoker_type'] = pd.cut(data['cigsPerDay'], bins=cig_bins, labels=cig_labels, right=True)

    # Log transforms. Only income is shifted by 1 to avoid log(0); the blood
    # pressure columns are logged as-is and are assumed to be strictly positive.
    data['log_income'] = np.log(data['income'] + 1)
    data['log_sysBP'] = np.log(data['sysBP'])
    data['log_diaBP'] = np.log(data['diaBP'])

    # One-hot encode categorical columns, dropping the first level of each.
    data = pd.get_dummies(data, drop_first=True)

    data.to_csv(output_dataset_path, index=False)


In [None]:
from kfp.v2.dsl import component, InputPath, OutputPath


@component(packages_to_install=["scikit-learn", "pandas"])
def split_dataset(input_dataset_path: InputPath('Dataset'),
                  train_data_path: OutputPath('Dataset'),
                  validation_data_path: OutputPath('Dataset')):
    """Split a CSV dataset into training (80%) and validation (20%) CSVs.

    Args:
        input_dataset_path: Path of the CSV to split.
        train_data_path: Path where the training rows are written.
        validation_data_path: Path where the validation rows are written.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    full_frame = pd.read_csv(input_dataset_path)

    # Fixed seed so the same rows end up in each partition on every run.
    train_part, validation_part = train_test_split(
        full_frame,
        test_size=0.20,
        random_state=42,
    )

    partitions = (
        (train_part, train_data_path),
        (validation_part, validation_data_path),
    )
    for frame, destination in partitions:
        frame.to_csv(destination, index=False)

In [None]:
from kfp.v2.dsl import InputPath, OutputPath, Output, Artifact, component

@component(packages_to_install=["pandas"])
def fill_missing_values(train_data_path: InputPath('Dataset'),
                        completed_data_path: OutputPath('Dataset'),
                        average_cig: Output[Artifact],
                        median_BP: Output[Artifact],
                        median_education: Output[Artifact],
                        average_chol: Output[Artifact],
                        average_BMI: Output[Artifact],
                        average_glucose: Output[Artifact],
                        average_a1c: Output[Artifact],
                        average_heart_rate: Output[Artifact]):
    """Impute missing training values and publish the statistics that were used.

    Drops ``patientID``, fills missing values in eight columns with that
    column's mean or median, writes the completed CSV, and stores each
    statistic under ``metadata['value']`` of the matching output artifact.

    Args:
        train_data_path: Path of the training CSV.
        completed_data_path: Path where the imputed CSV is written.
        average_cig: Receives the mean of ``cigsPerDay``.
        median_BP: Receives the median of ``BPMeds``.
        median_education: Receives the median of ``education``.
        average_chol: Receives the mean of ``totChol``.
        average_BMI: Receives the mean of ``BMI``.
        average_glucose: Receives the mean of ``glucose``.
        average_a1c: Receives the mean of ``a1c``.
        average_heart_rate: Receives the mean of ``heartRate``.
    """
    import pandas as pd

    data_frame = pd.read_csv(train_data_path)

    # patientID is an identifier, not a feature.
    data_frame = data_frame.drop(columns=['patientID'])

    # Imputation value per column, converted to plain Python floats so they
    # are stored as ordinary numbers in the artifact metadata.
    fill_values = {
        'cigsPerDay': float(data_frame['cigsPerDay'].mean()),
        'BPMeds': float(data_frame['BPMeds'].median()),
        'education': float(data_frame['education'].median()),
        'totChol': float(data_frame['totChol'].mean()),
        'BMI': float(data_frame['BMI'].mean()),
        'glucose': float(data_frame['glucose'].mean()),
        'a1c': float(data_frame['a1c'].mean()),
        'heartRate': float(data_frame['heartRate'].mean()),
    }

    # A single DataFrame-level fillna replaces the per-column
    # `df[col].fillna(..., inplace=True)` calls: that chained in-place form
    # operates on a temporary and stops updating the frame under pandas
    # Copy-on-Write.
    data_frame = data_frame.fillna(value=fill_values)

    data_frame.to_csv(completed_data_path, index=False)

    # Publish each statistic so the validation data can be imputed with the
    # values learned from the training data.
    stat_artifacts = {
        'cigsPerDay': average_cig,
        'BPMeds': median_BP,
        'education': median_education,
        'totChol': average_chol,
        'BMI': average_BMI,
        'glucose': average_glucose,
        'a1c': average_a1c,
        'heartRate': average_heart_rate,
    }
    for column, artifact in stat_artifacts.items():
        artifact.metadata['value'] = fill_values[column]


In [None]:
from kfp.v2.dsl import InputPath, OutputPath, Input, Model, component

@component(packages_to_install=["pandas"])
def correct_missing_values(validation_data_path: InputPath('Dataset'),
                           corrected_data_path: OutputPath('Dataset'),
                           average_cigs: Input[Artifact],
                           median_BP: Input[Artifact],
                           median_education: Input[Artifact],
                           average_chol: Input[Artifact],
                           average_BMI: Input[Artifact],
                           average_glucose: Input[Artifact],
                           average_a1c: Input[Artifact],
                           average_heart_rate: Input[Artifact]):
    """Impute missing validation values using precomputed statistics.

    Drops ``patientID`` and fills missing values in eight columns with the
    number stored under ``metadata['value']`` of the corresponding input
    artifact, then writes the corrected CSV.

    Args:
        validation_data_path: Path of the validation CSV.
        corrected_data_path: Path where the imputed CSV is written.
        average_cigs: Fill value for ``cigsPerDay``.
        median_BP: Fill value for ``BPMeds``.
        median_education: Fill value for ``education``.
        average_chol: Fill value for ``totChol``.
        average_BMI: Fill value for ``BMI``.
        average_glucose: Fill value for ``glucose``.
        average_a1c: Fill value for ``a1c``.
        average_heart_rate: Fill value for ``heartRate``.
    """
    import pandas as pd

    data_frame = pd.read_csv(validation_data_path)

    # patientID is an identifier, not a feature.
    data_frame = data_frame.drop(columns=['patientID'])

    fill_values = {
        'cigsPerDay': average_cigs.metadata['value'],
        'BPMeds': median_BP.metadata['value'],
        'education': median_education.metadata['value'],
        'totChol': average_chol.metadata['value'],
        'BMI': average_BMI.metadata['value'],
        'glucose': average_glucose.metadata['value'],
        'a1c': average_a1c.metadata['value'],
        'heartRate': average_heart_rate.metadata['value'],
    }

    # A single DataFrame-level fillna replaces the per-column
    # `df[col].fillna(..., inplace=True)` calls: that chained in-place form
    # operates on a temporary and stops updating the frame under pandas
    # Copy-on-Write.
    data_frame = data_frame.fillna(value=fill_values)

    data_frame.to_csv(corrected_data_path, index=False)


In [None]:
@component(packages_to_install=["pandas", "numpy", "scikit-learn", "imbalanced-learn==0.11.0"])
def apply_SMOTE(input_file_path: InputPath('Dataset'), output_file_path: OutputPath('Dataset')):
    """Balance the ``TenYearCHD`` classes of a dataset with SMOTE oversampling.

    Args:
        input_file_path: Path of the CSV to balance; must contain a
            ``TenYearCHD`` column, which is used as the target.
        output_file_path: Path where the balanced CSV (features followed by
            ``TenYearCHD``) is written.
    """
    import pandas as pd
    from imblearn.over_sampling import SMOTE

    data = pd.read_csv(input_file_path)

    features = data.drop('TenYearCHD', axis=1)
    target = data['TenYearCHD']

    # Fixed seed so the synthetic samples are the same on every run, matching
    # the seeded train/validation split.
    smote = SMOTE(random_state=42)
    features_balanced, target_balanced = smote.fit_resample(features, target)

    # With pandas inputs fit_resample returns a DataFrame and a named Series,
    # so they can be concatenated directly.
    balanced_data = pd.concat([features_balanced, target_balanced], axis=1)

    balanced_data.to_csv(output_file_path, index=False)


In [None]:
@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def train_decision_tree(training_dataset_path: InputPath('Dataset'),
                        trained_model_artifact: Output[Model]):
    """Train a decision tree on ``TenYearCHD`` and save it as a joblib file.

    Args:
        training_dataset_path: Path of the training CSV; must contain a
            ``TenYearCHD`` column, which is used as the target.
        trained_model_artifact: Output model artifact. Its path is created as
            a directory and the model is written inside it as
            ``decision_tree_model.joblib``.
    """
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier
    import joblib
    import os

    train_df = pd.read_csv(training_dataset_path)

    X_train = train_df.drop('TenYearCHD', axis=1)
    y_train = train_df['TenYearCHD']

    # Fixed seed so retraining on the same data yields the same tree.
    decision_tree_model = DecisionTreeClassifier(random_state=42)
    decision_tree_model.fit(X_train, y_train)

    # The artifact path is used as a directory holding the serialized model.
    os.makedirs(trained_model_artifact.path, exist_ok=True)
    joblib.dump(decision_tree_model, os.path.join(trained_model_artifact.path, "decision_tree_model.joblib"))


In [None]:
from kfp.v2.dsl import Metrics

@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def evaluate_model(test_dataset_path: InputPath('Dataset'),
                   model: Input[Model],
                   metrics: Output[Metrics]):
    """Score the trained model on a held-out dataset and log the metrics.

    Logs accuracy, weighted F1, and the four cells of the binary confusion
    matrix for the ``TenYearCHD`` target.

    Args:
        test_dataset_path: Path of the evaluation CSV; must contain a
            ``TenYearCHD`` column.
        model: Model artifact whose path is a directory containing
            ``decision_tree_model.joblib``.
        metrics: Output metrics artifact that receives the logged values.
    """
    import os

    import pandas as pd
    import joblib
    from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

    test_df = pd.read_csv(test_dataset_path)
    X_test = test_df.drop(columns=['TenYearCHD'])
    y_test = test_df['TenYearCHD']

    # Built with os.path.join to mirror how the training component writes it.
    model_file_path = os.path.join(model.path, "decision_tree_model.joblib")
    trained_model = joblib.load(model_file_path)

    y_pred = trained_model.predict(X_test)

    # Pin the label order to the classes the model was trained on. Without
    # `labels`, the matrix shrinks to 1x1 when the data and predictions contain
    # a single class and the 4-way unpack below raises.
    tn, fp, fn, tp = confusion_matrix(
        y_test, y_pred, labels=trained_model.classes_
    ).ravel()

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    metrics.log_metric("accuracy", accuracy)
    metrics.log_metric("f1_score", f1)
    # Cast numpy integers to plain ints before logging.
    metrics.log_metric("true_negatives", int(tn))
    metrics.log_metric("false_positives", int(fp))
    metrics.log_metric("false_negatives", int(fn))
    metrics.log_metric("true_positives", int(tp))


In [None]:
from kfp.v2.dsl import pipeline, Output, Dataset, component

@pipeline(name='Final-project-pipeline')
def final_project_pipeline(training_dataset_path: str):
    """Wire the components into one flow: prepare, split, impute, balance, train, evaluate.

    Args:
        training_dataset_path: Location of the raw CSV handed to the data
            preparation step.
    """
    # Feature engineering on the raw data, then the train/validation split.
    prepared = perform_initial_data_preparation(input_dataset_path=training_dataset_path)
    splits = split_dataset(input_dataset_path=prepared.output)

    # Impute the training partition; this step also emits the statistics it used.
    train_imputation = fill_missing_values(
        train_data_path=splits.outputs['train_data_path'])
    train_stats = train_imputation.outputs

    # Impute the validation partition with the training statistics.
    validation_imputation = correct_missing_values(
        validation_data_path=splits.outputs['validation_data_path'],
        average_cigs=train_stats['average_cig'],
        median_BP=train_stats['median_BP'],
        median_education=train_stats['median_education'],
        average_chol=train_stats['average_chol'],
        average_BMI=train_stats['average_BMI'],
        average_glucose=train_stats['average_glucose'],
        average_a1c=train_stats['average_a1c'],
        average_heart_rate=train_stats['average_heart_rate'])

    # Oversample only the imputed training data.
    balanced_training = apply_SMOTE(
        input_file_path=train_stats['completed_data_path'])

    # Fit the decision tree on the balanced training data.
    tree_training = train_decision_tree(
        training_dataset_path=balanced_training.outputs['output_file_path'])

    # Score the model on the imputed validation data.
    evaluate_model(
        test_dataset_path=validation_imputation.outputs['corrected_data_path'],
        model=tree_training.outputs['trained_model_artifact'])


In [None]:
from kfp.v2 import compiler

# Compile the pipeline definition to a local spec file, then build (but do not
# yet submit) the Vertex AI job that will execute it.
pipeline_spec_file = 'final_pipeline.json'

compiler.Compiler().compile(
    pipeline_func=final_project_pipeline,
    package_path=pipeline_spec_file,
)

job_parameters = {
    'training_dataset_path': 'gs://final_project_ise_543/Final Project Dataset.csv',
}

pipeline_job = aiplatform.PipelineJob(
    display_name='final_modeling_pipeline',
    template_path=pipeline_spec_file,
    pipeline_root='gs://final-project-ise-543',
    parameter_values=job_parameters,
    enable_caching=True,
)

In [None]:
# Submit the pipeline job built in the previous cell; the captured log output
# below shows the job being created and its state being reported.
pipeline_job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/67845614716/locations/us-central1/pipelineJobs/final-project-pipeline-20240501023307
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/67845614716/locations/us-central1/pipelineJobs/final-project-pipeline-20240501023307')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/final-project-pipeline-20240501023307?project=67845614716
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/67845614716/locations/us-central1/pipelineJobs/final-project-pipeline-20240501023307 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/67845614716/locations/us-cent