# Load and initialize required libraries

In [None]:
! pip install kfp



In [None]:
!pip install google-cloud-pipeline-components



In [None]:
!pip install gcsfs



In [None]:
!pip install scikit-learn



#  Set parameters and initialize aiplatform client library

In [None]:
# Google Cloud project and region handed to aiplatform.init
project_id = "ise543-module7-homework-418819"
location = "us-central1"

In [None]:
from google.cloud import aiplatform
# Configure the Vertex AI SDK with the project and region set in the parameters cell
aiplatform.init(
    location=location,
    project=project_id,
)

from kfp.v2.dsl import pipeline, component

  from kfp.v2.dsl import pipeline, component


# Define components

## Common dataset preparation steps

In [None]:
from kfp.v2.dsl import InputPath, OutputPath, Dataset

@component(packages_to_install=["pandas", "numpy", "fsspec", "gcsfs"])
def perform_initial_data_preparation(input_dataset_path: str, output_dataset_path: OutputPath(Dataset)):
    """Apply the dataset-wide preparation steps and write the result as CSV.

    Adds ``income_log`` (``log1p`` of ``income``), drops the ``income`` and
    ``glucose`` columns, and fills missing ``education`` values with 5.

    Args:
        input_dataset_path: Location of the raw CSV; passed straight to
            ``pandas.read_csv``.
        output_dataset_path: Path the prepared CSV is written to (no index column).
    """
    import pandas as pd
    import numpy as np

    df = pd.read_csv(input_dataset_path)

    df['income_log'] = np.log1p(df['income'])
    df = df.drop(columns=['income', 'glucose'])

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the column selection: the chained in-place form is deprecated and leaves
    # `df` untouched when pandas Copy-on-Write is active.
    df['education'] = df['education'].fillna(5)

    df.to_csv(output_dataset_path, index=False)


  return component_factory.create_component_from_func(


## One-Hot encoding

In [None]:
from kfp.v2.dsl import InputPath

@component(packages_to_install=["pandas", "scikit-learn"])
def onehot_encoding(dataset_path: InputPath('Dataset'),
                  output_path: OutputPath('Dataset')
                  ):
    """Append 0/1 indicator columns for the ``education`` levels.

    Level 1 is the reference category, so the columns ``education_2`` ..
    ``education_5`` are appended to the dataset and the result is written as CSV.

    Args:
        dataset_path: CSV containing an ``education`` column.
        output_path: Path the encoded CSV is written to (no index column).

    Raises:
        ValueError: If ``education`` holds a value outside the levels 1-5.
    """
    import pandas as pd

    df = pd.read_csv(dataset_path)

    # Build each indicator from an explicit level list rather than relabelling
    # get_dummies output positionally: a level that happens to be absent from
    # the data then yields an all-zero column instead of a length-mismatch
    # error, and the column names can never drift from the values they encode.
    education_levels = [1, 2, 3, 4, 5]
    unexpected = set(df['education'].dropna().unique()) - set(education_levels)
    if unexpected:
        raise ValueError(
            f"Unexpected education levels: {sorted(map(str, unexpected))}"
        )

    df_education = pd.DataFrame(
        {
            f'education_{level}': (df['education'] == level).astype(int)
            for level in education_levels[1:]
        },
        index=df.index,
    )

    # NOTE(review): the original `education` column is kept alongside the
    # indicator columns, as before — confirm this is intended.
    df = pd.concat([df, df_education], axis=1)
    df.to_csv(output_path, index=False)

## Train-Test Split

In [None]:
from kfp.v2.dsl import InputPath

@component(packages_to_install=["pandas", "scikit-learn"])
def split_dataset(dataset_path: InputPath('Dataset'),
                  training_dataset_path: OutputPath('Dataset'),
                  validation_dataset_path: OutputPath('Dataset')):
    """Split the dataset into training and validation partitions.

    Holds out 20% of the rows for validation with ``random_state=42`` and
    writes each partition as CSV without the index column.

    Args:
        dataset_path: CSV to split.
        training_dataset_path: Destination of the training partition.
        validation_dataset_path: Destination of the validation partition.
    """
    from sklearn.model_selection import train_test_split
    import pandas as pd

    full_df = pd.read_csv(dataset_path)

    # train_test_split returns the partitions in (train, validation) order
    partitions = train_test_split(full_df, test_size=0.20, random_state=42)
    destinations = (training_dataset_path, validation_dataset_path)

    for partition, destination in zip(partitions, destinations):
        partition.to_csv(destination, index=False)

## Outlier Handling Training Component

In [None]:
from kfp.v2.dsl import InputPath, Output, Artifact

@component(packages_to_install=["pandas", "scikit-learn", "numpy"])
def outlier_training(training_dataset_path: InputPath('Dataset'),
                  training_outlier_output_path: OutputPath('Dataset'),
                  iqr_values: Output[Artifact]):
    """Cap outliers in the training partition and record the quartile statistics.

    Quartiles are computed over every column of the training data. For the
    capped columns that are present, values below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` are replaced by the respective bound.

    Args:
        training_dataset_path: CSV holding the training partition.
        training_outlier_output_path: Destination of the capped training CSV.
        iqr_values: Artifact whose metadata receives the column names and the
            ``Q1``, ``Q3`` and ``IQR`` values.
    """
    import numpy as np
    import pandas as pd

    capped_columns = ['a1c', 'cigsPerDay', 'income_log', 'totChol', 'sysBP',  'BMI']

    frame = pd.read_csv(training_dataset_path)

    first_quartile = frame.quantile(0.25)
    third_quartile = frame.quantile(0.75)
    spread = third_quartile - first_quartile

    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread

    # Only cap the listed columns that actually exist in the data
    for name in [c for c in frame.columns if c in capped_columns]:
        frame[name] = np.where(frame[name] < floor[name], floor[name], frame[name])
        frame[name] = np.where(frame[name] > ceiling[name], ceiling[name], frame[name])

    frame.to_csv(training_outlier_output_path, index=False)

    # Publish the statistics through the artifact metadata
    iqr_values.metadata['columns'] = ', '.join(frame.columns.tolist())
    iqr_values.metadata['Q1'] = first_quartile.to_dict()
    iqr_values.metadata['Q3'] = third_quartile.to_dict()
    iqr_values.metadata['IQR'] = spread.to_dict()

## Outlier Handling Validation component

In [None]:
from kfp.v2.dsl import Input
from kfp.v2.dsl import Model

# numpy is imported by the function body, so it is declared explicitly here
# rather than being picked up only as a dependency of pandas.
@component(packages_to_install=["pandas", "numpy"])
def outlier_validation(validation_dataset_path: InputPath('Dataset'),
                      validation_outlier_output_path: OutputPath('Dataset'),
                      iqr_values: Input[Artifact]):
    """Cap outliers in the validation partition using stored quartile statistics.

    The bounds ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are rebuilt from the
    ``Q1``, ``Q3`` and ``IQR`` entries of the artifact metadata, so no
    statistics are computed from the validation data itself.

    Args:
        validation_dataset_path: CSV holding the validation partition.
        validation_outlier_output_path: Destination of the capped validation CSV.
        iqr_values: Artifact whose metadata carries ``Q1``, ``Q3`` and ``IQR``
            dictionaries keyed by column name.
    """
    import pandas as pd
    import numpy as np

    # Load the validation dataset
    df = pd.read_csv(validation_dataset_path)

    # Rebuild the per-column statistics from the artifact metadata
    iqr_metadata = iqr_values.metadata
    Q1 = pd.Series(iqr_metadata['Q1'])
    Q3 = pd.Series(iqr_metadata['Q3'])
    IQR = pd.Series(iqr_metadata['IQR'])

    # Cap values that fall outside the bounds derived from the stored statistics
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    for col in df.columns:
        if col in ['a1c', 'cigsPerDay', 'income_log', 'totChol', 'sysBP',  'BMI']:
            df[col] = np.where(df[col] < lower_bound[col], lower_bound[col], df[col])
            df[col] = np.where(df[col] > upper_bound[col], upper_bound[col], df[col])

    # Save the capped dataframe to the output path
    df.to_csv(validation_outlier_output_path, index=False)

## Impute training component

In [None]:

from kfp.v2.dsl import Output
from kfp.v2.dsl import Artifact

@component(packages_to_install=["pandas"])
def impute_training(training_dataset_path: InputPath('Dataset'),
                   imputed_dataset_path: OutputPath('Dataset'),
                   imputed_values: Output[Artifact]):
    """Fill missing values in the training partition with per-column medians.

    Args:
        training_dataset_path: CSV holding the training partition.
        imputed_dataset_path: Destination of the imputed training CSV.
        imputed_values: Artifact whose metadata receives the imputed column
            names (``columns``) and their medians (``medians``).
    """
    import pandas as pd

    # Load the training dataset
    df = pd.read_csv(training_dataset_path)

    # Columns to impute median values for
    columns_to_impute = ['cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'heartRate', 'a1c']

    # Calculate the median of each column from the training data
    median_values = df[columns_to_impute].median()

    # Fill by assignment: fillna(inplace=True) on a column selection is a
    # chained in-place operation, which is deprecated and leaves `df`
    # unchanged when pandas Copy-on-Write is active. Passing the Series of
    # medians fills each column with its own median.
    df[columns_to_impute] = df[columns_to_impute].fillna(median_values)

    # Save the imputed dataframe to the output path
    df.to_csv(imputed_dataset_path, index=False)

    # Output the median values
    imputed_values.metadata['columns'] = ', '.join(columns_to_impute)
    imputed_values.metadata['medians'] = median_values.to_dict()

## Impute validation component

In [None]:
from kfp.v2.dsl import Input
from kfp.v2.dsl import Model

@component(packages_to_install=["pandas"])
def impute_validation(validation_dataset_path: InputPath('Dataset'),
                      imputed_dataset_path: OutputPath('Dataset'),
                      imputed_values: Input[Artifact]):
    """Fill missing values in the validation partition with stored medians.

    Args:
        validation_dataset_path: CSV holding the validation partition.
        imputed_dataset_path: Destination of the imputed validation CSV.
        imputed_values: Artifact whose ``medians`` metadata entry maps each
            imputed column name to the value used to fill it.

    Raises:
        KeyError: If a column to impute has no entry in the stored medians.
    """
    import pandas as pd

    # Load the validation dataset
    df = pd.read_csv(validation_dataset_path)

    # Columns to impute median values for
    columns_to_impute = ['cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'heartRate', 'a1c']

    medians = imputed_values.metadata['medians']

    # Fill by assignment: fillna(inplace=True) on a column selection is a
    # chained in-place operation, which is deprecated and leaves `df`
    # unchanged when pandas Copy-on-Write is active.
    for col in columns_to_impute:
        df[col] = df[col].fillna(medians[col])

    # Save the imputed dataframe to the output path
    df.to_csv(imputed_dataset_path, index=False)

## Normalise Training

In [None]:
from kfp.v2.dsl import Output
from kfp.v2.dsl import Artifact

@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def normalise_training(
    training_dataset_path: InputPath('Dataset'),
    normalised_training_dataset_path: OutputPath('Dataset'),
    scaler_path: Output[Artifact]
):
    """Standardise the continuous columns of the training partition.

    A ``StandardScaler`` is fitted on the selected columns, the scaled values
    replace the originals, and the fitted scaler is saved with joblib.

    Args:
        training_dataset_path: CSV holding the training partition.
        normalised_training_dataset_path: Destination of the scaled training CSV.
        scaler_path: Artifact whose ``path`` receives the dumped scaler.
    """
    import joblib
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    columns_to_scale = ['age',  'cigsPerDay', 'totChol', 'sysBP',  'BMI', 'heartRate',  'a1c', 'income_log', 'diaBP']

    frame = pd.read_csv(training_dataset_path)

    # Fit on the training columns and overwrite them with the scaled values
    scaler = StandardScaler()
    frame[columns_to_scale] = scaler.fit_transform(frame[columns_to_scale])

    frame.to_csv(normalised_training_dataset_path, index=False)

    # Persist the fitted scaler in the artifact
    joblib.dump(scaler, scaler_path.path)


## Normalise Validation

In [None]:
from kfp.v2.dsl import Input
from kfp.v2.dsl import Artifact

@component(packages_to_install=["pandas", "scikit-learn","joblib"])
def normalise_validation(
                       validation_dataset_path: InputPath('Dataset'),
                       scaler_path: Input[Artifact],
                      normalised_validation_dataset_path: OutputPath('Dataset')
                       ):
    """Standardise the validation partition with a previously saved scaler.

    The scaler is loaded with joblib and only ``transform`` is called on it,
    so nothing is fitted on the validation data.

    Args:
        validation_dataset_path: CSV holding the validation partition.
        scaler_path: Artifact whose ``path`` holds the joblib-dumped scaler.
        normalised_validation_dataset_path: Destination of the scaled
            validation CSV.
    """
    import joblib
    import pandas as pd
    # Imported as in the original cell; the name is not referenced below.
    from sklearn.preprocessing import StandardScaler

    columns_to_scale = ['age',  'cigsPerDay', 'totChol', 'sysBP',  'BMI', 'heartRate',  'a1c', 'income_log', 'diaBP']

    fitted_scaler = joblib.load(scaler_path.path)
    frame = pd.read_csv(validation_dataset_path)

    # Apply the stored scaling to the selected columns
    frame[columns_to_scale] = fitted_scaler.transform(frame[columns_to_scale])

    # Save the scaled dataframe to the output path
    frame.to_csv(normalised_validation_dataset_path, index=False)

#  Perform SMOTE oversampling on training partition

In [None]:
@component(packages_to_install=["pandas", "numpy", "scikit-learn", "imbalanced-learn==0.11.0"])
def perform_SMOTE(imputed_training_path:  InputPath('Dataset'),
                  smote_output_path: OutputPath('Dataset')):
    """Oversample the training partition with SMOTE.

    The ``TenYearCHD`` column is the target; every other column is a feature.
    The resampled features and target are rejoined, target last, and written
    as CSV.

    Args:
        imputed_training_path: CSV holding the training partition.
        smote_output_path: Destination of the oversampled CSV.
    """
    import numpy as np
    import pandas as pd
    from imblearn.over_sampling import SMOTE

    source_df = pd.read_csv(imputed_training_path)

    features = source_df.drop('TenYearCHD', axis = 1)
    target = source_df['TenYearCHD']

    # Default sampling strategy with a fixed seed
    features_res, target_res = SMOTE(random_state=42).fit_resample(features, target)

    # Rebuild labelled frames from the resampled data and join them column-wise
    resampled_parts = [
        pd.DataFrame(features_res, columns=features.columns),
        pd.DataFrame(target_res, columns=['TenYearCHD']),
    ]
    oversampled_df = pd.concat(resampled_parts, axis=1)

    oversampled_df.to_csv(smote_output_path, index=False)

#  Train a model

In [None]:
@component(packages_to_install=["pandas", "scikit-learn", "joblib"])
def train_model(training_dataset_path: InputPath('Dataset'),
                              trained_model_artifact: OutputPath('Model')):
    """Fit a random forest on the training data and save it with joblib.

    Args:
        training_dataset_path: CSV whose ``TenYearCHD`` column is the target
            and whose remaining columns are the features.
        trained_model_artifact: Path the fitted model is dumped to.
    """
    import joblib
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    frame = pd.read_csv(training_dataset_path)

    features = frame.drop(columns=['TenYearCHD'])
    labels = frame['TenYearCHD']

    # Fixed seed so repeated runs build the same forest
    forest = RandomForestClassifier(random_state=42)
    forest.fit(features, labels)

    joblib.dump(forest, trained_model_artifact)

#  Evaluate model

In [None]:
from kfp.v2.dsl import Metrics

# NOTE(review): xgboost is installed here but not imported by this function —
# confirm whether it is still needed.
@component(packages_to_install=["pandas", "scikit-learn", "joblib", "xgboost==1.6.2" ])
def evaluate_model(test_dataset_path: InputPath('Dataset'),
                   model: InputPath('Model'),
                   metrics: Output[Metrics]):
    """Score a trained model on a held-out dataset and log the metrics.

    Logs accuracy, weighted F1, the four confusion-matrix cells and the
    average precision (``auc_prc``).

    Args:
        test_dataset_path: CSV whose ``TenYearCHD`` column is the target and
            whose remaining columns are the features.
        model: Path of the joblib-dumped trained model.
        metrics: Metrics artifact that receives the logged values.
    """
    import pandas as pd
    import joblib
    from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, average_precision_score

    # Load the evaluation dataset
    test_df = pd.read_csv(test_dataset_path)
    X_test = test_df.drop(columns=['TenYearCHD'])
    y_test = test_df['TenYearCHD']

    # Load the trained model
    trained_model = joblib.load(model)

    # Make predictions
    y_pred = trained_model.predict(X_test)

    # Calculate the confusion matrix and extract components
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Average precision summarises the precision-recall curve, so it needs a
    # continuous score for the positive class. Hard 0/1 predictions collapse
    # the curve to a single operating point, so use the predicted probability
    # of class 1 when the model exposes it and fall back to the labels otherwise.
    classes = list(getattr(trained_model, "classes_", []))
    if hasattr(trained_model, "predict_proba") and 1 in classes:
        y_score = trained_model.predict_proba(X_test)[:, classes.index(1)]
    else:
        y_score = y_pred
    auc_prc = average_precision_score(y_test, y_score)

    # Log the metrics, with each confusion-matrix cell as its own entry
    metrics.log_metric("accuracy", accuracy)
    metrics.log_metric("f1_score", f1)
    metrics.log_metric("true_negatives", int(tn))
    metrics.log_metric("false_positives", int(fp))
    metrics.log_metric("false_negatives", int(fn))
    metrics.log_metric("true_positives", int(tp))
    metrics.log_metric("auc_prc", auc_prc)



# Define pipeline

In [None]:
from kfp.v2.dsl import pipeline, Output, Dataset

@pipeline(name='fp-pipeline')
def fp_pipeline(training_dataset_path:str):
    # Wires the components into one pipeline. Each training-side step hands
    # its fitted statistics (IQR values, medians, scaler) to the matching
    # validation-side step.

    # Dataset-wide preparation, one-hot encoding, then the train/validation split
    prepared = perform_initial_data_preparation(input_dataset_path=training_dataset_path)
    encoded = onehot_encoding(dataset_path=prepared.outputs['output_dataset_path'])
    partitions = split_dataset(dataset_path=encoded.outputs['output_path'])

    # Outlier capping: compute on training, reuse on validation
    capped_training = outlier_training(
        training_dataset_path=partitions.outputs['training_dataset_path'],
    )
    capped_validation = outlier_validation(
        validation_dataset_path=partitions.outputs['validation_dataset_path'],
        iqr_values=capped_training.outputs['iqr_values'],
    )

    # Median imputation: compute on training, reuse on validation
    imputed_training = impute_training(
        training_dataset_path=capped_training.outputs['training_outlier_output_path'],
    )
    imputed_validation = impute_validation(
        validation_dataset_path=capped_validation.outputs['validation_outlier_output_path'],
        imputed_values=imputed_training.outputs['imputed_values'],
    )

    # Normalisation: fit the scaler on training, apply it to validation
    scaled_training = normalise_training(
        training_dataset_path=imputed_training.outputs['imputed_dataset_path'],
    )
    scaled_validation = normalise_validation(
        validation_dataset_path=imputed_validation.outputs['imputed_dataset_path'],
        scaler_path=scaled_training.outputs['scaler_path'],
    )

    # SMOTE oversampling is applied to the training partition only
    balanced_training = perform_SMOTE(
        imputed_training_path=scaled_training.outputs['normalised_training_dataset_path'],
    )

    # Train a Random Forest model on the oversampled data
    fitted = train_model(training_dataset_path=balanced_training.outputs['smote_output_path'])

    # Evaluate the model on the normalised validation partition
    evaluate_model(
        test_dataset_path=scaled_validation.outputs['normalised_validation_dataset_path'],
        model=fitted.outputs['trained_model_artifact'],
    )


#  Compile and run pipeline

In [None]:
# Region name and the Cloud Storage bucket used in the gsutil grants
REGION = "us-central1"
BUCKET_URI = 'gs://finalproject_ise543'

In [None]:
# Service account that is granted access to the bucket
SERVICE_ACCOUNT = '753516815850-compute@developer.gserviceaccount.com'

In [None]:
!gsutil iam ch serviceAccount : {SERVICE_ACCOUNT}: roles/storage.objectCreator $BUCKET_URI
!gsutil iam ch serviceAccount : {SERVICE_ACCOUNT}: roles/storage.objectViewer $BUCKET_URI

CommandException: Must specify a role to grant.
CommandException: Must specify a role to grant.


In [None]:
from kfp.v2 import compiler

# Compile the pipeline function into a job specification file
compiler.Compiler().compile(
    package_path='fp_pipeline.json',
    pipeline_func=fp_pipeline,
)

# Create a Vertex AI pipeline job from the compiled specification
pipeline_job = aiplatform.PipelineJob(
    template_path='fp_pipeline.json',
    display_name='fp_modeling_pipeline',
    pipeline_root='gs://finalproject_ise543',
    enable_caching=True,
    parameter_values=dict(
        training_dataset_path='gs://finalproject_ise543/Final Project Dataset.csv',
    ),
)

pipeline_job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/753516815850/locations/us-central1/pipelineJobs/fp-pipeline-20240502161707
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/753516815850/locations/us-central1/pipelineJobs/fp-pipeline-20240502161707')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/fp-pipeline-20240502161707?project=753516815850
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/753516815850/locations/us-central1/pipelineJobs/fp-pipeline-20240502161707 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob run completed. Resource name: projects/753516815850/locations/us-central1/pipe

In [None]:
# import pandas as pd
# path = "gs://finalproject_ise543/753516815850/fp-pipeline-20240420210851/normalise-training_8827496792284200960/normalised_training_dataset_path"
# df = pd.read_csv(path)
# df.info()

In [None]:
# print(df.isnull().sum())