In [1]:
import pandas as pd
from typing import Text
from absl import logging

from tfx.orchestration import metadata, pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

from sklearn.preprocessing import LabelEncoder

import os
import shutil
from zipfile import ZipFile

import warnings

# Silence every Python warning for the rest of the session: the "ignore"
# action is installed without any category or module filter.
warnings.filterwarnings("ignore")

In [2]:
# Fetch the dataset archive with the Kaggle CLI via an IPython shell escape.
# The archive is later read from ./diabetes-prediction-dataset.zip.
# NOTE(review): presumably needs Kaggle API credentials configured - confirm.
!kaggle datasets download iammustafatz/diabetes-prediction-dataset

Dataset URL: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset
License(s): copyright-authors
diabetes-prediction-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Paths shared by the data-preparation cells.
data_dir = "./data"
archive_dir = "./archive"
zip_file = "./diabetes-prediction-dataset.zip"

# Create the data folder up front so later cells can move files into it.
os.makedirs(data_dir, exist_ok=True)

# path=None unpacks into the current working directory (not into data_dir).
with ZipFile(zip_file) as archive:
    archive.extractall(path=None)

In [4]:
import os
import shutil

# Relocate everything found under archive_dir into data_dir, then remove the
# emptied folder. When archive_dir is absent (or is not a directory) only a
# message is printed and nothing is moved.
if os.path.isdir(archive_dir):
    for entry in os.listdir(archive_dir):
        shutil.move(os.path.join(archive_dir, entry), os.path.join(data_dir, entry))

    # os.rmdir deletes archive_dir only; os.removedirs would additionally
    # prune any parent directories that became empty.
    os.rmdir(archive_dir)
else:
    print(f"Direktori {archive_dir} tidak ditemukan.")


Direktori ./archive tidak ditemukan.


In [5]:
import os
# Print the contents of the working directory, then of the data folder.
for folder in (".", "./data"):
    print(os.listdir(folder))


['.venv', 'cleaned_data.csv', 'data', 'diabetes-prediction-dataset.zip', 'diabetes_prediction_dataset.csv', 'main_pipeline.ipynb', 'modules']
['diabetes_prediction_dataset.csv']


In [6]:
import os
import shutil

data_dir = "./data"
csv_file = "diabetes_prediction_dataset.csv"

# The destination folder must exist before anything is moved into it.
os.makedirs(data_dir, exist_ok=True)

# Move the CSV out of the working directory only when it is actually there.
if not os.path.exists(csv_file):
    print(f"File {csv_file} tidak ditemukan.")
else:
    destination = os.path.join(data_dir, csv_file)
    shutil.move(csv_file, destination)
    print(f"File {csv_file} berhasil dipindahkan ke {data_dir}.")


File diabetes_prediction_dataset.csv berhasil dipindahkan ke ./data.


In [7]:
import os

data_dir = "./data"

# List the data folder; the listing is printed as a debugging aid.
list_data = os.listdir(data_dir)
print("Isi folder data:", list_data)

# Keep only CSV files. os.listdir returns entries in arbitrary order, so sort
# to make "the first file" a deterministic choice.
csv_files = sorted(name for name in list_data if name.endswith(".csv"))

# Fail here instead of leaving data_file undefined (or stale from an earlier
# run of this cell) for the cells that read it afterwards.
if not csv_files:
    raise FileNotFoundError("Tidak ada file CSV di dalam folder data.")

if len(csv_files) > 1:
    # The first file is used, as the message states, and nothing is deleted.
    # NOTE(review): the extra CSVs stay in data_dir; if a downstream step reads
    # the whole folder, remove them deliberately and explicitly.
    print("Lebih dari satu file CSV ditemukan. Menggunakan file pertama.")

data_file = os.path.join(data_dir, csv_files[0])
print(f"File dataset yang digunakan: {data_file}")


Isi folder data: ['diabetes_prediction_dataset.csv']
File dataset yang digunakan: ./data\diabetes_prediction_dataset.csv


In [8]:
# Load the CSV chosen in the previous cell, then print the frame summary
# (columns, non-null counts, dtypes, memory usage).
df = pd.read_csv(filepath_or_buffer=data_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [9]:
# Write df to the working directory as UTF-8 CSV without the index column.
# NOTE(review): no cleaning step is visible between loading df and this write,
# so the file appears to be a plain re-export - confirm the intent of the name.
df.to_csv(path_or_buf="cleaned_data.csv", encoding="utf-8", index=False)


In [10]:
# Pipeline identity and the user-module files handed to the components.
PIPELINE_NAME = "diabetes_pipeline"
TRANSFORM_MODULE_FILE = "./modules/transform.py"
TRAINER_MODULE_FILE = "./modules/trainer.py"

# Input folder: reuses data_dir defined by the data-preparation cells.
DATA_ROOT = data_dir

# Output locations, all rooted at OUTPUT_ROOT.
OUTPUT_ROOT = "output"
PIPELINE_ROOT = os.path.join(OUTPUT_ROOT, PIPELINE_NAME)
SERVING_MODEL_DIR = os.path.join(OUTPUT_ROOT, "serving_model")
METADATA = os.path.join(OUTPUT_ROOT, PIPELINE_NAME, "metadata.sqlite")


def init_local_pipeline(
    components,
    pipeline_root: Text,
) -> pipeline.Pipeline:
    """Build the TFX pipeline object used for the local Beam run.

    Args:
        components: Components to place in the pipeline; forwarded unchanged
            to ``pipeline.Pipeline`` (presumably the value returned by
            ``init_components`` - confirm against the caller).
        pipeline_root: Root directory handed to the pipeline as
            ``pipeline_root``.

    Returns:
        A ``pipeline.Pipeline`` named ``PIPELINE_NAME`` with caching enabled,
        a SQLite metadata connection at ``METADATA`` and the Beam arguments
        defined below.
    """
    logging.info("Pipeline root set to: %s", pipeline_root)

    # Options for Beam's direct runner.
    # NOTE(review): direct_num_workers=0 is documented by Beam as "use the
    # number of available cores" - confirm for the installed Beam version.
    beam_args = [
        "--direct_running_mode=multi_processing",
        "--direct_num_workers=0",
    ]

    return pipeline.Pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(METADATA),
        # The keyword was previously misspelled as `eam_pipeline_args`, so the
        # Beam options above did not reach the pipeline as intended.
        beam_pipeline_args=beam_args,
    )


if __name__ == "__main__":
    # Emit absl log records at INFO level and above.
    logging.set_verbosity(logging.INFO)

    from modules.components import init_components

    # Build the components from the paths and step counts configured above.
    component = init_components(
        data_dir=DATA_ROOT,
        transform_module=TRANSFORM_MODULE_FILE,
        training_module=TRAINER_MODULE_FILE,
        serving_model_dir=SERVING_MODEL_DIR,
        training_steps=500,
        eval_steps=200,
    )

    # Assemble the pipeline and execute it with the Beam DAG runner.
    pipelines = init_local_pipeline(components=component, pipeline_root=PIPELINE_ROOT)
    runner = BeamDagRunner()
    runner.run(pipelines)

INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Pipeline root set to: output\diabetes_pipeline
INFO:absl:Generating ephemeral wheel package for 'd:\\DiabetesMLOps\\modules\\transform.py' (including modules: ['components', 'trainer', 'transform']).
INFO:absl:User module package has hash fingerprint version 4316396f5a9e7cb2b9cb005bc7b200a04b9f1694b0e109d4b5f9dcf9ec5a9980.
INFO:absl:Executing: ['d:\\DiabetesMLOps\\.venv\\Scripts\\python.exe', 'C:\\Users\\hrahm\\AppData\\Local\\Temp\\tmp9j8q4g9t\\_tfx_generated_setup.py', 'bdist_wheel', '--bdist-dir', 'C:\\Users\\hrahm\\AppData\\Local\\Temp\\tmpky3bvfoc', '--dist-dir', 'C:\\Users\\hrahm\\AppData\\Local\\Temp\\tmpa_fevnww']
INFO:absl:Successfully built user code wheel distribution at 'output\\diabetes_pipeline\\_wheels\\tfx_user_code_Transform-0.0+4316396f5a9e7cb2b9cb005bc7b200a04b9f169

INFO:absl:Node CsvExampleGen depends on [].
INFO:absl:Node CsvExampleGen is scheduled.
INFO:absl:Node Latest_blessed_model_resolver depends on [].
INFO:absl:Node Latest_blessed_model_resolver is scheduled.
INFO:absl:Node StatisticsGen depends on ['Run[CsvExampleGen]'].
INFO:absl:Node StatisticsGen is scheduled.
INFO:absl:Node SchemaGen depends on ['Run[StatisticsGen]'].
INFO:absl:Node SchemaGen is scheduled.
INFO:absl:Node ExampleValidator depends on ['Run[SchemaGen]', 'Run[StatisticsGen]'].
INFO:absl:Node ExampleValidator is scheduled.
INFO:absl:Node Transform depends on ['Run[CsvExampleGen]', 'Run[SchemaGen]'].
INFO:absl:Node Transform is scheduled.
INFO:absl:Node Trainer depends on ['Run[SchemaGen]', 'Run[Transform]'].
INFO:absl:Node Trainer is scheduled.
INFO:absl:Node Evaluator depends on ['Run[CsvExampleGen]', 'Run[Latest_blessed_model_resolver]', 'Run[Trainer]'].
INFO:absl:Node Evaluator is scheduled.
INFO:absl:Node Pusher depends on ['Run[Evaluator]', 'Run[Trainer]'].
INFO:absl

Instructions for updating:
Use ref() instead.


Instructions for updating:
Use ref() instead.
INFO:absl:Feature HbA1c_level has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature age has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature blood_glucose_level has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature bmi has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature diabetes has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature gender has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature heart_disease has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature hypertension has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:Feature smoking_history has a shape dim {
  size: 1
}
. Setting to DenseTensor.
INFO:absl:If the number of unique tokens is smaller than the provided top_k or approximation error is acceptable, consider using tft.experimental.approximate_vocabulary for a potentially mo

INFO:tensorflow:Assets written to: output\diabetes_pipeline\Transform\transform_graph\6\.temp_path\tftransform_tmp\e10846351ca1446b9cf5d12d5fa25eb8\assets


INFO:tensorflow:Assets written to: output\diabetes_pipeline\Transform\transform_graph\6\.temp_path\tftransform_tmp\e10846351ca1446b9cf5d12d5fa25eb8\assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.
INFO:absl:If the number of unique tokens is smaller than the provided top_k or approximation error is acceptable, consider using tft.experimental.approximate_vocabulary for a potentially more efficient implementation.
INFO:absl:If the number of unique tokens is smaller than the provided top_k or approximation error is acceptable, consider using tft.experimental.approximate_vocabulary for a potentially more efficient implementation.


INFO:tensorflow:Assets written to: output\diabetes_pipeline\Transform\transform_graph\6\.temp_path\tftransform_tmp\952f33c3db9344ea89e3bef76f9d807a\assets


INFO:tensorflow:Assets written to: output\diabetes_pipeline\Transform\transform_graph\6\.temp_path\tftransform_tmp\952f33c3db9344ea89e3bef76f9d807a\assets
INFO:absl:If the number of unique tokens is smaller than the provided top_k or approximation error is acceptable, consider using tft.experimental.approximate_vocabulary for a potentially more efficient implementation.
INFO:absl:If the number of unique tokens is smaller than the provided top_k or approximation error is acceptable, consider using tft.experimental.approximate_vocabulary for a potentially more efficient implementation.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 6 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'pre_transform_schema': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Transform\\pre_transform_schema\\6"
, artifact_type: name: "Schema"
)], 'transform_graph': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Transform\\transform_graph\\6"
, artifact_type: name: "TransformGraph"
)], 'post_transform_schema': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Transform\\post_transform_schema\\6"
, artifact_type: name: "Schema"
)], 'pre_transform_stats': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Transform\\pre_transform_stats\\6"
, artifact_type: name: "ExampleStatistics"
properties {
  key: "span"
  value: INT
}
properties {
  key: "split_names"
  value: STRING
}
base_type: STATISTICS
)], 'updated_analyzer_cache': [Artifac

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 gender_xf (InputLayer)         [(None, 3)]          0           []                               
                                                                                                  
 smoking_history_xf (InputLayer  [(None, 4)]         0           []                               
 )                                                                                                
                                                                                                  
 age_xf (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 hypertension_xf (InputLayer)   [(None, 1)]          0           []                           

INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: output\diabetes_pipeline\Trainer\model\7\Format-Serving\assets


INFO:tensorflow:Assets written to: output\diabetes_pipeline\Trainer\model\7\Format-Serving\assets


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


INFO:absl:Training complete. Model written to output\diabetes_pipeline\Trainer\model\7\Format-Serving. ModelRun written to output\diabetes_pipeline\Trainer\model_run\7
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 7 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'model': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Trainer\\model\\7"
, artifact_type: name: "Model"
base_type: MODEL
)], 'model_run': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Trainer\\model_run\\7"
, artifact_type: name: "ModelRun"
)]}) for execution 7
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Trainer is finished.
INFO:absl:node Evaluator is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.evaluator.component.Evaluator"
    base_type: EVALUATE
  }
  id: "Evaluator"
}
contexts {
  contexts {
    type {
      name: "pipeline"
    }
    name {




INFO:absl:The 'example_splits' parameter is not set, using 'eval' split.
INFO:absl:Evaluating model.
INFO:absl:udf_utils.get_fn {'eval_config': '{\n  "metrics_specs": [\n    {\n      "metrics": [\n        {\n          "class_name": "AUC"\n        },\n        {\n          "class_name": "Precision"\n        },\n        {\n          "class_name": "Recall"\n        },\n        {\n          "class_name": "ExampleCount"\n        },\n        {\n          "class_name": "BinaryAccuracy",\n          "threshold": {\n            "change_threshold": {\n              "absolute": 0.0001,\n              "direction": "HIGHER_IS_BETTER"\n            },\n            "value_threshold": {\n              "lower_bound": 0.5\n            }\n          }\n        }\n      ]\n    }\n  ],\n  "model_specs": [\n    {\n      "label_key": "diabetes"\n    }\n  ],\n  "slicing_specs": [\n    {},\n    {\n      "feature_keys": [\n        "gender",\n        "heart_disease"\n      ]\n    }\n  ]\n}', 'fairness_indicator_thre



























INFO:absl:Evaluation complete. Results written to output\diabetes_pipeline\Evaluator\evaluation\8.
INFO:absl:Checking validation results.


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
INFO:absl:Blessing result True written to output\diabetes_pipeline\Evaluator\blessing\8.
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 8 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'blessing': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Evaluator\\blessing\\8"
, artifact_type: name: "ModelBlessing"
)], 'evaluation': [Artifact(artifact: uri: "output\\diabetes_pipeline\\Evaluator\\evaluation\\8"
, artifact_type: name: "ModelEvaluation"
)]}) for execution 8
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Evaluator is finished.
INFO:absl:node Pusher is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.pusher.component.Pusher"
    base_type: DEPLOY
  }
  id: "Pusher"
}
contexts {
  contexts {
    type {
      name: "pipeline"
    }
    nam