# Final Project: Machine Learning System Development and Operation

- Name: Nicolas Debrito
- Email: nicolas.debrito66@gmail.com
- Id Dicoding: reezzy

## Import Libraries

In [65]:
import os
import pandas as pd
from typing import Text
from absl import logging
from tfx.orchestration import metadata, pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from sklearn.utils import resample
from modules.components import init_components

## Preparing the Dataset

In [66]:
# Read the raw training CSV; the bare `df` on the last line renders a preview.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df

Unnamed: 0,Text,Offensive
0,Hispanic neighborhoods are all the same; they'...,Yes
1,Non-vegetarians are promoting a culture that p...,Yes
2,Muslims are only interested in converting othe...,Yes
3,You must be good at social justice because you...,No
4,Vegans are too fanatical; they won't even atte...,Yes
...,...,...
6611,Why do Hispanics get offended by racial jokes?...,Yes
6612,You must have a preference for certain genres ...,Yes
6613,I shouldn't have to pay for ice; it's just fro...,No
6614,It's great that you're trying to lead a fulfil...,No


In [67]:
# Binary-encode the label: 'No' -> 0, every other value -> 1.
# NOTE: not idempotent. After one run the column holds 0/1, so running this
# cell again (nothing equals 'No' any more) would turn every label into 1.
df['Offensive'] = df['Offensive'].ne('No').astype('int64')
df.head()

Unnamed: 0,Text,Offensive
0,Hispanic neighborhoods are all the same; they'...,1
1,Non-vegetarians are promoting a culture that p...,1
2,Muslims are only interested in converting othe...,1
3,You must be good at social justice because you...,0
4,Vegans are too fanatical; they won't even atte...,1


In [68]:
# Class-balance check: rows per label, most frequent label first.
df['Offensive'].value_counts(sort=True, ascending=False)

1    5208
0    1408
Name: Offensive, dtype: int64

In [69]:
# Partition the rows by label. Label 0 is the minority class and label 1 the
# majority class, per the value counts shown above.
df_minor = df.loc[df['Offensive'].eq(0)]
df_mayor = df.loc[df['Offensive'].eq(1)]

In [70]:
# Bootstrap the minority class (sampling with replacement, fixed seed) until it
# has as many rows as the majority class.
# NOTE(review): the resampled rows are duplicates; if the pipeline splits
# train/eval after this step, copies of one row may land in both — confirm.
df_upsampling = resample(
    df_minor,
    replace=True,
    n_samples=df_mayor.shape[0],
    random_state=42,
)

# Stack both classes and renumber the rows 0..n-1.
df = pd.concat([df_mayor, df_upsampling], ignore_index=True)
df['Offensive'].value_counts()

1    5208
0    5208
Name: Offensive, dtype: int64

In [71]:
# Persist the balanced dataset for the pipeline to ingest.
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails on this cell.
os.makedirs('fix_data', exist_ok=True)
df.to_csv('fix_data/Fix_Data.csv', index=False)

## Run Pipeline

In [72]:
# --- Pipeline identity and inputs ---------------------------------------
PIPELINE_NAME = "hate-speech-pipeline"
DATA_ROOT = "fix_data"  # folder the balanced CSV was written to
TRAINER_MODULE_FILE = "modules/hate_speech_trainer.py"
TRANSFORM_MODULE_FILE = "modules/hate_speech_transform.py"

# --- Output locations (everything lives under OUTPUT_BASE) --------------
OUTPUT_BASE = "reezzy-pipeline"
pipeline_root = os.path.join(OUTPUT_BASE, PIPELINE_NAME)
serving_model_dir = os.path.join(OUTPUT_BASE, 'serving_model')
metadata_path = os.path.join(pipeline_root, "metadata.sqlite")


def init_local_pipeline(
    components, pipeline_root: Text
) -> pipeline.Pipeline:
    """Assemble the TFX pipeline object for a local Beam run.

    Args:
        components: Pipeline components, handed to ``pipeline.Pipeline``
            unchanged.
        pipeline_root: Directory passed to the pipeline as its root.

    Returns:
        A ``pipeline.Pipeline`` named ``PIPELINE_NAME`` with caching enabled,
        its metadata stored in the SQLite file at the module-level
        ``metadata_path``, and the Beam arguments below attached.
    """
    logging.info(f"Pipeline root set to: {pipeline_root}")

    beam_args = [
        "--direct_running_mode=multi_processing",
        # Option flags take exactly two leading dashes. The previous
        # "----direct_num_workers=0" was not a recognized option, so the
        # worker-count setting was never applied. Per the Beam direct-runner
        # docs, 0 lets Beam choose the worker count from the available cores.
        "--direct_num_workers=0",
    ]

    return pipeline.Pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path
        ),
        beam_pipeline_args=beam_args,
    )

if __name__ == "__main__":
    # Show INFO-level absl messages in the cell output.
    logging.set_verbosity(logging.INFO)

    # Build the components from the prepared data folder and the two user modules.
    components = init_components(
        DATA_ROOT,
        serving_model_dir=serving_model_dir,
        transform_module=TRANSFORM_MODULE_FILE,
        training_module=TRAINER_MODULE_FILE,
    )

    # Assemble the pipeline and execute it with the Beam DAG runner.
    pipe = init_local_pipeline(components=components, pipeline_root=pipeline_root)
    BeamDagRunner().run(pipeline=pipe)

INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Excluding no splits because exclude_splits is not set.
INFO:absl:Pipeline root set to: reezzy-pipeline\hate-speech-pipeline
INFO:absl:Generating ephemeral wheel package for 'c:\\submission-sistem-machine-learning\\modules\\hate_speech_transform.py' (including modules: ['components', 'hate_speech_trainer', 'hate_speech_transform']).
INFO:absl:User module package has hash fingerprint version b0c32c25d625505ef861ad996a66ff005c764a6d717270fc5f9f97dad1c99bfd.
INFO:absl:Executing: ['c:\\Users\\nicol\\AppData\\Local\\Programs\\Python\\Python38\\python.exe', 'C:\\Users\\nicol\\AppData\\Local\\Temp\\tmp5o9upr4i\\_tfx_generated_setup.py', 'bdist_wheel', '--bdist-dir', 'C:\\Users\\nicol\\AppData\\Local\\Temp\\tmpy433583b', '--dist-dir', 'C:\\Users\\nicol\\AppData\\Local\\Temp\\tmpb4fw95z6']
INFO:absl:Successfully built user code wheel distribution at 'reezzy-

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Text_xf (InputLayer)        [(None, 1)]               0         
                                                                 
 tf.reshape_8 (TFOpLambda)   (None,)                   0         
                                                                 
 text_vectorization_8 (TextV  (None, 50)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 50, 32)            1600000   
                                                                 
 global_average_pooling1d_8   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_32 (Dense)            (None, 128)               4224



INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


Epoch 2/10
Epoch 2: val_binary_accuracy did not improve from 0.52062
Epoch 3/10
Epoch 3: val_binary_accuracy improved from 0.52062 to 0.52250, saving model to reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving




INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


Epoch 4/10
Epoch 4: val_binary_accuracy did not improve from 0.52250
Epoch 5/10
Epoch 5: val_binary_accuracy did not improve from 0.52250
Epoch 6/10
Epoch 6: val_binary_accuracy did not improve from 0.52250
Epoch 7/10
Epoch 7: val_binary_accuracy did not improve from 0.52250
Epoch 8/10
Epoch 8: val_binary_accuracy improved from 0.52250 to 0.52312, saving model to reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving




INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


Epoch 9/10
Epoch 9: val_binary_accuracy did not improve from 0.52312
Epoch 10/10
Epoch 10: val_binary_accuracy did not improve from 0.52312
INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets


INFO:tensorflow:Assets written to: reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving\assets
INFO:absl:Training complete. Model written to reezzy-pipeline\hate-speech-pipeline\Trainer\model\7\Format-Serving. ModelRun written to reezzy-pipeline\hate-speech-pipeline\Trainer\model_run\7
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 7 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'model_run': [Artifact(artifact: uri: "reezzy-pipeline\\hate-speech-pipeline\\Trainer\\model_run\\7"
, artifact_type: name: "ModelRun"
)], 'model': [Artifact(artifact: uri: "reezzy-pipeline\\hate-speech-pipeline\\Trainer\\model\\7"
, artifact_type: name: "Model"
base_type: MODEL
)]}) for execution 7
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Trainer is finished.
INFO:absl:node Evaluator is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.compone



INFO:absl:The 'example_splits' parameter is not set, using 'eval' split.
INFO:absl:Evaluating model.
INFO:absl:udf_utils.get_fn {'fairness_indicator_thresholds': 'null', 'example_splits': 'null', 'eval_config': '{\n  "metrics_specs": [\n    {\n      "metrics": [\n        {\n          "class_name": "ExampleCount"\n        },\n        {\n          "class_name": "AUC"\n        },\n        {\n          "class_name": "FalsePositives"\n        },\n        {\n          "class_name": "TruePositives"\n        },\n        {\n          "class_name": "FalseNegatives"\n        },\n        {\n          "class_name": "TrueNegatives"\n        },\n        {\n          "class_name": "BinaryAccuracy",\n          "threshold": {\n            "change_threshold": {\n              "absolute": 0.0001,\n              "direction": "HIGHER_IS_BETTER"\n            },\n            "value_threshold": {\n              "lower_bound": 0.5\n            }\n          }\n        }\n      ]\n    }\n  ],\n  "model_specs": [\



INFO:absl:Evaluation complete. Results written to reezzy-pipeline\hate-speech-pipeline\Evaluator\evaluation\8.
INFO:absl:Checking validation results.
INFO:absl:Blessing result True written to reezzy-pipeline\hate-speech-pipeline\Evaluator\blessing\8.
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 8 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'evaluation': [Artifact(artifact: uri: "reezzy-pipeline\\hate-speech-pipeline\\Evaluator\\evaluation\\8"
, artifact_type: name: "ModelEvaluation"
)], 'blessing': [Artifact(artifact: uri: "reezzy-pipeline\\hate-speech-pipeline\\Evaluator\\blessing\\8"
, artifact_type: name: "ModelBlessing"
)]}) for execution 8
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Evaluator is finished.
INFO:absl:node Pusher is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.pusher.component.Pusher"
    base_type