<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/BERT/End%20to%20End%20BERT%20using%20TFX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram

### Setup

In [None]:
# !pip install tfx
# !pip install transformers

### Connecting to Kaggle

In [2]:
from google.colab import files

files.upload()

! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
!kaggle competitions download -c sentiment-analysis-on-movie-reviews
!unzip /content/sentiment-analysis-on-movie-reviews.zip

In [None]:
!unzip /content/test.tsv.zip
!unzip /content/train.tsv.zip

### Importing Dependencies

In [58]:
import os
import pprint
import numpy as np
import tempfile
import urllib

import absl
import pandas as pd
import tensorflow as tf
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

import tfx
from tfx.components import CsvExampleGen
from typing import Dict, List, Text
from tfx.components import Evaluator
from tfx.components import Pusher
# from tfx.components import ResolverNode

from tfx.components import Trainer
from tfx.components import Transform
from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
# from tfx.utils.dsl_utils import external_input


from tfx.proto import example_gen_pb2
from tfx.components import ImportExampleGen

from tfx.components import StatisticsGen
import tensorflow_data_validation as tfdv
from tfx.components import SchemaGen
from tfx.components import ExampleValidator
from tfx.components import Transform

from sklearn.model_selection import train_test_split

%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

The tfx.orchestration.experimental.interactive.notebook_extensions.skip extension is already loaded. To reload it, use:
  %reload_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip


### Loading Dataset

In [20]:
df = pd.read_csv("/content/train.tsv", sep = "\t")

In [21]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [22]:
df = df.drop_duplicates(subset=['SentenceId'], keep='first')

In [24]:
df.to_csv("data.csv", index=False)

### Setup Pipeline Paths

In [55]:
! cd /content/
os.makedirs("/content/tfx/", exist_ok=True)
os.makedirs("/content/tfx/pipelines", exist_ok=True)
os.makedirs("/content/tfx/metadata", exist_ok=True)
os.makedirs("/content/tfx/logs", exist_ok=True)
os.makedirs("/content/tfx/data", exist_ok=True)
os.makedirs("/content/tfx/serving_model", exist_ok=True)

In [56]:
_tfx_root = os.path.join(os.getcwd(), 'tfx');        # Create location ~/tfx
_pipeline_root = os.path.join(_tfx_root, 'pipelines');      # Join ~/tfx/pipelines/
_metadata_db_root = os.path.join(_tfx_root, 'metadata.db');    # Join ~/tfx/metadata.db
_log_root = os.path.join(_tfx_root, 'logs');
_model_root = os.path.join(_tfx_root, 'model');
_data_root = os.path.join(_tfx_root, 'data');
_serving_model_dir = os.path.join(_tfx_root, 'serving_model')
_data_filepath = os.path.join(_data_root, "data_trans.csv")

_input_fn_module_file = 'inputfn_trainer.py'
_constants_module_file = 'constants_trainer.py'
_model_trainer_module_file = 'model_trainer.py'

In [57]:
df.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [60]:
train, test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

In [64]:
train.shape

(7676, 4)

In [65]:
df = pd.read_csv('/content/data.csv')

##Drop useless columns
df.drop(['PhraseId', 'SentenceId',], axis=1, inplace=True)

#Drop NA rows
df.dropna(inplace=True)

##Keep a test set for final testing( TFX internally splits train and validation data )
traindf, testdf = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)


print(len(traindf))
print(len(testdf))

traindf.to_csv("/content/tfx/data/data_trans.csv", index=False, header=True)
testdf.to_csv("test.csv", index=False, header=False)

7676
853


In [68]:
!head {_data_filepath}

Phrase,Sentiment
"With the dog days of August upon us , think of this dog of a movie as the cinematic equivalent of high humidity .",1
"One of the smarter offerings the horror genre has produced in recent memory , even if it 's far tamer than advertised .",3
A poignant comedy that offers food for thought .,4
Completely awful Iranian drama ... as much fun as a grouchy ayatollah in a cold mosque .,0
"The film is enriched by an imaginatively mixed cast of antic spirits , headed by Christopher Plummer as the subtlest and most complexly evil Uncle Ralph I 've ever seen in the many film and stage adaptations of the work .",4
Beautifully reclaiming the story of Carmen and recreating it an in an African idiom .,3
Kinnear ... gives his best screen performance with an oddly winning portrayal of one of life 's ultimate losers .,4
Perhaps the film should be seen as a conversation starter .,2
"It 's far from a frothy piece , and the characters are complex , laden with plenty of baggage and tinged w

### Data Ingestion

In [30]:
input_config = example_gen_pb2.Input(splits=[
     example_gen_pb2.Input.Split(name='train', pattern='train/*'),
     example_gen_pb2.Input.Split(name='val', pattern='val/*'),
     example_gen_pb2.Input.Split(name='test', pattern='test/*')
 ])

data_root = "/content/data.csv"

example_gen = ImportExampleGen(
     input_base=data_root, input_config=input_config)

In [31]:
example_gen

ImportExampleGen(spec: <tfx.types.standard_component_specs.FileBasedExampleGenSpec object at 0x7f056cc4ee10>, executor_spec: <tfx.dsl.components.base.executor_spec.BeamExecutorSpec object at 0x7f056cc4ead0>, driver_class: <class 'tfx.components.example_gen.driver.FileBasedDriver'>, component_id: ImportExampleGen, inputs: {}, outputs: {'examples': OutputChannel(artifact_type=Examples, producer_component_id=ImportExampleGen, output_key=examples, additional_properties={}, additional_custom_properties={})})

### Data Validation

In [33]:
statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

### Schmea

In [38]:
schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True)

In [40]:
schema_gen

SchemaGen(spec: <tfx.types.standard_component_specs.SchemaGenSpec object at 0x7f056ba48b50>, executor_spec: <tfx.dsl.components.base.executor_spec.ExecutorClassSpec object at 0x7f056cbef950>, driver_class: <class 'tfx.dsl.components.base.base_driver.BaseDriver'>, component_id: SchemaGen, inputs: {'statistics': OutputChannel(artifact_type=ExampleStatistics, producer_component_id=StatisticsGen, output_key=statistics, additional_properties={}, additional_custom_properties={})}, outputs: {'schema': OutputChannel(artifact_type=Schema, producer_component_id=SchemaGen, output_key=schema, additional_properties={}, additional_custom_properties={})})

In [42]:
example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

In [None]:
transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)