In [None]:
# Install the notebook-side dependencies with the interpreter that backs this kernel
# (`sys.executable -m pip`), so the packages land in the kernel's own environment.
# NOTE(review): "rouge-score" is not version-pinned, unlike the other two packages.
# NOTE(review): transformers is pinned to 4.20.1 here while the training estimator
# further down requests transformers_version='4.17.0' — confirm the gap is intentional.
import sys
!{sys.executable} -m pip install "datasets==2.3.0" "transformers==4.20.1" "rouge-score"

In [None]:
# Run to load all the libraries and helper functions.
# `%run` executes the script and then copies the names it defined into this notebook.
# NOTE(review): later cells use names that are never imported or defined in the notebook
# itself (pd, Path, boto3, filesystems, extract, data_clean, data_for_model_from_pandas,
# text_column, target_column) — presumably they all come from this script; confirm.
%run "Utilities/libraries.py"

In [None]:
# INPUTS TO THE SCRIPTS
# S3 bucket used for the saved dataset, the training output and the deployable model.
Bucket_Name="model-deploy-poc"

# Key prefix (inside Bucket_Name) under which the train/test/val dataset is saved.
train_test_val_location_S3="train_test_val"


# NOTE(review): these two lines assign each name to itself. They are no-ops when the
# names already exist (presumably defined by the `%run` of Utilities/libraries.py) and
# raise NameError otherwise — consider assigning the actual column names here.
text_column=text_column
target_column=target_column

In [None]:
# Hugging Face Hub ids of the pretrained summarisation checkpoints to fine-tune.
model_names = [
    "patrickvonplaten/bert2bert_cnn_daily_mail",
    "google/pegasus-xsum",
    "sshleifer/distilbart-cnn-12-6",
    "flax-community/t5-base-cnn-dm",
]

In [None]:
# boto3 session created with default settings.
# NOTE(review): `sess` is not referenced again in the visible notebook.
sess=boto3.session.Session()
# Execution role of the current SageMaker environment; passed as `role=` when the
# model for deployment is created further down.
role=sagemaker.get_execution_role()

## 1. Data Loading and Pre-processing

In [None]:
# load data from S3 to Data
# prefix="BBC News Summary"
# !aws s3 cp "s3://{Bucket_Name}/{prefix}" "./Data/" --recursive

In [None]:
# Read the raw corpus from the local Data/ folder: one .txt file per article and per
# summary, each nested one directory level below its top-level folder.
dataset_path = Path('Data/')

article_files = dataset_path.glob('News Articles/*/*.txt')
summary_files = dataset_path.glob('Summaries/*/*.txt')

# `extract` is a helper loaded earlier; the column names below assume it yields one
# (Category, ID, text) record per file — TODO confirm against the helper.
articles_data = [extract(txt_file) for txt_file in article_files]
summaries_data = [extract(txt_file) for txt_file in summary_files]

articles_df = pd.DataFrame(articles_data, columns=('Category', 'ID', text_column))
summaries_df = pd.DataFrame(summaries_data, columns=('Category', 'ID', target_column))

# Pair every article with its summary; the inner join discards unmatched files.
df = pd.merge(articles_df, summaries_df, how='inner', on=('Category', 'ID'))

In [None]:
# (rows, columns) of the articles frame before it is joined with the summaries.
articles_df.shape

## 2. Data Cleaning 

In [None]:
# Show the first raw article (index label 0) before any cleaning is applied.
df[text_column][0]

In [None]:
# We need to do some data cleaning before checking stats.
# `data_clean` is a helper loaded earlier; it is applied column-wise to both the
# article text and the reference summary.
df[text_column] = data_clean(df[text_column])
df[target_column] = data_clean(df[target_column])

# The join keys are not needed once articles and summaries are paired.
# Re-binding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError for the already-dropped columns.
df = df.drop(columns=['ID', 'Category'], errors='ignore')

# Sanity check: print the first cleaned article.
print(data_clean(df[text_column][:5])[0])


In [None]:
# Character-length distribution of articles and summaries before outlier removal.
length_percentiles = [.25, .5, .75, 0.85]

text_lengths = df[text_column].str.len()
print(text_lengths.describe(percentiles=length_percentiles))

print("#############################")

target_lengths = df[target_column].str.len()
print(target_lengths.describe(percentiles=length_percentiles))

In [None]:
# Quantile of character length above which a value counts as an outlier.
nth_decile_outlire_threshold=0.9

text_lengths = df[text_column].str.len()
target_lengths = df[target_column].str.len()

# Ask for the quantile directly instead of describe(percentiles=[q])[-2]: the positional
# lookup only lands on the requested percentile while q is above the median (describe()
# can add the 50% row on its own), and positional indexing on a label-indexed Series is
# deprecated in recent pandas. For q = 0.9 the value is the same as before.
text_length_cutoff = text_lengths.quantile(nth_decile_outlire_threshold)
target_length_cutoff = target_lengths.quantile(nth_decile_outlire_threshold)

# NOTE(review): `|` keeps a row when EITHER length is below its cutoff, so only rows
# that are long in both columns are removed. If a row should be dropped as soon as one
# of the two is an outlier, this needs to be `&` — confirm the intent before changing.
df = df[(text_lengths < text_length_cutoff) | (target_lengths < target_length_cutoff)]

In [None]:
# Character-length distribution of articles and summaries after the outlier filter.
length_percentiles = [.25, .5, .75, 0.85]

text_lengths = df[text_column].str.len()
print(text_lengths.describe(percentiles=length_percentiles))

print("#############################")

target_lengths = df[target_column].str.len()
print(target_lengths.describe(percentiles=length_percentiles))

## 3. Prepare data for the model

In [None]:
# Split the cleaned frame with the helper loaded earlier.
# NOTE(review): presumably 20% of the rows are held out and 30% of that hold-out becomes
# the validation split — confirm against data_for_model_from_pandas.
df_to_datadict=data_for_model_from_pandas(df,test_size=0.2,val_from_test_size=0.3)

In [None]:
# Display the object returned by the split helper to inspect what it contains.
df_to_datadict

In [None]:
# Persist the split dataset to s3://<Bucket_Name>/<train_test_val_location_S3>/ through
# an S3 filesystem object, so the training job can read it from there.
# NOTE(review): `filesystems` is presumably `datasets.filesystems` loaded by the
# libraries script — confirm.
s3 = filesystems.S3FileSystem()
df_to_datadict.save_to_disk(f"s3://{Bucket_Name}/{train_test_val_location_S3}/", fs=s3)

In [None]:
# S3 prefix of the dataset saved above (same bucket and key prefix, no trailing slash).
input_path = f"s3://{Bucket_Name}/{train_test_val_location_S3}"


## 4. Invoke Model Training

In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# Experiment with your models here. Change hyperparameters to optimize your results
def model_invoke(model_name,train_test_val_location_S3):
    """Launch a SageMaker Hugging Face training job for one pretrained checkpoint.

    Args:
        model_name: Hugging Face Hub model id, e.g. "google/pegasus-xsum". The part
            after the last "/" names the S3 output folder and, once sanitised, the
            training job.
        train_test_val_location_S3: Key prefix inside ``Bucket_Name`` that holds the
            saved dataset; its ``train`` and ``val`` sub-folders are used as channels.

    Returns:
        The ``HuggingFace`` estimator on which ``fit`` has been called.

    Reads the notebook globals ``Bucket_Name``, ``text_column`` and ``target_column``.
    """
    import re  # local import: only needed to build a valid job name

    output_dir_name = model_name.split("/")[-1]
    output_path = f"s3://{Bucket_Name}/model/{output_dir_name}"

    # Derive the dataset location from the argument rather than from a notebook-level
    # variable, so the hyperparameter always agrees with the channels given to fit().
    dataset_location = f"s3://{Bucket_Name}/{train_test_val_location_S3}"

    # Training job names accept only alphanumerics and hyphens. Hub ids such as
    # "bert2bert_cnn_daily_mail" contain underscores, so normalise them first.
    job_name_prefix = re.sub(r"[^a-zA-Z0-9-]+", "-", output_dir_name).strip("-")

    # gets role for executing training job
    role = sagemaker.get_execution_role()

    # Forwarded to the entry point script (Utilities/train.py) as hyperparameters.
    hyperparameters = {
        "model-name": model_name,
        "text-column": text_column,
        "target-column": target_column,
        "epoch": 5,
        "train-data-dir": dataset_location,
        "log-dir": "Logs",
        # Optional overrides, currently disabled:
        #   'train-batch-size': 150,
        #   'eval-batch-size': 5,
        # more info here https://github.com/huggingface/transformers/tree/v4.17.0/examples/pytorch/summarization
    }

    # Regexes SageMaker applies to the job logs to extract metrics.
    metric_definitions = [
        {"Name": "training:loss", "Regex": "'loss': (.*?),"},
        {"Name": "validation:loss", "Regex": "'eval_loss': (.*?),"},
        {"Name": "validation:rouge1", "Regex": "'eval_rouge1': (.*?),"},
        {"Name": "validation:rouge2", "Regex": "'eval_rouge2': (.*?),"},
        {"Name": "validation:rougeL", "Regex": "'eval_rougeL': (.*?),"},
        {"Name": "validation:rougeLsum", "Regex": "'eval_rougeLsum': (.*?),"},
        {"Name": "validation:gen_len", "Regex": "'eval_gen_len': (.*?),"},
    ]

    # git configuration to download our fine-tuning script
    # git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.17.0'}

    # creates Hugging Face estimator
    huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir='Utilities',
        instance_type='ml.p3.2xlarge',
        base_job_name=job_name_prefix,
        instance_count=1,
        role=role,
        # git_config=git_config,
        transformers_version='4.17.0',
        pytorch_version='1.10.2',
        py_version='py38',
        hyperparameters=hyperparameters,
        output_path=output_path,
        # If operating on ml.p3.4xlarge and above we can opt for distributed computing to reduce training time.
        # distribution ={"mpi": { "enabled": True },"smdistributed": {"modelparallel": { "enabled": True,"parameters": {}}}},
        metric_definitions=metric_definitions,
    )

    # The "val" split is supplied under the channel name "test".
    huggingface_estimator.fit({
        "train": f"{dataset_location}/train",
        "test": f"{dataset_location}/val",
    })
    return huggingface_estimator
# starting the train job
# huggingface_estimator.fit()

In [None]:
# Fine-tune every candidate checkpoint in turn and keep the resulting estimator,
# keyed by its Hub id. A bold banner marks the start of each model in the output.
model_dict = {}
for model_name in model_names:
    banner_rule = '\033[1m' + "#" * 75
    print(banner_rule)
    print('\033[1m' + model_name + ": " + "Begin Process.........")
    print(banner_rule + '\033[0m')
    model_dict[model_name] = model_invoke(
        model_name=model_name,
        train_test_val_location_S3=train_test_val_location_S3,
    )

In [None]:
# {j:model_dict[j].model_data for i,j in enumerate(model_dict)}

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Name of the fine-tuned model to serve. Its packaged artefact is read from
# s3://<Bucket_Name>/final_models/<model_name>.tar.gz.
model_name = "distilbart-cnn-12-6"
model_location=f"s3://{Bucket_Name}/final_models/"+model_name+".tar.gz"

model_for_deployment = HuggingFaceModel(
    entry_point="inference.py",
    source_dir="Utilities",
    model_data=model_location,
    role=role,
    # Serve with the same framework versions the training estimator requests
    # (transformers 4.17.0 / PyTorch 1.10.2 / py38). The previous 4.6.1 / 1.7.1 / py36
    # container is older than the library that wrote the model and tokenizer files.
    transformers_version="4.17.0",
    pytorch_version="1.10.2",
    py_version="py38",
    # Underscores are replaced with hyphens before the name is used.
    name=model_name.replace("_","-")+"-V2",
)

In [None]:
# Endpoint name is derived from the served model's name; any underscore is swapped
# for a hyphen at the point where the name is handed to deploy().
endpoint_name = f"summarization-endpoint-5{model_name}-1"

# Create a single-instance real-time endpoint that exchanges JSON in both directions.
predictor = model_for_deployment.deploy(
    endpoint_name=endpoint_name.replace("_", "-"),
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

In [None]:
# Smoke-test the endpoint: send one JSON payload whose 'inputs' key holds the text to
# summarise; the cell output is whatever the endpoint returns.
predictor.predict({
'inputs': "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."
})

### Average first loading time: 2 seconds

In [None]:
# Can opt for Async inferencing for higher load
# from sagemaker.async_inference import AsyncInferenceConfig
# prefix="async-location"
# endpoint_name = "summarization-endpoint-5"+model_name+"-async-V1"
# # Create an empty AsyncInferenceConfig object to use default values
# async_config = AsyncInferenceConfig(output_path=f"s3://{Bucket_Name}/{prefix}/output")

# # deploy model to SageMaker Inference
# async_predictor = model_for_deployment.deploy(
#     async_inference_config=async_config,
#     initial_instance_count=1, # number of instances
#     instance_type='ml.g4dn.xlarge', # instance type,
#     serializer=sagemaker.serializers.JSONSerializer(),
#     deserializer=sagemaker.deserializers.JSONDeserializer(),
#     endpoint_name=endpoint_name.replace(r"_","-"),
# )

In [None]:
# async_predictor.predict_async({
# 'inputs': "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."
# })

In [None]:
# predictor.delete_endpoint()