In [None]:
!pip install datasets[s3] "torch==1.11" "transformers==4.21.0" "sentencepiece==0.1.96"

In [None]:
!pip install sagemaker --upgrade

In [3]:
import sagemaker

# Session, execution role and default bucket are reused by the cells below.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
session_bucket = sagemaker_session.default_bucket()

# Report what was resolved, one item per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {session_bucket}",
    f"sagemaker session region: {sagemaker_session.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::279578104300:role/service-role/AmazonSageMaker-ExecutionRole-20220729T235326
sagemaker bucket: sagemaker-us-east-1-279578104300
sagemaker session region: us-east-1


In [4]:
from sagemaker.s3 import S3Downloader
import os

# Fetch the raw WikiHow CSV into the working directory, skipping the download
# when the file is already present. The URI is built from `session_bucket`
# (like the dataset paths used later in this notebook) instead of hard-coding
# an account-specific bucket name.
if not os.path.exists("wikihowAll.csv"):
    S3Downloader.download(
        s3_uri=f"s3://{session_bucket}/yubaba/dataset/wikihow/all/wikihowAll.csv",
        local_path=".",
        sagemaker_session=sagemaker_session,
    )
    print("csv downloaded")

csv downloaded


In [5]:
import yaml

# Load the experiment configuration (dataset, tokenizer, model and train sections).
with open("./configs/wikihow_t5.yaml", "r") as f:
    # NOTE: switched from yaml.load(..., Loader=yaml.FullLoader) to safe_load.
    # The config only contains plain mappings/scalars, and safe_load refuses to
    # construct anything beyond standard YAML types.
    config = yaml.safe_load(f)
config

{'dataset': {'name': 'wikihow', 'portion': 0.01, 'data_dir': '.'},
 'tokenizer': {'max_length': 1024},
 'model': {'name': 't5_base', 'checkpoint': None},
 'train': {'model_path': '',
  'checkpoint_path': '',
  'num_epochs': 1,
  'learning_rate': 0.0003,
  'weight_decay': 0.001,
  'eps': 1e-08,
  'batch_size': 2,
  'gradient_accum_steps': 8},
 'eval': None}

In [None]:
import torch

from datasets import load_dataset
from wrapper.wikihow import Wikihow
from transformers import T5Tokenizer

# Tokenizer whose maximum length comes from the `tokenizer.max_length` config entry.
# NOTE(review): the checkpoint here is 't5-small' while the loaded config names
# the model 't5_base' — confirm this mismatch is intentional.
tokenizer = T5Tokenizer.from_pretrained(
    't5-small',
    model_max_length=config["tokenizer"]["max_length"],
)

def tokenize(batch):
    """Tokenize a single WikiHow record into source/target tensors.

    Despite the parameter name, this is applied with ``batched=False``, so
    ``batch`` is one example exposing the ``'text'`` (article) and
    ``'headline'`` (summary) fields.

    The article is stripped and has its newline characters removed before
    encoding; the headline is encoded as-is. Both are truncated and padded
    with ``padding="max_length"``.

    Returns:
        dict with ``source_ids``, ``source_mask``, ``target_ids`` and
        ``target_mask``; each value is the corresponding tensor with the
        singleton batch axis squeezed away.
    """
    def _encode(text):
        # Encode as a batch of one, then drop the batch axis via squeeze().
        encoded = tokenizer.batch_encode_plus([text], truncation = True, padding = "max_length", return_tensors = "pt")
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze()

    article = batch['text'].strip().replace("\n","")
    summary = batch['headline']

    source_ids, source_mask = _encode(article)
    target_ids, target_mask = _encode(summary)

    return {"source_ids": source_ids,
            "source_mask": source_mask,
            "target_ids": target_ids,
            "target_mask": target_mask}

# Load the three WikiHow splits from the CSV in the working directory.
train_dataset, validation_dataset, test_dataset = load_dataset("wikihow", "all", data_dir=".", split=["train", "validation", "test"])

# Tokenize every split one record at a time.
train_dataset = train_dataset.map(tokenize, batched=False)
# Fixed: the result used to be bound to the misspelled name
# `validatation_dataset`, leaving `validation_dataset` untokenized even though
# it is the object that gets formatted here and saved afterwards.
validation_dataset = validation_dataset.map(tokenize, batched=False)
test_dataset = test_dataset.map(tokenize, batched=False)

# Have the datasets return torch tensors when indexed.
train_dataset.set_format('torch')
validation_dataset.set_format('torch')
test_dataset.set_format('torch')

(train_dataset, validation_dataset, test_dataset)

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# Write each tokenized split to its own prefix in the session bucket.
s3 = S3FileSystem()
train_input_path = f's3://{session_bucket}/yubaba/dataset/train'
validation_input_path = f's3://{session_bucket}/yubaba/dataset/validation'
test_input_path = f's3://{session_bucket}/yubaba/dataset/test'

# Same order as before: train, validation, test.
for split_dataset, split_path in (
    (train_dataset, train_input_path),
    (validation_dataset, validation_input_path),
    (test_dataset, test_input_path),
):
    split_dataset.save_to_disk(split_path, fs=s3)

In [6]:
from sagemaker.pytorch import PyTorch

# S3 prefixes of the tokenized splits. They are defined again in this cell,
# presumably so it can run without re-executing the preprocessing cells.
train_input_path = f's3://{session_bucket}/yubaba/dataset/train'
validation_input_path = f's3://{session_bucket}/yubaba/dataset/validation'
test_input_path = f's3://{session_bucket}/yubaba/dataset/test'

# Hyperparameters handed to the training job; the values match the `train`
# section of the loaded config.
hyperparameters = dict(
    num_epochs=1,
    learning_rate=3e-4,
    weight_decay=1e-3,
    eps=1e-8,
    gradient_accum_steps=8,
    batch_size=2,
)

# PyTorch estimator running `entry.py` from `./src` on a single instance.
estimator = PyTorch(
    entry_point="entry.py",
    source_dir="./src",
    role=role,
    py_version="py38",
    framework_version="1.11.0",
    instance_count=1,
    instance_type='ml.m5.xlarge',
    hyperparameters=hyperparameters,
)

In [None]:
# Launch the training job with the "train" and "test" input channels.
# NOTE(review): the validation path is defined but not passed as a channel —
# confirm the entry point does not expect one.
estimator.fit(
    inputs={"train": train_input_path, "test": test_input_path},
)