This notebook was developed using the `Python 3 (Data Science)` kernel on an `ml.t3.medium` instance.

In [None]:
import sagemaker
import json
import boto3

# Execution role handed to the estimators created later in this notebook.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
# Region and default bucket both come from the SageMaker session object.
region = sess.boto_region_name
bucket = sess.default_bucket()
# Key prefix used when building this notebook's S3 paths (data channels, output).
prefix = 'sagemaker-studio-book/chapter05'

In [None]:
from time import gmtime, strftime
import time

In [None]:
# -O pins the output file name, so re-running this cell overwrites the archive
# instead of saving a numbered copy (dbpedia_csv.tar.gz.1) that tar would never read.
!wget -q -O dbpedia_csv.tar.gz https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz
!tar -xzf dbpedia_csv.tar.gz

In [None]:
# Show the first three raw training records (options placed before the file operand,
# which does not rely on GNU-style argument permutation).
!head -n 3 dbpedia_csv/train.csv

In [None]:
# Case-insensitive search of the raw training file for rows containing "automatic electric".
!grep -i "automatic electric"  dbpedia_csv/train.csv 

In [None]:
# Print the class-name file; the next cell reads it line by line to build the label lookup.
!cat dbpedia_csv/classes.txt

In [None]:
# Map the 1-based line number (as a string) of each line in classes.txt to the
# whitespace-stripped class name found on that line.
with open('dbpedia_csv/classes.txt') as classes_file:
    d_label = {
        str(position): class_name.strip()
        for position, class_name in enumerate(classes_file, start=1)
    }
print(d_label)

In [None]:
import nltk

# nltk.word_tokenize needs the Punkt sentence tokenizer data. NLTK 3.9 and later
# load it from the 'punkt_tab' resource rather than 'punkt', so fetch both to keep
# the tokenization step working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def transform_text(row):
    """Convert one CSV record into a labelled list of tokens.

    The first element of the result is the class name looked up in ``d_label``
    under ``row[0]``, prefixed with ``__label__``. It is followed by the NLTK
    word tokens of the lowercased ``row[1]`` and then of the lowercased ``row[2]``.
    """
    tokens = [f'__label__{d_label[row[0]]}']
    for column in (1, 2):
        tokens += nltk.word_tokenize(row[column].lower())
    return tokens

In [None]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv

def preprocess(input_file, output_file, keep=1, seed=None):
    """Shuffle, subsample and tokenize a CSV file into a space-delimited text file.

    Args:
        input_file: Path of the comma-delimited CSV file to read.
        output_file: Path of the file to write; one transformed row per line,
            fields separated by a single space.
        keep: Fraction of the shuffled rows to keep (``int(keep * n_rows)`` rows).
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` so the kept subset is reproducible; when
            ``None`` the module-level ``shuffle`` is used, as before.
    """
    # newline='' is what the csv module expects so that newlines embedded in quoted
    # fields are parsed correctly; the explicit encoding avoids depending on the locale.
    with open(input_file, 'r', newline='', encoding='utf-8') as csvinfile:
        all_rows = list(csv.reader(csvinfile, delimiter=','))

    if seed is None:
        shuffle(all_rows)
    else:
        from random import Random
        Random(seed).shuffle(all_rows)
    all_rows = all_rows[: int(keep * len(all_rows))]

    # The context manager tears the worker processes down even if map() raises.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        transformed_rows = pool.map(transform_text, all_rows)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [None]:
%%time
# Training file: keep 20% of the shuffled rows of train.csv.
preprocess('dbpedia_csv/train.csv', 'dbpedia_csv/dbpedia.train', keep=0.2)
# Validation file: built from test.csv with the default keep=1 (all rows).
preprocess('dbpedia_csv/test.csv', 'dbpedia_csv/dbpedia.validation')

In [None]:
# Show the first line of the preprocessed training file.
!head -n 1 dbpedia_csv/dbpedia.train

In [None]:
# Resolve the 'blazingtext' (version '1') container image URI for the session's region.
image = sagemaker.image_uris.retrieve(framework='blazingtext', region=region, version='1')
print(image)

In [None]:
# S3 location handed to the estimator as its output path.
s3_output_location = f's3://{bucket}/{prefix}/output'

# Hyperparameters passed to the training container.
bt_hyperparameters = dict(
    mode='supervised',
    epochs=20,
    min_count=2,
    learning_rate=0.05,
    vector_dim=10,
    early_stopping=True,
    patience=4,
    min_epochs=5,
    word_ngrams=2,
)

estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type='ml.c5.2xlarge',
    volume_size=30,
    max_run=360000,
    input_mode='File',
    enable_sagemaker_metrics=True,
    output_path=s3_output_location,
    hyperparameters=bt_hyperparameters,
)

In [None]:
# S3 key prefixes for the two data channels.
train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

# Upload the preprocessed files before training so the channels below point at real data.
sess.upload_data(path='dbpedia_csv/dbpedia.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='dbpedia_csv/dbpedia.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = f's3://{bucket}/{train_channel}'
s3_validation_data = f's3://{bucket}/{validation_channel}'
print(s3_train_data)
print(s3_validation_data)

# Channel name -> S3 prefix mapping passed to fit(); also reused by the experiment loop later.
data_channels = {'train': s3_train_data, 
                 'validation': s3_validation_data}

# Timestamped job name (UTC, one-second resolution).
exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'dbpedia-blazingtext-{exp_datetime}'

estimator.fit(inputs=data_channels,
              job_name=jobname,
              logs=True)

In [None]:
# Location of the trained model artifact; the next cell copies it from S3.
estimator.model_data

In [None]:
# Copy the model artifact from S3 into the local dbpedia_csv/ folder.
!aws s3 cp {estimator.model_data} ./dbpedia_csv/

In [None]:
%%sh
# Unpack the downloaded model archive inside dbpedia_csv/.
cd dbpedia_csv/
tar -zxf model.tar.gz

-------

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the smexperiments import in the next cell can find the package.
%pip install -q sagemaker-experiments

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError

experiment_name = 'dbpedia-text-classification'

# Create the experiment, or reuse it when it is already there. A ClientError from
# create() is only treated as "already exists" if the experiment can actually be
# loaded; any other failure (permissions, throttling, ...) is re-raised instead of
# being reported as a pre-existing experiment.
try:
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description='Training a text classification model using dbpedia dataset.')
except ClientError as create_error:
    try:
        # Also binds `experiment` on this path, which the original left undefined.
        experiment = Experiment.load(experiment_name=experiment_name)
    except ClientError:
        raise create_error
    print(f'{experiment_name} experiment already exists! Reusing the existing experiment.')

In [None]:
# Submit one training job per learning rate; each job is recorded as its own trial
# of the experiment and runs asynchronously (wait=False).
for job_idx, lr in enumerate([0.1, 0.01, 0.001]):

    exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
    # The timestamp has one-second resolution and fit(wait=False) does not block
    # until the job finishes, so two iterations can land in the same second.
    # The index suffix keeps the training-job and trial names unique.
    jobname = f'dbpedia-blazingtext-{exp_datetime}-{job_idx}'

    # Creating a new trial for the experiment
    exp_trial = Trial.create(
        experiment_name=experiment_name,
        trial_name=jobname)

    # Associates the training job with the experiment and the trial created above.
    experiment_config = {
        'ExperimentName': experiment_name,
        'TrialName': exp_trial.trial_name,
        'TrialComponentDisplayName': 'Training'}

    estimator = sagemaker.estimator.Estimator(
                    image,
                    role,
                    instance_count=1,
                    instance_type='ml.c5.2xlarge',
                    volume_size=30,
                    max_run=360000,
                    input_mode='File',
                    enable_sagemaker_metrics=True,
                    output_path=s3_output_location,
                    hyperparameters={
                        'mode': 'supervised',
                        'epochs': 40,
                        'min_count': 2,
                        'learning_rate': lr,  # the only value varied across trials
                        'vector_dim': 10,
                        'early_stopping': True,
                        'patience': 4,
                        'min_epochs': 5,
                        'word_ngrams': 2},
    )

    estimator.fit(
             inputs=data_channels,
             job_name=jobname,
             experiment_config=experiment_config,
             wait=False)
    print(f'Submitted training job {jobname}')