In [1]:
import sagemaker
import json
import boto3

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()
prefix = 'sagemaker-studio-book/chapter06'

In [2]:
!wget -q https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz
!tar -xzf dbpedia_csv.tar.gz

tar: dbpedia_csv/test.csv: Cannot change ownership to uid 1000, gid 1000: Operation not permitted
tar: dbpedia_csv/classes.txt: Cannot change ownership to uid 1000, gid 1000: Operation not permitted
tar: dbpedia_csv/train.csv: Cannot change ownership to uid 1000, gid 1000: Operation not permitted
tar: dbpedia_csv/readme.txt: Cannot change ownership to uid 1000, gid 1000: Operation not permitted
tar: dbpedia_csv: Cannot change ownership to uid 1000, gid 1000: Operation not permitted
tar: Exiting with failure status due to previous errors


In [2]:
!head dbpedia_csv/train.csv -n 3

1,"E. D. Abbott Ltd"," Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972."
1,"Schwan-Stabilo"," Schwan-STABILO is a German maker of pens for writing colouring and cosmetics as well as markers and highlighters for office use. It is the world's largest manufacturer of highlighter pens Stabilo Boss."
1,"Q-workshop"," Q-workshop is a Polish company located in Poznań that specializes in designand production of polyhedral dice and dice accessories for use in various games (role-playing gamesboard games and tabletop wargames). They also run an online retail store and maintainan active forum community.Q-workshop was established in 2001 by Patryk Strzelewicz – a student from Poznań. Initiallythe company sold its products via online auction services but in 2005 a website and online store wereestablishe

In [4]:
!grep -i "automatic electric"  dbpedia_csv/train.csv 

1,"Automatic Electric"," Automatic Electric Company (AE) was the largest of the manufacturing units of the Automatic Electric Group. It was a telephone equipment supplier for independent telephone companies in North America and also had a world-wide presence. With its line of automatic telephone exchanges it was also a long-term supplier of switching equipment to the Bell System starting in 1919."
1,"Tokyo Marui"," Tokyo Marui Co. Ltd (株式会社東京マルイ Kabushiki-gaisha Tōkyō Marui) is an airsoft gun manufacturer located in Adachi Tokyo Japan. They are best known for creating the AEG(Automatic electric gun). Its main market is Japan but third-party retailers sell in Hong Kong (PRC) Taiwan (ROC) South Korea East Asia and worldwide. Such is the popularity of its guns that the company has its own center for airsoft sport called Tokyo Marui BB Sports Field."


In [5]:
!cat dbpedia_csv/classes.txt

Company
EducationalInstitution
Artist
Athlete
OfficeHolder
MeanOfTransportation
Building
NaturalPlace
Village
Animal
Plant
Album
Film
WrittenWork


In [6]:
d_label = {}
with open('dbpedia_csv/classes.txt') as f:
    for i, label in enumerate(f.readlines()):
        d_label[str(i + 1)] = label.strip()
print(d_label)

{'1': 'Company', '2': 'EducationalInstitution', '3': 'Artist', '4': 'Athlete', '5': 'OfficeHolder', '6': 'MeanOfTransportation', '7': 'Building', '8': 'NaturalPlace', '9': 'Village', '10': 'Animal', '11': 'Plant', '12': 'Album', '13': 'Film', '14': 'WrittenWork'}


In [7]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def transform_text(row):
    cur_row = []
    label = f'__label__{d_label[row[0]]}'  # Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[1].lower()))
    cur_row.extend(nltk.word_tokenize(row[2].lower()))
    return cur_row

In [9]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv

def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, 'r') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[: int(keep * len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_text, all_rows)
    pool.close()
    pool.join()

    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [9]:
%%time

# Preparing the training dataset

# Since processing the whole dataset may take a long time,
# we only use 20% of the training dataset for this example.
# Set keep=1 if you want to use the whole dataset
preprocess('dbpedia_csv/train.csv', 'dbpedia_csv/dbpedia.train', keep=0.2)

# Preparing the validation dataset
preprocess('dbpedia_csv/test.csv', 'dbpedia_csv/dbpedia.validation')

CPU times: user 12.9 s, sys: 1.15 s, total: 14.1 s
Wall time: 1min 29s


In [10]:
!head -n 1 dbpedia_csv/dbpedia.train

__label__Company automatic electric automatic electric company ( ae ) was the largest of the manufacturing units of the automatic electric group . it was a telephone equipment supplier for independent telephone companies in north america and also had a world-wide presence . with its line of automatic telephone exchanges it was also a long-term supplier of switching equipment to the bell system starting in 1919 .


In [11]:
s3_output_location = f's3://{bucket}/{prefix}/output'

In [12]:
image=sagemaker.image_uris.retrieve(framework='blazingtext', 
                                    region=region, 
                                    version='1')
print(image)

433757028032.dkr.ecr.us-west-2.amazonaws.com/blazingtext:1


In [14]:
estimator = sagemaker.estimator.Estimator(
            image,
            role,
            instance_count=1,
            instance_type='ml.c4.4xlarge',
            volume_size=30,
            max_run=360000,
            input_mode='File',
            enable_sagemaker_metrics=True,
            output_path=s3_output_location,
            hyperparameters={
                'mode': 'supervised',
                'epochs': 20,
                'min_count': 2,
                'learning_rate': 0.05,
                'vector_dim': 10,
                'early_stopping': True,
                'patience': 4,
                'min_epochs': 5,
                'word_ngrams': 2,
            },
)

In [15]:
train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='dbpedia_csv/dbpedia.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='dbpedia_csv/dbpedia.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = f's3://{bucket}/{train_channel}'
s3_validation_data = f's3://{bucket}/{validation_channel}'
print(s3_train_data)
print(s3_validation_data)

data_channels = {'train': s3_train_data, 
                 'validation': s3_validation_data}

estimator.fit(inputs=data_channels,
              job_name=jobname,
              logs=True)

s3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter06/train
s3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter06/validation


In [19]:
estimator.model_data

's3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter06/output/dbpedia-blazingtext-2021-09-20-22-40-18/output/model.tar.gz'

In [22]:
!aws s3 cp {estimator.model_data} ./dbpedia_csv/

download: s3://sagemaker-us-west-2-552106442228/sagemaker-studio-book/chapter06/output/dbpedia-blazingtext-2021-09-20-22-40-18/output/model.tar.gz to dbpedia_csv/model.tar.gz


In [23]:
%%sh
cd dbpedia_csv/
tar -zxf model.tar.gz

In [24]:
pwd

'/root/Machine-Learning-Development-with-Amazon-SageMaker-Studio/chapter06'

In [26]:
%%sh
wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
unzip v0.9.2.zip
cd fastText-0.9.2
make 

Archive:  v0.9.2.zip
5b5943c118b0ec5fb9cd8d20587de2b2d3966dfe
   creating: fastText-0.9.2/
   creating: fastText-0.9.2/.circleci/
  inflating: fastText-0.9.2/.circleci/cmake_test.sh  
  inflating: fastText-0.9.2/.circleci/config.yml  
  inflating: fastText-0.9.2/.circleci/gcc_test.sh  
  inflating: fastText-0.9.2/.circleci/pip_test.sh  
  inflating: fastText-0.9.2/.circleci/pull_data.sh  
  inflating: fastText-0.9.2/.circleci/python_test.sh  
  inflating: fastText-0.9.2/.circleci/run_locally.sh  
  inflating: fastText-0.9.2/.circleci/setup_circleimg.sh  
  inflating: fastText-0.9.2/.circleci/setup_debian.sh  
  inflating: fastText-0.9.2/.gitignore  
  inflating: fastText-0.9.2/CMakeLists.txt  
  inflating: fastText-0.9.2/CODE_OF_CONDUCT.md  
  inflating: fastText-0.9.2/CONTRIBUTING.md  
  inflating: fastText-0.9.2/LICENSE  
  inflating: fastText-0.9.2/MANIFEST.in  
  inflating: fastText-0.9.2/Makefile  
  inflating: fastText-0.9.2/README.md  
   creating: fastText-0.9.2/alignment/
  in

--2021-09-20 23:55:23--  https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/facebookresearch/fastText/zip/v0.9.2 [following]
--2021-09-20 23:55:23--  https://codeload.github.com/facebookresearch/fastText/zip/v0.9.2
Resolving codeload.github.com (codeload.github.com)... 192.30.255.121
Connecting to codeload.github.com (codeload.github.com)|192.30.255.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘v0.9.2.zip’

     0K .......... .......... .......... .......... .......... 8.65M
    50K .......... .......... .......... .......... .......... 4.99M
   100K .......... .......... .......... .......... .......... 53.0M
   150K .......... .......... .......... .......... .......... 50.8M
   200K ......

CalledProcessError: Command 'b'wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip\nunzip v0.9.2.zip\ncd fastText-0.9.2\nmake \n'' returned non-zero exit status 127.

In [25]:
!fasttest predict dbpedia_csv/model.bin dbpedia_csv/dbpedia.validation

/bin/sh: 1: fasttest: not found


In [13]:
!pip -q install sagemaker-experiments

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [23]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError
from time import gmtime, strftime
import time

experiment_name = 'dbpedia-text-classification'

try:
    experiment = Experiment.create(
        experiment_name=experiment_name, 
        description='Training a text classification model using dbpedia dataset.')
except ClientError as e:
    print(f'{experiment_name} experiment already exists! Reusing the existing experiment.')

In [24]:
for lr in [0.1, 0.01, 0.001]:
    
    exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
    jobname = f'dbpedia-blazingtext-{exp_datetime}'

    # Creating a new trial for the experiment
    exp_trial = Trial.create(
        experiment_name=experiment_name, 
        trial_name=jobname
    )

    experiment_config={
        'ExperimentName': experiment_name,
        'TrialName': exp_trial.trial_name,
        'TrialComponentDisplayName': 'Training',

    }
    estimator = sagemaker.estimator.Estimator(
                    image,
                    role,
                    instance_count=1,
                    instance_type='ml.c4.4xlarge',
                    volume_size=30,
                    max_run=360000,
                    input_mode='File',
                    enable_sagemaker_metrics=True,
                    output_path=s3_output_location,
                    hyperparameters={
                        'mode': 'supervised',
                        'epochs': 40,
                        'min_count': 2,
                        'learning_rate': lr,
                        'vector_dim': 10,
                        'early_stopping': True,
                        'patience': 4,
                        'min_epochs': 5,
                        'word_ngrams': 2,
                    },
    )
    
    estimator.fit(
             inputs=data_channels,
             job_name=jobname,
             experiment_config=experiment_config,
             wait=False)
    print(f'Submitted training job {jobname}')

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: dbpedia-blazingtext-2021-09-26-22-50-04


Submitted training job dbpedia-blazingtext-2021-09-26-22-50-04


INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: dbpedia-blazingtext-2021-09-26-22-50-05


Submitted training job dbpedia-blazingtext-2021-09-26-22-50-05


INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: dbpedia-blazingtext-2021-09-26-22-50-15


Submitted training job dbpedia-blazingtext-2021-09-26-22-50-15
