In [1]:
!pip install pandas
!pip install boto3
!pip install transformers==4.18.0
!pip install torch

import os
import boto3
import s3fs
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from transformers import MobileBertTokenizer
import sagemaker
from sagemaker.pytorch import PyTorch
from datetime import datetime




In [2]:

# -----CHANGE THESE, NOTHING ELSE ---------

train_data = 'data/bal_train.csv'
model_name = 'bal_model1'

# -----------------------------------------

# S3 client and the bucket every path below lives in
s3 = boto3.client('s3', region_name='ap-southeast-2')
bucket_name = 'sdg-project'

# Paths (all derived from bucket_name / model_name so they cannot drift apart)
split_path = f'models/{model_name}/Split_Data/'        # S3 key prefix for the train/val CSVs
artifact_path = f'models/{model_name}/artifacts/'      # S3 key prefix for model artifacts
s3_path = f's3://{bucket_name}/models/{model_name}/'   # full S3 URI of this model's folder
local_base_path = f'Models/{model_name}/Split_Data/'   # local staging directory for the CSVs

# Make sure S3 "directories" exist: writes an empty object under each
# trailing-slash key so the prefix is present in the bucket
s3.put_object(Bucket=bucket_name, Key=split_path)
s3.put_object(Bucket=bucket_name, Key=artifact_path)

# Role and session for SageMaker
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()


In [3]:
# Load data: fetch the training CSV object from S3, decode its body as UTF-8
# text and parse it into a DataFrame
csv_obj = s3.get_object(Key=train_data, Bucket=bucket_name)
csv_string = csv_obj['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(csv_string))

# Row count followed by a preview of the first rows
print(len(df), df.head(), sep='\n')

4626
                                                Text  SDG 1  SDG 2  SDG 3  \
0  Australian Women in Security Network (AWSN) Re...      0      0      0   
1  Mental Health and Wellbeing During the COVID-1...      0      0      1   
2  ALCA Policy Roadmap Australian Land Conservati...      0      0      0   
3  Go hydrogen! Promoting Japanese awareness of A...      0      0      0   
4  Agent models of tsunami evacuation behaviour t...      0      0      0   

   SDG 4  SDG 5  SDG 6  SDG 7  SDG 8  SDG 9  SDG 10  SDG 11  SDG 12  SDG 13  \
0      0      1      0      0      1      0       0       0       0       0   
1      0      0      0      0      1      0       0       0       0       0   
2      0      0      0      0      0      0       0       0       0       0   
3      0      0      0      1      0      0       0       0       0       0   
4      1      0      0      0      0      0       0       0       0       0   

   SDG 14  SDG 15  SDG 16  SDG 17  
0       0       0    

In [4]:
# Keep only the rows whose 'Text' cell is an actual Python str
valid_rows = df['Text'].map(lambda value: isinstance(value, str))
df = df.loc[valid_rows]

# Row count after filtering
print(len(df))

4626


In [5]:
# Features are the 'Text' column; labels are every remaining column
X = df['Text']
y = df.drop(columns=['Text'])

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [6]:
# Ensure the local staging directory exists
fs = s3fs.S3FileSystem()
local_base_path = f'Models/{model_name}/Split_Data/'
os.makedirs(local_base_path, exist_ok=True)

# File name -> data for each of the four split files
splits = {
    'X_train.csv': X_train,
    'y_train.csv': y_train,
    'X_val.csv': X_val,
    'y_val.csv': y_val,
}

# Save each split locally first, then copy the file's bytes to S3
for file_name, frame in splits.items():
    local_file = f'{local_base_path}{file_name}'
    frame.to_csv(local_file, index=False)

    # Both handles are context-managed so the local file is closed after the copy
    with open(local_file, 'rb') as src, fs.open(f'{s3_path}Split_Data/{file_name}', 'wb') as dst:
        dst.write(src.read())

# Optional, remove local files
# for file_name in splits:
#     os.remove(f'{local_base_path}{file_name}')


In [7]:
# Sanity check: count the Python type of every training text entry
print(X_train.map(type).value_counts())

<class 'str'>    3469
Name: Text, dtype: int64


In [8]:
# Initialise the MobileBERT tokenizer.
# MobileBertTokenizer is already imported in the notebook's import cell, so it
# is not re-imported here.
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased', do_lower_case=True)

#Function to tokenise text
def tokenize_texts(texts, max_length=512):
    """Tokenise a batch of texts with the notebook-level ``tokenizer``.

    Args:
        texts: Batch of texts to encode; passed straight to
            ``tokenizer.batch_encode_plus``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns when asked for
        special tokens, attention masks and PyTorch tensors
        (``return_tensors='pt'``).
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',      # pad every sequence up to max_length
        truncation=True,           # cut longer sequences down to max_length
        max_length=max_length,
        return_tensors='pt'
    )

# Tokenise the training and validation texts (each Series is converted to a
# plain Python list first); training is encoded before validation
tokenized_train, tokenized_val = (
    tokenize_texts(split.tolist()) for split in (X_train, X_val)
)


In [9]:
# #Create unique name for output based on the current timestamp
# current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# output_name = f"LLM_model_{current_time}"
# print(output_name)


# Define the output path for saving the model artifacts.
# Built from bucket_name / artifact_path so it follows the S3 layout defined in
# the configuration cell; the trailing slash is stripped so the value is
# identical to the former literal 's3://sdg-project/models/<model_name>/artifacts'.
output_path = f"s3://{bucket_name}/{artifact_path.rstrip('/')}"

# Initialise the PyTorch estimator.
# Note: train.py is saved locally in SageMaker instance along with the .txt file
estimator = PyTorch(entry_point='train.py',
                    dependencies=['requirements.txt'],
                    role=role,
                    framework_version='1.8.0',
                    py_version='py3',
                    instance_count=1,
                    # NOTE(review): the captured job log reports "No GPUs detected"
                    # for this instance type, i.e. training runs on CPU.
                    instance_type='ml.m5.4xlarge',
                    output_path=output_path,
                    sagemaker_session=sagemaker_session,
                    # Passed through to train.py. NOTE(review): step_size / gamma are
                    # presumably LR-scheduler settings and patience an early-stopping
                    # limit; confirm against train.py.
                    hyperparameters={
                        'epochs': 30,
                        'learning_rate': 1e-4, 
                        'batch_size': 16,
                        'step_size': 10,
                        'patience' : 4,
                        'gamma': 0.5 
                    })


NOTEBOOK_METADATA_FILE detected but failed to get valid domain and user from it.


In [None]:
# Locations of the training and validation CSVs: all four live under the same
# S3 prefix, so build that prefix once
split_uri = f's3://{bucket_name}/{split_path}'
train_input_X = sagemaker.TrainingInput(s3_data=f'{split_uri}X_train.csv', content_type='csv')
train_input_y = sagemaker.TrainingInput(s3_data=f'{split_uri}y_train.csv', content_type='csv')
val_input_X = sagemaker.TrainingInput(s3_data=f'{split_uri}X_val.csv', content_type='csv')
val_input_y = sagemaker.TrainingInput(s3_data=f'{split_uri}y_val.csv', content_type='csv')

# Start training job, one named input channel per CSV
channels = {
    'train_X': train_input_X,
    'train_y': train_input_y,
    'val_X': val_input_X,
    'val_y': val_input_y,
}
estimator.fit(channels)


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-10-26-04-05-28-024


Using provided s3_resource
2023-10-26 04:05:28 Starting - Starting the training job...
2023-10-26 04:05:42 Starting - Preparing the instances for training...
2023-10-26 04:06:27 Downloading - Downloading input data...
2023-10-26 04:06:47 Training - Downloading the training image...
2023-10-26 04:07:23 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-10-26 04:07:47,641 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-10-26 04:07:47,643 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-10-26 04:07:47,653 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-10-26 04:07:47,655 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-10-26 04:07:47,

[34mTransformers module imported successfully![0m
[34mtransformers Version: 4.18.0[0m
[34m[2023-10-26 04:08:40.902 algo-1:40 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-10-26 04:08:41.048 algo-1:40 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2023-10-26 04:08:41.048 algo-1:40 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2023-10-26 04:08:41.048 algo-1:40 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2023-10-26 04:08:41.049 algo-1:40 INFO hook.py:253] Saving to /opt/ml/output/tensors[0m
[34m[2023-10-26 04:08:41.049 algo-1:40 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2023-10-26 04:08:41.068 algo-1:40 INFO hook.py:584] name:mobilebert.embeddings.word_embeddings.weight count_params:3906816[0m
[34m[2023-10-2

[34mEpoch 1, Step 0, Loss: 4176701.75[0m
[34mEpoch 1, Step 16, Loss: 0.8615745902061462[0m
[34mEpoch 1, Step 32, Loss: 0.9807386994361877[0m
[34mEpoch 1, Step 48, Loss: 0.9193110466003418[0m
[34mEpoch 1, Step 64, Loss: 0.928370475769043[0m
[34mEpoch 1, Step 80, Loss: 0.9111267924308777[0m
[34mEpoch 1, Step 96, Loss: 0.8141156435012817[0m
[34mEpoch 1, Step 112, Loss: 0.796457827091217[0m
[34mEpoch 1, Step 128, Loss: 0.6257028579711914[0m
[34mEpoch 1, Step 144, Loss: 0.6169062852859497[0m
[34mEpoch 1, Step 160, Loss: 0.5396451950073242[0m
[34mEpoch 1, Step 176, Loss: 0.44501709938049316[0m
[34mEpoch 1, Step 192, Loss: 0.39699482917785645[0m
[34mEpoch 1, Step 208, Loss: 0.39240801334381104[0m
[34mEpoch 1/30, Average Loss: 48083.28546241489, Time taken: 802.81 seconds[0m
[34mValidation Loss after Epoch 1/30: 0.3918631378918478[0m
[34mEpoch 2, Step 0, Loss: 0.400378942489624[0m
[34mEpoch 2, Step 16, Loss: 0.43489447236061096[0m
[34mEpoch 2, Step 32, Loss

[34mEpoch 2/30, Average Loss: 0.37883031903873393, Time taken: 787.59 seconds[0m
[34mValidation Loss after Epoch 2/30: 0.3546633834708227[0m
[34mEpoch 3, Step 0, Loss: 0.3728722035884857[0m
[34mEpoch 3, Step 16, Loss: 0.38793879747390747[0m
[34mEpoch 3, Step 32, Loss: 0.4109104871749878[0m
[34mEpoch 3, Step 48, Loss: 0.3123508095741272[0m
[34mEpoch 3, Step 64, Loss: 0.3133867383003235[0m
[34mEpoch 3, Step 80, Loss: 0.2816030979156494[0m
[34mEpoch 3, Step 96, Loss: 0.28599223494529724[0m
[34mEpoch 3, Step 112, Loss: 0.33625268936157227[0m
[34mEpoch 3, Step 128, Loss: 0.31584498286247253[0m
