### Install the necessary packages

In [None]:
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, whereas !pip runs whichever `pip` is first on the shell PATH.
# TODO: pin versions (pkg==x.y.z) once a known-good set is established.
%pip install sagemaker
%pip install transformers
%pip install torch
%pip install datasets
%pip install sagemaker_training

In [3]:
import sagemaker
import datetime
import time
import tarfile
import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import Session


### Connecting the notebook to SageMaker and creating a session

In [4]:
# SageMaker SDK session; reused further down for the S3 uploads.
sess = sagemaker.Session()

# Low-level boto3 client for the SageMaker API.
sm_boto3 = boto3.client("sagemaker")

# Region the session resolved to; printed so the reader can see where resources go.
my_region = sess.boto_session.region_name
print(my_region)

us-east-1


### Creating S3 bucket

In [5]:
# Name of the S3 bucket this notebook creates, uploads train.csv/test.csv into,
# and empties again in the final cleanup cell.
bucket_name = 'testcaseclassification1'

In [6]:
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location and must be created WITHOUT a
        # LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly; previously this case
        # fell through silently and no bucket was created.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print(f"The bucket {bucket_name} successfully created")
except Exception as e:
    # Report the underlying cause (name taken, missing permission, ...) instead
    # of an anonymous message, so the failure can be acted on.
    print('S3 error:', e)
    

The bucket testcaseclassification1 successfully created


In [63]:
# `prefix` is otherwise only assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell raised NameError. Assign it here, with the same
# value, so the notebook runs top to bottom.
prefix = 'distilbertmodel'

# S3 location intended for training output artifacts.
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://testcaseclassification1/distilbertmodel/output


### Reading the data

In [7]:
# Load the scenario workbook and discard its 'Unnamed: 0' column in one expression
# (presumably a leftover index column from an earlier export -- confirm).
df = pd.read_excel("Deposit - Unified_output.xlsx").drop(columns=['Unnamed: 0'])

In [8]:
# Keep the free-text column plus the three label columns the training script reads.
selected_columns = ['Scenario Description', 'Function', 'Sub-Function', 'Feature']
df = df.loc[:, selected_columns]
df.head()

Unnamed: 0,Scenario Description,Function,Sub-Function,Feature
0,Scenario to Validate one month Mudaraba depos...,DEPOSIT,Creation,Deposit Creation
1,Scenario to Validate one month Mudaraba depos...,DEPOSIT,Authorization,Deposit Authorization
2,Scenario to validate created one month Mudarab...,DEPOSIT,Enquiry,Accounting Entries
3,Scenario to Validate one USD month Mudaraba d...,DEPOSIT,Creation,Deposit Creation
4,Scenario to Validate one USD month Mudaraba d...,DEPOSIT,Authorization,Deposit Authorization


### Splitting the dataset into train and test sets

In [9]:
# Shuffle once with a fixed seed, then cut at 70 % for a reproducible split.
# Positional slicing replaces np.split(<DataFrame>, ...), which routes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
shuffled = df.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(df))
train_data, test_data = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

# Every row must land in exactly one of the two splits.
assert len(train_data) + len(test_data) == len(df)
print(train_data.shape, test_data.shape)

train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

(819, 4) (351, 4)


  return bound(*args, **kwds)


### Saving the train and test datasets to the S3 bucket

In [10]:
# Key prefix under which this notebook's objects are stored inside the bucket.
prefix = 'distilbertmodel'

In [11]:
# Push the training split to S3; the returned S3 URI is shown as the cell output.
trainpath = sess.upload_data(path="train.csv", key_prefix=f"{prefix}/train", bucket=bucket_name)
trainpath

's3://testcaseclassification1/distilbertmodel/train/train.csv'

In [12]:
# Push the held-out split to S3; the returned S3 URI is shown as the cell output.
testpath = sess.upload_data(path="test.csv", key_prefix=f"{prefix}/test", bucket=bucket_name)
testpath

's3://testcaseclassification1/distilbertmodel/test/test.csv'

### Create Training Script

In [13]:
%%writefile distilbert_script.py

import argparse
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
import joblib
import boto3


# Custom Dataset class
class ScenarioFunctionDataset(Dataset):
    """Torch dataset pairing tokenized texts with their label rows.

    All texts are tokenized once, up front, in the constructor; ``__getitem__``
    only slices the stored encodings and converts them to tensors.

    Args:
        texts: object exposing ``.tolist()`` that yields the input strings.
        labels: indexable collection with one label row per text.
        tokenizer: callable invoked as
            ``tokenizer(list_of_texts, padding=True, truncation=True, max_length=256)``
            and returning a mapping of field name -> per-example sequences.
    """

    def __init__(self, texts, labels, tokenizer):
        self.labels = labels
        # Adjust max_length based on your specific needs and resources.
        self.encodings = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=256)

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with float32 ``labels`` added."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Load the model
def model_fn(model_dir):
    """Load the inference artifacts stored in ``model_dir``.

    Reads ``model.joblib`` and ``label_encoder.joblib`` with joblib and the
    tokenizer via ``DistilBertTokenizer.from_pretrained(model_dir)``.

    Returns:
        tuple: ``(model, tokenizer, label_encoder)`` in that order, which is the
        shape ``predict_fn`` unpacks.
    """
    def _load(filename):
        # NOTE(review): joblib.load unpickles arbitrary objects -- only point this
        # at artifacts produced by a trusted training run.
        return joblib.load(os.path.join(model_dir, filename))

    model = _load("model.joblib")
    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    label_encoder = _load("label_encoder.joblib")
    return model, tokenizer, label_encoder

def decode_predictions(predictions, label_encoder):
    """Turn binarized predictions back into labels.

    Thin wrapper: returns whatever ``label_encoder.inverse_transform(predictions)``
    returns, unchanged.
    """
    return label_encoder.inverse_transform(predictions)

def predict_fn(input_data, model_and_tokenizer):
    """Predict labels for a batch of scenario descriptions.

    Args:
        input_data: text(s) to classify, passed straight to the tokenizer.
        model_and_tokenizer: the ``(model, tokenizer, label_encoder)`` tuple
            returned by ``model_fn``.

    Returns:
        The decoded labels produced by ``decode_predictions``; an empty list
        when ``input_data`` is an empty list/tuple.
    """
    model, tokenizer, label_encoder = model_and_tokenizer
    model.eval()

    # Nothing to tokenize: return early instead of failing inside the tokenizer.
    if isinstance(input_data, (list, tuple)) and len(input_data) == 0:
        return []

    # Tokenize the input data
    encodings = tokenizer(
        input_data,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    # Run on whichever device the model's weights live on. The tokenizer output is
    # created on CPU, so without this move a model sitting on a GPU would fail with
    # a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Perform prediction
    with torch.no_grad():
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # A logit above 0 is equivalent to a sigmoid probability above 0.5.
        predictions = (logits > 0.0).float().cpu().numpy()

    # Decode the predictions
    decoded_predictions = decode_predictions(predictions, label_encoder)
    return decoded_predictions

if __name__ == "__main__":
    # Training entry point. Everything below runs only when the file is executed as
    # a script. The file is also used as an inference entry point, where it is
    # imported for model_fn/predict_fn, so no training or smoke-test code may live
    # at module level.
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=8)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")

    # parse_known_args: tolerate extra arguments this script does not declare.
    args, _ = parser.parse_known_args()

    print("[INFO] Reading data")

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    X_train = train_df['Scenario Description'].astype(str)
    y_train = train_df[['Function', 'Sub-Function', 'Feature']]

    X_test = test_df['Scenario Description'].astype(str)
    y_test = test_df[['Function', 'Sub-Function', 'Feature']]

    # The label space is the union of every value seen in the three TRAINING label
    # columns; each row becomes the set of those values it contains. Test values
    # never seen in training are dropped by the intersection.
    all_classes = set(y_train['Function']).union(set(y_train['Sub-Function'])).union(set(y_train['Feature']))
    y_train_combined = y_train.apply(lambda x: tuple(all_classes & set(x)), axis=1)
    y_test_combined = y_test.apply(lambda x: tuple(all_classes & set(x)), axis=1)

    # Fit the label encoder on the training data only, with a deterministic class
    # order (set iteration order for strings varies between runs). key=str keeps the
    # sort safe if the columns hold mixed types.
    label_encoder = MultiLabelBinarizer(classes=sorted(all_classes, key=str))
    y_train_encoded = label_encoder.fit_transform(y_train_combined)
    # transform, not fit_transform: the test split must be encoded with the encoder
    # fitted on the training data, never re-fitted.
    y_test_encoded = label_encoder.transform(y_test_combined)

    # Save the label encoder to a file
    label_encoder_path = os.path.join(args.model_dir, "label_encoder.joblib")
    joblib.dump(label_encoder, label_encoder_path)

    model_name = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    num_labels = len(all_classes)
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    train_dataset = ScenarioFunctionDataset(X_train, y_train_encoded, tokenizer)
    test_dataset = ScenarioFunctionDataset(X_test, y_test_encoded, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    num_epochs = args.epochs
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model return the loss alongside the logits.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        average_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}: Average training loss = {average_train_loss}")

    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # A logit above 0 is equivalent to a sigmoid probability above 0.5.
            predictions = (logits > 0.0).float()

            all_predictions.append(predictions.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    # With 2-D indicator arrays accuracy_score is exact-match (subset) accuracy:
    # a row counts as correct only if every label in it is right.
    accuracy = accuracy_score(all_labels, all_predictions)

    print(f"Test Accuracy: {accuracy:.4f}")

    # Move the weights to CPU before pickling so the artifact can be loaded on a
    # host without a GPU; a model pickled on a CUDA device cannot be.
    model.to('cpu')

    # Save the model using joblib
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)

    # Save the tokenizer
    tokenizer.save_pretrained(args.model_dir)

    # Smoke-test the inference path against the artifacts just written. This must
    # stay inside the __main__ guard: it reads `args`, which does not exist when
    # the module is merely imported.
    input_data = ["Example scenario description"]

    # Make a prediction
    prediction = predict_fn(input_data, model_fn(args.model_dir))
    print("Prediction:", prediction)


Writing distilbert_script.py


In [None]:
# Scratch cell: everything below is a single string literal, so none of it runs.
# It holds a draft of a decode-and-predict snippet that refers to `tokenizer`,
# `device`, `model`, `decode_predictions` and `label_encoder`.
'''
    # Decode and predict
    sample_texts = ["Sample scenario description 1", "Sample scenario description 2"]
    encodings = tokenizer(sample_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
    encodings = {key: val.to(device) for key, val in encodings.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
        predictions = (logits > 0.0).float().cpu().numpy()

    decoded_predictions = decode_predictions(predictions, label_encoder)
    print("Predictions:", decoded_predictions)
    
    '''

In [70]:
# Local smoke run of the training script: reads train.csv/test.csv from the working
# directory and writes the model artifacts there too. The final argument carries no
# trailing backslash, so the command does not continue onto the following line.
! python distilbert_script.py --epochs 5 \
                              --learning_rate 5e-5 \
                              --train_batch_size 8 \
                              --eval_batch_size 8 \
                              --model-dir ./ \
                              --train ./ \
                              --test ./



[INFO] Extracting arguments
[INFO] Reading data
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: Average training loss = 0.2693900925297182
Epoch 2: Average training loss = 0.10753095309132511
Epoch 3: Average training loss = 0.08089174966788987
Epoch 4: Average training loss = 0.06017997576990752
Epoch 5: Average training loss = 0.047650689819773426
Test Accuracy: 0.4815
Model persisted at ./model.joblib


### Run the model in the SageMaker environment

In [32]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

# Get the execution role
role = get_execution_role()

# Define the Estimator
estimator = PyTorch(
    entry_point='distilbert_script.py',
    role=role,
    framework_version='1.12.0',  # specify the PyTorch version you want to use
    py_version='py38',
    instance_count=1,
    # NOTE: ml.m5.large is a CPU-only instance (the training log reports
    # "No GPUs detected"); choose a GPU instance type to train on GPU.
    instance_type="ml.m5.large",
    # Forwarded to the script as --epochs, --learning_rate, ... command-line arguments.
    hyperparameters={
        'epochs': 5,
        'learning_rate': 5e-5,
        'train_batch_size': 8,
        'eval_batch_size': 8
    },
    dependencies=['requirements.txt']
    
)


# Launch the training job
# The 'train' / 'test' channel names are what the script reads back through the
# SM_CHANNEL_TRAIN / SM_CHANNEL_TEST environment variables.
estimator.fit({'train': trainpath, 'test': testpath})


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-471112636257
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-06-23-17-01-25-755


2024-06-23 17:01:26 Starting - Starting the training job...
2024-06-23 17:01:45 Starting - Preparing the instances for training...
2024-06-23 17:02:16 Downloading - Downloading input data......
2024-06-23 17:03:01 Downloading - Downloading the training image...
2024-06-23 17:03:52 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-06-23 17:03:54,883 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-06-23 17:03:54,886 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-23 17:03:54,894 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-06-23 17:03:54,896 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-06-23 17:03:55,068 sagemaker-training-

[34mEpoch 1: Average training loss = 0.27019670475455165[0m
[34mEpoch 2: Average training loss = 0.107440489299089[0m
[34mEpoch 3: Average training loss = 0.07753458365942668[0m
[34mEpoch 4: Average training loss = 0.05578676280249091[0m
[34mEpoch 5: Average training loss = 0.04634648577753201[0m

2024-06-23 17:19:10 Uploading - Uploading generated training model[34mTest Accuracy: 0.4815[0m
[34mModel persisted at /opt/ml/model/model.joblib[0m
[34mPrediction: [('Enquiry', 'DEPOSIT')][0m
[34m2024-06-23 17:19:04,999 sagemaker-training-toolkit INFO     Waiting for the process to finish and give a return code.[0m
[34m2024-06-23 17:19:05,000 sagemaker-training-toolkit INFO     Done waiting for a return code. Received 0 from exiting process.[0m
[34m2024-06-23 17:19:05,000 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m

2024-06-23 17:19:38 Completed - Training job completed
Training seconds: 1042
Billable seconds: 1042


In [33]:
# S3 location of the model archive produced by the training job.
model_data_path = estimator.model_data
print(f"Model data saved at: {model_data_path}")

Model data saved at: s3://sagemaker-us-east-1-471112636257/pytorch-training-2024-06-23-17-01-25-755/output/model.tar.gz


In [45]:
from sagemaker.pytorch import PyTorchModel
from sagemaker import get_execution_role

# `model_data_path` is the S3 location of the trained model artifacts.
# Serve with the SAME framework / Python versions the estimator trained with
# (1.12.0 / py38). model.joblib is a pickle of the live model object, and pickles
# are not reliably loadable under a different PyTorch / Python version.
# NOTE(review): the inference container also needs the script's third-party
# imports (transformers, scikit-learn, joblib); confirm they get installed there.
model = PyTorchModel(model_data=model_data_path,
                     role=get_execution_role(),
                     entry_point='distilbert_script.py',
                     framework_version='1.12.0',
                     py_version='py38')




In [46]:
# Deploy the model to an endpoint
# Deploy the model to an endpoint
predictor = model.deploy(initial_instance_count=1,
                         instance_type='ml.m5.xlarge',
                         endpoint_name='test1',
                         serializer=sagemaker.serializers.JSONSerializer(),
                         deserializer=sagemaker.deserializers.JSONDeserializer(),
                         # NOTE(review): no timeout argument is passed in this call. The deploy
                         # log also shows a generated model name ("pytorch-inference-...") rather
                         # than 'distilbert' -- confirm whether model_name has any effect here.
                         model_name='distilbert',
                         wait=True)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-471112636257/pytorch-training-2024-06-23-17-01-25-755/output/model.tar.gz), script artifact (None), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-471112636257/pytorch-inference-2024-06-23-18-00-03-634/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-06-23-18-00-29-961
INFO:sagemaker:Creating endpoint-config with name test1
INFO:sagemaker:Creating endpoint with name test1


------!

In [47]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Attach a client to the 'test1' endpoint by name, exchanging JSON in both directions.
predictor = Predictor(
    endpoint_name='test1',
    sagemaker_session=sagemaker.Session(),
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)


In [49]:
# Describe the endpoint to get its status
# Describe the endpoint to get its status.
# Uses the SageMaker boto3 client (`sm_boto3`) and the predictor created earlier;
# `sagemaker_client` and `endpoint_name` are not defined anywhere in this notebook.
endpoint_description = sm_boto3.describe_endpoint(EndpointName=predictor.endpoint_name)

print("Endpoint Status:", endpoint_description['EndpointStatus'])


Endpoint Status: InService


In [None]:
import json

input_data = ["Example scenario description"]

# Pass the Python list straight to the predictor: its JSONSerializer already
# encodes the request body. Running json.dumps first would get the text encoded a
# second time, so the endpoint would receive one JSON *string* instead of a list.
response = predictor.predict(input_data)

print("Prediction:", response)


In [None]:
import json

input_data = ["Example scenario description"]

try:
    # Pass the list itself; the predictor's JSONSerializer performs the JSON
    # encoding, so pre-encoding with json.dumps would double-encode the payload.
    response = predictor.predict(input_data)
    print("Prediction:", response)

except Exception as e:
    # Demo cell: show the endpoint error inline rather than a full traceback.
    print("Prediction error:", e)



In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Attach to the endpoint deployed earlier in this notebook. It was created with
# endpoint_name='test1'; pointing at 'test' would target an endpoint this notebook
# never creates, and the cleanup cell would then try to delete the wrong endpoint.
predictor = Predictor(endpoint_name='test1',
                      sagemaker_session=sagemaker.Session(),
                      # JSON in, JSON out
                      serializer=JSONSerializer(),
                      deserializer=JSONDeserializer())

# Input data for prediction
input_data = ["Example scenario description"]

try:
    # Make prediction
    response = predictor.predict(input_data)

    print("Prediction:", response)

except Exception as e:
    # Demo cell: show the endpoint error inline rather than a full traceback.
    print("Prediction error:", e)


In [None]:
# Tear down the billable endpoint. Predictor.delete_endpoint() also removes the
# endpoint configuration by default; leaving the 'test1' configuration behind
# would make a later deploy under the same endpoint name fail. It also replaces the
# deprecated `predictor.endpoint` attribute.
predictor.delete_endpoint()

# Empty the project bucket (the bucket itself is kept).
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()