# Prepare data

In [1]:
# Read and randomize data
import numpy as np
import torch
import sagemaker
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Load the comma-separated training file into a 2-D float array.
data = np.loadtxt("train.txt", delimiter=',')

# Shuffle the row indices with a fixed seed so the train/validation split is
# the same after every "Restart Kernel -> Run All".
SPLIT_SEED = 0
VALIDATION_FRACTION = 0.2
perm_idx = np.random.default_rng(SPLIT_SEED).permutation(data.shape[0])
vali_num = int(data.shape[0] * VALIDATION_FRACTION)
vali_idx = perm_idx[:vali_num]
train_idx = perm_idx[vali_num:]

# Split into training and validation data
train_data = data[train_idx]
vali_data = data[vali_idx]

# Separate features and labels: column 0 is the label, the rest are features.
# Labels are cast to int64 explicitly because plain `int` can be 32-bit on some
# platforms, while nn.CrossEntropyLoss expects 64-bit class-index targets.
train_features = train_data[:, 1:].astype(np.float32)
train_labels = train_data[:, 0].astype(np.int64)
vali_features = vali_data[:, 1:].astype(np.float32)
vali_labels = vali_data[:, 0].astype(np.int64)

# Create NN model

In [2]:
# define a Dataset class
class HandNumDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    ``features`` is indexed as ``features[idx, :]`` and ``labels`` as
    ``labels[idx]``, so ``features`` must support two-axis indexing.
    """

    def __init__(self, features, labels):
        # Keep references to the caller's containers; nothing is copied.
        self.labels = labels
        self.features = features

    def __len__(self):
        """Return the number of samples, taken from the length of ``labels``."""
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(feature_row, label)`` pair stored at position ``idx``."""
        feature_row = self.features[idx, :]
        label = self.labels[idx]
        return feature_row, label


# Wrap the NumPy splits in Dataset objects so DataLoader can batch them.
# Note: this rebinds `train_data` / `vali_data`, which previously held the raw
# NumPy arrays produced by the split.
train_data = HandNumDataset(train_features, train_labels)
vali_data = HandNumDataset(vali_features, vali_labels)
batch_size = 64

# Create data loaders
# The training loader reshuffles at every epoch so SGD does not see the same
# batch composition and order each time; the validation loader keeps a fixed
# order.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
vali_dataloader = DataLoader(vali_data, batch_size=batch_size)

# Use a CUDA GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")


# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28*28 inputs -> 512 -> 1024 -> 512 -> 10 logits.

    Every hidden ``nn.Linear`` is followed by ``nn.ReLU``; the final layer
    returns raw logits with no activation applied.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        # Build the Linear/ReLU pairs for the hidden layers in order, then
        # append the output layer, so the Sequential indices stay 0..6.
        widths = (28 * 28, 512, 1024, 512, 10)
        layers = []
        for fan_in, fan_out in zip(widths[:-2], widths[1:-1]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-2], widths[-1]))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` from dim 1 onward and return the 10 class logits."""
        return self.linear_relu_stack(self.flatten(x))


# Build the network, then move its parameters to the selected device.
model = NeuralNetwork()
model = model.to(device)

Using cpu device


# Train model
## Chosen parameters:
Cross-entropy loss, ReLU activations, SGD with a learning rate of 0.01 (no weight decay is set in the optimizer), layer sizes 784 (28×28) -> 512 -> 1024 -> 512 -> 10, 7 epochs.

In [3]:
# Loss function: cross-entropy computed on the raw logits the model returns.
lossFn = nn.CrossEntropyLoss()
# Optimizer: plain SGD over all model parameters with learning rate 0.01.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-2)


# train data set
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to the notebook-level ``device``, the loss is
    back-propagated, the optimizer takes a step, and the gradients are then
    cleared. The batch loss and a running sample count are printed on every
    100th batch (starting with the first).
    """
    size = len(dataloader.dataset)
    model.train()

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass, parameter update, then reset gradients for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Only report progress on batches 0, 100, 200, ...
        if step % 100 != 0:
            continue
        loss = batch_loss.item()
        current = (step + 1) * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and average loss.

    Runs in eval mode with gradients disabled. The reported loss is the mean of
    the per-batch losses (divided by the number of batches); accuracy is the
    number of correct arg-max predictions divided by the dataset size.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()

    loss_total = 0
    hit_total = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            loss_total += loss_fn(logits, targets).item()
            # Predicted class = index of the largest logit along dim 1.
            matches = logits.argmax(1) == targets
            hit_total += matches.type(torch.float).sum().item()

    test_loss = loss_total / num_batches
    correct = hit_total / size
    report = (f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, "
              f"Avg loss: {test_loss:>8f} \n")
    print(report)


# Alternate one training pass and one validation pass for each epoch.
epochs = 7
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    # Optimise on the training split ...
    train(dataloader=train_dataloader, model=model,
          loss_fn=lossFn, optimizer=optimizer)
    # ... then report accuracy and loss on the validation split.
    test(dataloader=vali_dataloader, model=model, loss_fn=lossFn)
print("Done!")

Epoch 1
-------------------------------
loss: 5.612870  [   64/48000]
loss: 0.432412  [ 6464/48000]
loss: 0.278971  [12864/48000]
loss: 0.117172  [19264/48000]
loss: 0.181404  [25664/48000]
loss: 0.213302  [32064/48000]
loss: 0.073927  [38464/48000]
loss: 0.205687  [44864/48000]
Test Error: 
 Accuracy: 95.5%, Avg loss: 0.141408 

Epoch 2
-------------------------------
loss: 0.194915  [   64/48000]
loss: 0.152005  [ 6464/48000]
loss: 0.160896  [12864/48000]
loss: 0.049595  [19264/48000]
loss: 0.050290  [25664/48000]
loss: 0.126600  [32064/48000]
loss: 0.033021  [38464/48000]
loss: 0.149567  [44864/48000]
Test Error: 
 Accuracy: 96.5%, Avg loss: 0.112933 

Epoch 3
-------------------------------
loss: 0.067337  [   64/48000]
loss: 0.088288  [ 6464/48000]
loss: 0.062640  [12864/48000]
loss: 0.006879  [19264/48000]
loss: 0.013696  [25664/48000]
loss: 0.093956  [32064/48000]
loss: 0.012176  [38464/48000]
loss: 0.065042  [44864/48000]
Test Error: 
 Accuracy: 96.9%, Avg loss: 0.104731 

Epoc

# Optimize Parameters

In [4]:
# # Model 1
# class NeuralNetwork1(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork1, self).__init__()
#         self.flatten = nn.Flatten()
#         self.linear_relu_stack = nn.Sequential(
#             nn.Linear(28*28, 512),
#             nn.ReLU(),
#             nn.Linear(512, 512),
#             nn.ReLU(),
#             nn.Linear(512, 10)
#         )

#     def forward(self, x):
#         x = self.flatten(x)
#         logits = self.linear_relu_stack(x)
#         return logits


# # Model 2
# class NeuralNetwork2(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork2, self).__init__()
#         self.flatten = nn.Flatten()
#         self.linear_relu_stack = nn.Sequential(
#             nn.Linear(28*28, 512),
#             nn.ReLU(),
#             nn.Linear(512, 768),
#             nn.ReLU(),
#             nn.Linear(768, 512),
#             nn.ReLU(),
#             nn.Linear(512, 10)
#         )

#     def forward(self, x):
#         x = self.flatten(x)
#         logits = self.linear_relu_stack(x)
#         return logits


# # Model 3
# class NeuralNetwork3(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork3, self).__init__()
#         self.flatten = nn.Flatten()
#         self.linear_relu_stack = nn.Sequential(
#             nn.Linear(28*28, 512),
#             nn.ReLU(),
#             nn.Linear(512, 1024),
#             nn.ReLU(),
#             nn.Linear(1024, 512),
#             nn.ReLU(),
#             nn.Linear(512, 10)
#         )

#     def forward(self, x):
#         x = self.flatten(x)
#         logits = self.linear_relu_stack(x)
#         return logits


# # Model 4
# class NeuralNetwork4(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork4, self).__init__()
#         self.flatten = nn.Flatten()
#         self.linear_relu_stack = nn.Sequential(
#             nn.Linear(28*28, 1024),
#             nn.ReLU(),
#             nn.Linear(1024, 1024),
#             nn.ReLU(),
#             nn.Linear(1024, 512),
#             nn.ReLU(),
#             nn.Linear(512, 10)
#         )

#     def forward(self, x):
#         x = self.flatten(x)
#         logits = self.linear_relu_stack(x)
#         return logits

# epoches = [5, 7, 10, 13]
# wDecay = [1e-1, 1e-2, 1e-3, 1e-4]

## Convert to ONNX model from pytorch

In [5]:
# test_features = np.loadtxt("test.txt", delimiter=',')
# print('array of testing feature matrix: shape ' + str(np.shape(test_features)))

# model.eval()
# x = torch.randn(batch_size, 1, 28, 28)
# torch_out = torch_model(x)




# # model_scripted = torch.jit.script(model)
# # model_scripted.save("model.pt")

# # model1 = torch.jit.load("model.pt")

# # raw_pred = model1(torch.tensor(test_features).to(device).float())
# # pred = np.argmax(raw_pred.to('cpu').detach().numpy(), axis=1)

# print(pred)

In [6]:
from sagemaker.pytorch import PyTorch


# IAM role ARN handed to the estimator (account-specific value).
role = "arn:aws:iam::981842671259:role/service-role/SageMaker-Piyush"

# S3 location of the training file. NOTE: not referenced below; fit() is given
# the enclosing "data" prefix instead.
data_path = "s3://holobolo-sagemaker-bucket/data/train.txt"

# Hyperparameters passed through to the entry-point script.
training_hyperparameters = {
    "epochs": 10,
    "batch-size": 64,
    "learning-rate": 1e-2,
}

pytorch_estimator = PyTorch(
    entry_point="guesser.py",
    instance_type="ml.c5.2xlarge",
    instance_count=1,
    role=role,
    framework_version="1.8.0",
    py_version="py3",
    hyperparameters=training_hyperparameters,
)

# Start the training job with the S3 prefix mounted as the "train" channel.
train_channels = {"train": "s3://holobolo-sagemaker-bucket/data"}
pytorch_estimator.fit(train_channels)

NOTEBOOK_METADATA_FILE detected but failed to get valid domain and user from it.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: pytorch-training-2023-10-25-18-51-18-148


2023-10-25 18:51:18 Starting - Starting the training job...
2023-10-25 18:51:35 Starting - Preparing the instances for training......
2023-10-25 18:52:29 Downloading - Downloading input data...
2023-10-25 18:53:05 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-10-25 18:53:18,306 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-10-25 18:53:18,308 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-10-25 18:53:18,317 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-10-25 18:53:18,319 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-10-25 18:53:18,484 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m


In [7]:
# Deploy the trained estimator to a single-instance endpoint and keep the
# returned predictor handle for inference calls.
predictor = pytorch_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-981842671259/pytorch-training-2023-10-25-18-51-18-148/output/model.tar.gz), script artifact (s3://sagemaker-us-east-1-981842671259/pytorch-training-2023-10-25-18-51-18-148/source/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-981842671259/pytorch-training-2023-10-25-18-54-31-412/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-training-2023-10-25-18-54-31-412
INFO:sagemaker:Creating endpoint-config with name pytorch-training-2023-10-25-18-54-31-412
INFO:sagemaker:Creating endpoint with name pytorch-training-2023-10-25-18-54-31-412


-------!

In [10]:
# Reload the training file and use its first row as a smoke-test input for the
# deployed endpoint.
data = np.loadtxt("train.txt", delimiter=',')
device = "cuda" if torch.cuda.is_available() else "cpu"

# Named `sample_row` rather than `test` so the `test()` evaluation function
# defined earlier in the notebook is not overwritten by an array (which would
# break re-running the training cell).
sample_row = data[0]
# Drop column 0 (the label) and wrap the row in a leading batch axis.
test_features = sample_row[1:]
test_features = np.array([test_features])

print(test_features)
response = predictor.predict(list(test_features.astype(float)))
# The predicted class is the index of the largest value in the response.
pred = np.argmax(response)
print(pred)

[[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   3.  18.
   18.  18. 126. 136. 175.  26. 166. 255. 247. 127.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.  30.  36.  94. 154. 170. 253.
  253. 253. 253. 253. 225. 172. 253. 242. 195.  64.   0.   0.   