In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torchmetrics



In [3]:
#import libraries
import numpy as np
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [4]:
#Load the device GPU
gpu = torch.device("cuda:0")
print(torch.cuda.get_device_name(torch.cuda.current_device()))

Tesla T4


In [5]:
#load the dataset
import codecs
dataset_t = ""
with codecs.open('/content/drive/Othercomputers/Il mio computer/BarberoGenerator/dataset/dataset_barbero_sarzana.txt', encoding='utf-8') as f:
    for character in f:
        dataset_t = dataset_t + character

In [6]:
# transform dataset from numeric to one-hot
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
whole_dataset_list_of_chars = list(dataset_t)
dataset_int = pd.DataFrame(whole_dataset_list_of_chars, columns=['data'])

# create a dictionary
label_enc = LabelEncoder()
dictionary = dataset_int.drop_duplicates(subset=['data'])
dictionary['int_encoding'] = label_enc.fit_transform(dictionary['data'])
dataset_int['int_encoding'] = label_enc.fit_transform(dataset_int['data'])


# one hot encode
one_hot_enc = OneHotEncoder()
# lstm uses float32 insted of float64.
one_hot_encoded_dataset = one_hot_enc.fit_transform(dataset_int[['int_encoding']]).toarray().astype(np.float32)
one_hot_encoded_dataset = torch.from_numpy(one_hot_encoded_dataset)
one_hot_encoded_dataset = one_hot_encoded_dataset.to(gpu)
print(one_hot_encoded_dataset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dictionary['int_encoding'] = label_enc.fit_transform(dictionary['data'])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [7]:
print(dictionary)

        data  int_encoding
0          b            39
1          e            42
2          n            51
4                        1
6          u            58
...      ...           ...
1099082    -             4
1099618    U            33
1100751    È            64
1101278    N            26
1104174    V            34

[72 rows x 2 columns]


In [8]:
# check that the device set is the GPU
assert one_hot_encoded_dataset.get_device() == 0

In [9]:

# ADD LABELS: turn dataset from a string "s1,s2,...,si,si+1" to ((s1,...,sk),(s2,...,sk+1)),...
sequence_length = 100
dataset = []
for i in range(0,len(one_hot_encoded_dataset)-sequence_length,int(sequence_length/2)):
    x = []
    y = []
    for j in range(sequence_length):
        x.append(one_hot_encoded_dataset[i+j])
        y.append(one_hot_encoded_dataset[i+j+1])
    dataset.append((x, y))
print(len(dataset))
print(len(dataset[0]))

22736
2


In [10]:
# We use the whole dataset as training set, bc we will use k-fold cross-validation to validate and test the models
batch_size_train = 64
train_dataloader_kfold = DataLoader(dataset, batch_size=batch_size_train, shuffle=False)
iterator = iter(train_dataloader_kfold)
data, label = next(iterator)
a = torch.stack(data)
print(torch.permute(a, (1,0,2)).size())

torch.Size([64, 100, 72])


In [11]:
def print_number_of_parameters_of_model(model):
  n_parameters = 0
  for parameter in model.parameters():
      n_parameters = n_parameters + sum(list(parameter.size()))
  print("The model has %d parameters" % n_parameters)

## Simple Model: 32 hidden units, 1 LSTM layer
This model has 32 as the shape of the hidden units of LSTM. The total number of parameters is 792.\
The loss and Optimizer are respectively CrossEntropy and Adam.

In [12]:
# model_32: single LSTM with an hidden layer size of 32
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device
        # weights and biases are initialized using the uniform distribution:
        #      weights.uniform_(-sqrt(output_size), +sqrt(output_size))
        self.lstm_layer = torch.nn.LSTMCell(input_size=input_size, hidden_size=hidden_size)
        self.dense = torch.nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        outputs = []
        # construct the initial hidden and cell states
        h_t = torch.zeros(x.size()[0],self.hidden_size).to(self.device)
        c_t = torch.zeros(x.size()[0],self.hidden_size).to(self.device)
        # the split function collapses the batch dimension, so now, instead of having (batch_size,sequence_len,input_size) inputs,
        # you have a number equal to batch_size of (batch_size,input_size) inputs; you will feed these inputs one after another to the lstm, using
        # the history h_t and short term memory c_t, along with the i-th input of type (batch_size,input_size)
        for time_step in torch.split(x, split_size_or_sections=1, dim=1):
            # this is needed to remove a dimension that wasn't removed by split
            time_step = torch.squeeze(time_step)
            h_t, c_t = self.lstm_layer(time_step, (h_t, c_t))
            output = self.dense(h_t)
            outputs.append(output)
        # len(outputs) = batch_size. We convert outputs to a tensor by concatenating all tensors inside it
        #outputs = torch.stack(outputs)
        #print(outputs.size())
        return outputs
input_size = 72
output_size = input_size
model_32 = NeuralNetwork(input_size, 32, output_size,gpu)
model_32.to(gpu)
print_number_of_parameters_of_model(model_32)

The model has 792 parameters


In [13]:
#declare loss, optimizer
loss_32 = torch.nn.CrossEntropyLoss()
optimizer_32 = torch.optim.Adam(model_32.parameters(), lr=0.001)

## 1024 hidden units, 1 LSTM
This model has 1024 as the shape of the hidden units of LSTM. The total number of parameters is 18648.
The loss and Optimizer are respectively CrossEntropy and Adam.

In [14]:
# model_1024: single LSTM with an hidden layer size of 1024
input_size = 72
output_size = input_size
model_1024 = NeuralNetwork(input_size, 1024, output_size, gpu)
model_1024.to(gpu)
print_number_of_parameters_of_model(model_1024)

The model has 18648 parameters


In [15]:
#declare loss, optimizer
loss_1024 = torch.nn.CrossEntropyLoss()
optimizer_1024 = torch.optim.Adam(model_1024.parameters(), lr=0.001)

## K-Fold Cross-Validation vs Train and Test
How can we evaluate the differences between two different models?
* We could train and test each of them by splitting the dataset 80/20, and confront metrics. But the split could be an unfair one.
* We can use K-Fold <strong>Cross Validation</strong>

<strong>Cross Validation</strong>: We subdivide the dataset into k GROUPS.
* one group is selected to be the test set
* the other groups constitutes the training set
* the model is trained and tested, and metrics are recorded

In the end we trained and tested k times the same model with different training and test sets, hence we are capable to say <strong> how good this model is, if it was to be trained with this dataset </strong>

After we decided which is the best model, we will <strong> use the whole dataset to train it </strong>

In [31]:
import time

#train
def train_cycle(EPOCHS, train_dataloader, model, loss, optimizer):
    start_time = time.time()
    losses = []
    for i in range(EPOCHS):
        iterator = iter(train_dataloader)
        print("Epoch: " + str(i))
        for data, label in iterator:
            # pytorch accumulate gradients at every batch, doing this you reset these gradients
            optimizer.zero_grad()
            # we convert a list of tensors (given as such by the dataloader) to a tensor
            data = torch.stack(data)
            # convert the tensor given in shape (sequence_len,batch_size,num_classes) to (batch_size,sequence_len,num_classes)
            data = torch.permute(data, (1,0,2))
            #forward step
            outputs = model(data)
            loss_acc = 0
            total_loss = 0
            for i in range(len(outputs)):
                total_loss = total_loss + loss(outputs[i],label[i])
            #backward step: compute gradients (apply automatic differentiation).
            total_loss.backward()
            loss_acc = total_loss / len(outputs)
            losses.append(loss_acc)
            #update the parameters using the already computed gradients
            optimizer.step()
    end_time = time.time()
    return losses, end_time - start_time

In [49]:
#test
def test_cycle(test_dataloader, model, batch_size):
  start_time = time.time()
  iterator = iter(test_dataloader)
  softmax = torch.nn.Softmax(dim=1)
  accuracies = []
  for data, label in iterator:
      # we convert a list of tensors (given as such by the dataloader) to a tensor
      data = torch.stack(data)
      # convert the tensor given in shape (sequence_len,batch_size,num_classes) to (batch_size,sequence_len,num_classes)
      data = torch.permute(data, (1,0,2))
      #forward step
      prediction = model(data)
      prediction = torch.stack(prediction)
      label = torch.stack(label)
      #compute accuracy
      pred_arg = torch.argmax(softmax(prediction),dim=2)
      label_arg = torch.argmax(label,dim=2)
      acc_list = (torch.sum(torch.eq(pred_arg, label_arg),1)/batch_size).tolist()
      mean_acc = sum(acc_list)/len(acc_list)
      accuracies.append(mean_acc)
  end_time = time.time()
  return sum(accuracies) / len(accuracies), end_time - start_time

In [33]:
from numpy import array
from sklearn.model_selection import KFold
import time

# prepare cross validation
def KFOLD(model, loss, optimizer, dataset, k):
    batch_size_train = 64
    batch_size_test = 128
    kfold = KFold(k)
    EPOCHS = 2
    i = 0
    mean_accuracy = []
    for train, test in kfold.split(dataset):
        # turn train and test folds to tensors that works with gpu
        list_train = [dataset[i] for i in train.tolist()]
        list_test = [dataset[i] for i in test.tolist()]
        #tensor_train = torch.from_numpy(dataset[train])
        #tensor_train = tensor_train.to(gpu)
        #tensor_test = torch.from_numpy(dataset[test])
        #tensor_test = tensor_test.to(gpu)
        # create dataloaders for train and test folds
        start_time = time.time()
        train_dataloader = DataLoader(list_train, batch_size=batch_size_train, shuffle=False)
        test_dataloader = DataLoader(list_test, batch_size=batch_size_test, shuffle=True)
        # perform test and training cycle on a fold, and record the time spent on computing
        losses, time_train = train_cycle(EPOCHS, train_dataloader, model, loss, optimizer)
        accuracy, time_test = test_cycle(test_dataloader, model, batch_size_test)
        mean_accuracy.append(accuracy)
        print("Split %d accuracy is: %s. %s seconds required for training, %s seconds required for testing." % (i,str(accuracy),str(time_train),str(time_test)))
        i = i + 1
    return sum(mean_accuracy) / len(mean_accuracy)

In [34]:
result_32 = KFOLD(model_32, loss_32, optimizer_32, dataset, k=3)

0.00013113021850585938

Epoch: 0
Epoch: 1
Split 0 accuracy is: 0.2462044270833333. 32.370535135269165 seconds required for training, 3.5931806564331055 seconds required for testing.
0.0003056526184082031

Epoch: 0
Epoch: 1
Split 1 accuracy is: 0.24925520833333334. 32.59132218360901 seconds required for training, 1.9677457809448242 seconds required for testing.
0.0002052783966064453

Epoch: 0
Epoch: 1
Split 2 accuracy is: 0.23403515625000007. 32.94490122795105 seconds required for training, 2.386234760284424 seconds required for testing.


In [35]:
result_1024 = KFOLD(model_1024, loss_1024, optimizer_1024, dataset, k=3)
print(result_1024)

0.00011372566223144531

Epoch: 0
Epoch: 1
Split 0 accuracy is: 0.20838151041666655. 44.18349885940552 seconds required for training, 3.6154592037200928 seconds required for testing.
0.0002970695495605469

Epoch: 0
Epoch: 1
Split 1 accuracy is: 0.23048307291666667. 45.57931423187256 seconds required for training, 3.489776372909546 seconds required for testing.
0.0002658367156982422

Epoch: 0
Epoch: 1
Split 2 accuracy is: 0.2672591145833334. 44.99214291572571 seconds required for training, 3.778024911880493 seconds required for testing.
0.2353745659722222


In [36]:
# generate random character from dictionary
dic_list = dictionary['data'].tolist()


In [44]:
# train the model
import time

start = time.time()

EPOCHS = 5
model = NeuralNetwork(input_size, 1024, output_size, gpu)
model.to(gpu)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_threshold = int(len(dataset)*0.8)
train_dataloader = DataLoader(dataset[:train_threshold], batch_size=64, shuffle=False)
losses = train_cycle(EPOCHS, train_dataloader, model, loss, optimizer)


end = time.time()
print("Time passed: " + str(end - start))
print(losses)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Time passed: 134.7096288204193
([tensor(4.2716, device='cuda:0', grad_fn=<DivBackward0>), tensor(4.2261, device='cuda:0', grad_fn=<DivBackward0>), tensor(4.1463, device='cuda:0', grad_fn=<DivBackward0>), tensor(3.3238, device='cuda:0', grad_fn=<DivBackward0>), tensor(3.1072, device='cuda:0', grad_fn=<DivBackward0>), tensor(3.0995, device='cuda:0', grad_fn=<DivBackward0>), tensor(3.0584, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9864, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9454, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9428, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9441, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9441, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9304, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9228, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9385, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9293, device='cuda:0', grad_fn=<DivBackward0>), tensor(2.9

In [50]:
# Compute the accuracy of the model using the test set
test_dataloader = DataLoader(dataset[train_threshold:], batch_size=64, shuffle=True)
accuracies = test_cycle(test_dataloader, model, batch_size=64)


print("Mean accuracy is: " + str(sum(accuracies)/len(accuracies)))

Mean accuracy is: 1.336129805246989


In [46]:
print(accuracies)

(0.2543489583333334, 3.139968156814575)


In [None]:
# this function encoded a string as an input fitting for the NN of size (SxN)
# * N = number of classes
# * S = length of the string (in terms of characters)
def generate_phrase_to_NN_input(num_classes, dictionary, device):
    # input: a string
    # output: a Sx72 one-hot tensor
    def phrase_to_NN_input(phrase):
        result = []
        for char in phrase:
            word_converted = torch.zeros(num_classes)
            word_index = dictionary.loc[dictionary['data'] == char]['int_encoding'].item()
            word_converted[word_index] = 1
            result.append(word_converted)
        result = torch.stack(result)
        result = result.to(device)
        return result
    return phrase_to_NN_input
phrase_to_NN_input = generate_phrase_to_NN_input(72,dictionary, gpu)

In [None]:
# this function decodes a string expressed as a tensor of shape SxN with values x\in{0,1}, to a string of length S
def generate_NN_output_to_phrase(num_classes, dictionary, device):
    # input: a string
    # output: a Sx72 one-hot tensor
    def NN_output_to_phrase(nn_output):
        result = ""
        for tensor_hot_enc in nn_output:
            idx_word = torch.argmax(tensor_hot_enc).item()
            char_decoded = dictionary.loc[dictionary['int_encoding'] == idx_word]['data'].item()
            result = result + char_decoded
        return result
    return NN_output_to_phrase
NN_output_to_phrase = generate_NN_output_to_phrase(72,dictionary, gpu)

In [None]:
wordminusminus_encoded = phrase_to_NN_input("--")
assert wordminusminus_encoded[0][4].item() == 1.0 and wordminusminus_encoded[1][4].item() == 1.0

In [None]:
wordminusminus_decoded = NN_output_to_phrase(wordminusminus_encoded)
assert wordminusminus_decoded == "--"

In [None]:
phrase = "eqwe"
phrase_encoded = phrase_to_NN_input(phrase)
phrase_encoded = phrase_encoded[:,None,:]
prediction = model_32(phrase_encoded)
prediction_decoded = NN_output_to_phrase(prediction[0])
print(prediction_decoded)

ìììì


## Text Generation Algorithm
We want to generate text starting from a single character input. Given a LSTM model M, we give it in input $x_0$ and get as output the tuple ($y_0$,$h_0$), we then proceed by feeding $x_1 = y_0$ to M as input: $(y_1,h_1) = M(x_1)$ and then keep going for k times.

In [None]:
iterator = iter(test_dataloader)
data, label = next(iterator)
prediction = torch.stack(model(data))
data_arg = torch.argmax(softmax(prediction).reshape(100,72),dim=1)
label_arg = torch.argmax(torch.stack(label).reshape(100,72),dim=1)
sum(torch.eq(data_arg, label_arg).tolist())/100

In [None]:
37.65237474441528

In [None]:
a = torch.tensor([[1,2],[2,3]])
print(a)

tensor([[1, 2],
        [2, 3]])


In [None]:
import torch
a = torch.tensor([2.0], requires_grad=True)
b = torch.tensor([3.0], requires_grad=True)

c = 0
for i in range(3):
    c = c + a
    print(c)

c = c + torch.log(b)

In [None]:
c.backward()

In [None]:
print(a.grad)
print(b.grad)

tensor([3.])
tensor([0.3333])


In [None]:
import numpy as np
import tensorflow as tf


a = tf.Variable(2.0)
b = tf.Variable(3.0)

with tf.GradientTape() as tape:
    c = 0
    for i in range(3):
        c = c + a
        print(c)

    c = c + tf.math.log(b)

tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)
tf.Tensor(6.0, shape=(), dtype=float32)


In [None]:
dy_da = tape.gradient(c, [a,b])
dy_da

[<tf.Tensor: shape=(), dtype=float32, numpy=3.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>]

In [None]:
print(c)

tf.Tensor(7.0986123, shape=(), dtype=float32)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import pandas as pd

data = ('a','b','c','c','a')
dictionary = pd.DataFrame(data, columns=['data'])

dictionary['int_encoding'] = labelencoder.fit_transform(dictionary['data'])
print(dictionary)

  data  int_encoding
0    a             0
1    b             1
2    c             2
3    c             2
4    a             0


In [None]:
one_hot_enc = OneHotEncoder()
one_hot = one_hot_enc.fit_transform(dictionary[['int_encoding']]).toarray()
print(dictionary[['int_encoding']])
print(one_hot)

   int_encoding
0             0
1             1
2             2
3             2
4             0
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [None]:
import torch
mat = torch.empty(5,100).to(gpu)
for i in range(0, 1000000):
    mat = torch.mm(mat, torch.transpose(mat,0,1))

In [None]:
mat = torch.empty(2,3)
a = torch.transpose(mat,0,1)
print(mat)
print(a)
print(torch.mm(mat,a))

tensor([[6.6648e-10, 1.1040e-05, 2.6081e+20],
        [2.0975e-07, 2.1876e-04, 4.3921e-05]])
tensor([[6.6648e-10, 2.0975e-07],
        [1.1040e-05, 2.1876e-04],
        [2.6081e+20, 4.3921e-05]])
tensor([[       inf, 1.1455e+16],
        [1.1455e+16, 4.9784e-08]])


In [None]:
a = torch.arange(10).reshape(5,2)
a
torch.split(a,1)
a.size(1)

2

In [None]:
a = torch.stack([torch.randn((1,2)),torch.randn((1,2))])
print(a.size())
a

torch.Size([2, 1, 2])


tensor([[[-0.1867, -0.5635]],

        [[-0.5525, -0.4179]]])

In [None]:
a = next(iter(train_dataloader))
b = torch.stack(a[0])
print(b.size(0))
print(b.size(1))
print(b.size(2))

100
128
72


IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)

In [None]:
loss = torch.nn.CrossEntropyLoss()
# pred=(a,c),(a,a), label=(a,b),(c,c)
pred = [torch.FloatTensor([10,20]),torch.FloatTensor([20,10])
softmax = torch.nn.Softmax(dim=0)
label = torch.FloatTensor([[softmax(pred)[0][0],softmax(pred)[0][1]],[softmax(pred)[1][0],softmax(pred)[1][1]]])
print(pred.size())
loss(pred,label)

AttributeError: 'list' object has no attribute 'softmax'

In [None]:
x = torch.ones(1, requires_grad=True)
y = x**2
z = x**3
w = x**3
z.backward()
w.backward()
x.grad

tensor([6.])

In [None]:
from torch import tensor
target = tensor([0, 1, 2, 3])
preds = tensor([0, 2, 1, 3])
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=4)
accuracy(preds, target)

AssertionError: 

In [None]:
y = torch.tensor([
     [
       [1, 0, 0],
       [1, 0, 0]
     ],
     [
       [0, 1, 0],
       [0, 0, 1]
     ],
     [
       [0, 0, 1],
       [0, 0, 1]
     ]
   ])

In [None]:
y.size()

torch.Size([3, 2, 3])

In [None]:
torch.argmax(y.float(),dim=2)

tensor([[0, 0],
        [1, 2],
        [2, 2]])

In [None]:
print(dictionary)
print(len(dictionary))
a = dictionary.loc[dictionary['data'] == 'b']['int_encoding']
print(a[0])

        data  int_encoding
0          b            39
1          e            42
2          n            51
4                        1
6          u            58
...      ...           ...
1099082    -             4
1099618    U            33
1100751    È            64
1101278    N            26
1104174    V            34

[72 rows x 2 columns]
72
39


##