Sources:
* https://www.andrew.cmu.edu/course/18-661/lectures/pytorch.pdf
* https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183
* https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
* https://coderzcolumn.com/tutorials/artificial-intelligence/how-to-use-glove-embeddings-with-pytorch#3
* https://huggingface.co/course/chapter1/4?fw=pt

# Prepare Environment

In [None]:
!nvidia-smi

In [None]:
!pip install --upgrade pip
!pip uninstall -y fastai
!pip install torch==2.0.0 transformers[torch]==4.27.3 datasets==2.10.1 torchtext livelossplot tqdm

In [3]:
import torch
torch.__version__

'2.0.0+cu117'

## TF-IDF & LogisticRegression

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import datasets

In [None]:
# Load the IMDB dataset through the Hugging Face `datasets` library. The cells below read its
# 'train' and 'test' splits and, within each, the 'text' and 'label' columns.
ds = datasets.load_dataset('imdb')
# A bare name on the last line of a notebook cell displays that object.
ds

We start by converting the text data into vectors that a neural network can be fitted on. One of the simplest and most popular methods is [Term Frequency-Inverse Document Frequency (TF-IDF)](https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183).

In [None]:
# Upper bound on the number of TF-IDF features; passed to the vectorizer as `max_features`.
VOCAB_SIZE = 16384

# The vectorizer is fitted on the training texts only; later cells reuse this same fitted
# object to transform both the train and the test texts.
tfidf_vectorizer = TfidfVectorizer(max_features=VOCAB_SIZE, stop_words="english")
# Left as the last expression of the cell so the notebook displays the fitted vectorizer.
tfidf_vectorizer.fit(ds['train']['text'])

In [7]:
# Features: TF-IDF encode every training review, densify, and store as float32.
train_features = tfidf_vectorizer.transform(ds['train']['text'])
train_X_tensor = torch.tensor(train_features.todense(), dtype=torch.float32)

# Targets: float labels laid out as a single column, one row per review.
train_labels = ds['train']['label']
train_Y_tensor = torch.tensor(train_labels, dtype=torch.float32).view(-1, 1)

In [None]:
train_X_tensor

In [None]:
train_Y_tensor

In [10]:
import numpy as np

# Shuffle train data.
# np.random.shuffle() shuffles its argument in place and returns None, and indexing a tensor
# with None only inserts a new leading axis instead of reordering rows.
# np.random.permutation(n) returns a shuffled copy of arange(n), which is what we need here.
shuffle_train_index = np.random.permutation(train_X_tensor.shape[0])
# The same index is applied to features and labels so the rows stay aligned.
train_X_tensor = train_X_tensor[shuffle_train_index]
train_Y_tensor = train_Y_tensor[shuffle_train_index]

In [None]:
# Features: encode the test reviews with the vectorizer that was fitted on the training texts.
test_features = tfidf_vectorizer.transform(ds['test']['text'])
test_X_tensor = torch.tensor(test_features.todense(), dtype=torch.float32)

# Targets: float labels laid out as a single column, one row per review.
test_labels = ds['test']['label']
test_Y_tensor = torch.tensor(test_labels, dtype=torch.float32).view(-1, 1)

# Show the shapes of all four tensors on one line.
print(*(t.shape for t in (train_X_tensor, train_Y_tensor, test_X_tensor, test_Y_tensor)))

In [12]:
class LogisticRegression(torch.nn.Module):
    """Binary logistic regression: a single linear layer followed by a sigmoid.

    Args:
        vocab_size: Number of input features per example.
    """

    def __init__(self, vocab_size):
        # init super
        super().__init__()
        # linear layer: vocab_size features -> one logit
        self.linear = torch.nn.Linear(vocab_size, 1)

    def forward(self, x):
        """Return probabilities in (0, 1); the last axis of `x` must have `vocab_size` entries."""
        # apply linear and sigmoid
        outputs = torch.sigmoid(self.linear(x))
        return outputs

In [13]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters of `model`.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [None]:
from tqdm.auto import tqdm

#init device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#init model: one input feature per TF-IDF vocabulary entry
model = LogisticRegression(VOCAB_SIZE).to(device)
#count parameters
print(f"Model size: {count_parameters(model)} parameters")
#init binary cross entropy (the model already outputs probabilities, so BCELoss rather than a logits loss)
criterion = torch.nn.BCELoss()
#init SGD optimizer (same learning rate as the later experiments in this notebook)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# The compiled wrapper shares its parameters with the original module, so the optimizer
# created above keeps updating the right tensors.
model = torch.compile(model)


In [None]:
# set num steps
num_steps = 1000

# Move the data to the model's device once instead of on every step.
train_X, train_Y = train_X_tensor.to(device), train_Y_tensor.to(device)
test_X, test_Y = test_X_tensor.to(device), test_Y_tensor.to(device)

# iterative over num steps and train model
# Every step uses the whole training set (full-batch gradient descent).
for step in range(num_steps):
    model.train()
    optimizer.zero_grad()
    loss = criterion(model(train_X), train_Y)
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        # evaluate model: no gradients are needed for the held-out loss
        model.eval()
        with torch.no_grad():
            test_loss = criterion(model(test_X), test_Y)

        print(f"step={step:4d}; train_loss={loss.cpu().detach().item():3.6f}; test_loss={test_loss.cpu().detach().item():3.6f}")

Let's rewrite our code to really use mini-batch SGD, with the help of [`torch.utils.data.DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader).

In [15]:
class IMDBTfidfDataset(torch.utils.data.Dataset):
    """Map-style dataset that converts one review at a time into TF-IDF / label tensors.

    Vectorizing lazily in `__getitem__` avoids materializing the dense feature matrix for
    the whole split up front.

    Args:
        dataset: Indexable collection whose items are mappings with 'text' and 'label' keys.
        tfidf_vectorizer: Fitted vectorizer whose `transform` accepts a list of documents and
            returns a sparse matrix.
    """

    def __init__(self, dataset, tfidf_vectorizer):
        super().__init__()
        self.dataset = dataset
        self.tfidf_vectorizer = tfidf_vectorizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return {'text': float32 tensor (1, n_features), 'label': float32 tensor (1,)}."""
        row = self.dataset[idx]
        # transform() works on a collection of documents, so the single review is wrapped in
        # a list; the resulting matrix therefore keeps a leading axis of size 1.
        features = self.tfidf_vectorizer.transform([row['text']]).toarray()

        return {
            'text': torch.tensor(features, dtype=torch.float32),
            # One-element vector so that a collated batch of labels has shape (batch, 1).
            'label': torch.tensor([row['label']], dtype=torch.float32),
        }

In [16]:
# NOTE(review): both datasets are built from the 'test' part of a 90/10 split, i.e. only 10% of
# each original split is used. Presumably this keeps the epochs short — confirm it is intended.
# No seed is passed to train_test_split, so the selected subsets are presumably not reproducible
# between runs.
train_dataset = IMDBTfidfDataset(
    ds['train'].train_test_split(test_size=0.1)['test'],
    tfidf_vectorizer,
  )
test_dataset = IMDBTfidfDataset(
    ds['test'].train_test_split(test_size=0.1)['test'],
    tfidf_vectorizer,
)

# Sanity check: fetch a single batch and print the shapes of the collated tensors.
for batch in torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True):
  print(batch['text'].shape, batch['label'].shape)
  break

torch.Size([32, 1, 16384]) torch.Size([32, 1])


## Go Deeper

In [None]:
class DFFNN(torch.nn.Module):
    """Deep feed-forward binary classifier: two ReLU hidden layers and a sigmoid output.

    Args:
        hidden_size: Width of both hidden layers.
        input_size: Number of input features per example. Defaults to the notebook-level
            VOCAB_SIZE (looked up when the model is constructed).
    """

    def __init__(self, hidden_size, input_size=None):
        super().__init__()
        # MAKE DEEEPER NN
        if input_size is None:
            input_size = VOCAB_SIZE
        self.net = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, 1),
            # Sigmoid output so the model can be trained with BCELoss.
            torch.nn.Sigmoid(),
        )

    def forward(self, x):
        """Return probabilities of shape (batch, 1) for a batched input `x`."""
        # APPLY IT
        # The data loader yields (batch, 1, features); flattening everything after the batch
        # axis makes the output (batch, 1), matching the label shape.
        return self.net(x.flatten(start_dim=1))


def fit_model_epoch(model, dataloader, optimizer, criterion) -> float:
    """Train `model` for one pass over `dataloader` and return the mean training loss.

    Args:
        model: Module to train; batches are moved to the device of its first parameter.
        dataloader: Iterable of batches shaped like {'text': tensor, 'label': tensor}.
        optimizer: Optimizer built over `model`'s parameters.
        criterion: Loss called as `criterion(predictions, labels)`; assumed to return the
            mean loss of the batch (the default reduction of torch.nn.BCELoss).

    Returns:
        Per-example mean loss over the epoch, or NaN when the dataloader yields no batches.
    """
    # TRAIN MODEL AND RETURN MEAN LOSS
    model_device = next(model.parameters()).device
    model.train()

    loss_sum = 0.0
    example_count = 0
    for batch in dataloader:
        texts = batch['text'].to(model_device)
        labels = batch['label'].to(model_device)

        optimizer.zero_grad()
        predictions = model(texts)
        # Inputs arrive as (batch, 1, features), so a model that does not flatten them returns
        # (batch, 1, 1); align the predictions with the (batch, 1) labels before the loss.
        loss = criterion(predictions.reshape(labels.shape), labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not skew the mean.
        batch_size = labels.shape[0]
        loss_sum += loss.item() * batch_size
        example_count += batch_size

    return loss_sum / example_count if example_count else float("nan")

def evaluate_model(model, dataloader, criterion) -> float:
    """Compute the mean loss of `model` over `dataloader` without updating any weights.

    The model is switched to evaluation mode and left in it; gradients are not tracked.

    Args:
        model: Module to evaluate; batches are moved to the device of its first parameter.
        dataloader: Iterable of batches shaped like {'text': tensor, 'label': tensor}.
        criterion: Loss called as `criterion(predictions, labels)`; assumed to return the
            mean loss of the batch (the default reduction of torch.nn.BCELoss).

    Returns:
        Per-example mean loss, or NaN when the dataloader yields no batches.
    """
    # EVALUATE AND RETURN MEAN LOSS
    model_device = next(model.parameters()).device
    # Evaluation mode disables train-only behaviour such as dropout.
    model.eval()

    loss_sum = 0.0
    example_count = 0
    with torch.no_grad():
        for batch in dataloader:
            texts = batch['text'].to(model_device)
            labels = batch['label'].to(model_device)

            predictions = model(texts)
            # Inputs arrive as (batch, 1, features), so a model that does not flatten them
            # returns (batch, 1, 1); align the predictions with the (batch, 1) labels.
            loss = criterion(predictions.reshape(labels.shape), labels)

            # Weight each batch by its size so a smaller final batch does not skew the mean.
            batch_size = labels.shape[0]
            loss_sum += loss.item() * batch_size
            example_count += batch_size

    return loss_sum / example_count if example_count else float("nan")


# Hidden width; 32 is the value the learning-rate annealing experiment in this notebook uses.
hidden_size = 32
model = DFFNN(hidden_size).to(device)
print(f"Model size: {count_parameters(model)} parameters")
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Only the training batches are shuffled; evaluation order does not matter for the mean loss.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

NUMBER_OF_EPOCHES = 10
for epoch in range(NUMBER_OF_EPOCHES):
    train_loss = fit_model_epoch(model, train_loader, optimizer, criterion)
    test_loss = evaluate_model(model, test_loader, criterion)

    print(epoch, train_loss, test_loss)

## Dropout

In [None]:
# https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html?highlight=dropout#torch.nn.Dropout

class DFFNNWithDropout(torch.nn.Module):
    """Deep feed-forward binary classifier with dropout after each hidden activation.

    Args:
        hidden_size: Width of both hidden layers.
        dropout_p: Probability of zeroing a hidden activation during training.
        input_size: Number of input features per example. Defaults to the notebook-level
            VOCAB_SIZE (looked up when the model is constructed).
    """

    def __init__(self, hidden_size, dropout_p, input_size=None):
        super().__init__()
        # MAKE NN WITH DROPOUT
        # THINK ABOUT THE PROPER PLACE FOR DROPOUT
        if input_size is None:
            input_size = VOCAB_SIZE
        self.net = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            # Dropout regularizes the hidden representations; it is deliberately not applied
            # to the output, where it would corrupt the predicted probability itself.
            torch.nn.Dropout(dropout_p),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_p),
            torch.nn.Linear(hidden_size, 1),
            # Sigmoid output so the model can be trained with BCELoss.
            torch.nn.Sigmoid(),
        )

    def forward(self, x):
        """Return probabilities of shape (batch, 1) for a batched input `x`."""
        # APPLY NET
        # The data loader yields (batch, 1, features); flattening everything after the batch
        # axis makes the output (batch, 1), matching the label shape.
        return self.net(x.flatten(start_dim=1))

# Hidden width and dropout probability; these are the values the learning-rate annealing
# experiment in this notebook uses.
hidden = 32
drp = 0.3
model = DFFNNWithDropout(hidden, drp).to(device)
print(f"Model size: {count_parameters(model)} parameters")
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Only the training batches are shuffled; evaluation order does not matter for the mean loss.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

NUMBER_OF_EPOCHES = 10
for epoch in range(NUMBER_OF_EPOCHES):
    train_loss = fit_model_epoch(model, train_loader, optimizer, criterion)
    test_loss = evaluate_model(model, test_loader, criterion)

    print(epoch, train_loss, test_loss)

## Using better gradient descent algorithm (Adam)

In [None]:
# Hidden width and dropout probability; these are the values the learning-rate annealing
# experiment in this notebook uses.
hidden = 32
drp = 0.3
model = DFFNNWithDropout(hidden, drp).to(device)
print(f"Model size: {count_parameters(model)} parameters")
criterion = torch.nn.BCELoss()
# INIT ADAM OPTIMIZER
# A new optimizer must be created for the new model: an optimizer left over from an earlier
# cell would still point at the previous model's parameters. lr=1e-3 is Adam's default.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

NUMBER_OF_EPOCHES = 5
for epoch in range(NUMBER_OF_EPOCHES):
    train_loss = fit_model_epoch(model, train_loader, optimizer, criterion)
    test_loss = evaluate_model(model, test_loader, criterion)

    print(epoch, train_loss, test_loss)

## Learning rate annealing

In [None]:
from torch.optim.lr_scheduler import ExponentialLR

hidden = 32
drp = 0.3
model = DFFNNWithDropout(hidden, drp).to(device)
print(f"Model size: {count_parameters(model)} parameters")
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# Each scheduler.step() multiplies the learning rate by gamma; it is called once per epoch below.
scheduler = ExponentialLR(optimizer, gamma=0.9)


train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)


NUMBER_OF_EPOCHES = 10
for epoch in range(NUMBER_OF_EPOCHES):
    # Read the learning rate before stepping the scheduler, so the printed value is the rate
    # this epoch actually trained with rather than the already-decayed rate of the next one.
    current_lr = optimizer.param_groups[0]["lr"]
    train_loss = fit_model_epoch(model, train_loader, optimizer, criterion)
    test_loss = evaluate_model(model, test_loader, criterion)
    scheduler.step()
    print(epoch, current_lr, train_loss, test_loss)