In [2]:
# settings for tutorial presentation with RISE
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {
              'width': '100%',
              'height': '100%',
              'scroll': True,
              'enable_chalkboard': True,
})


ModuleNotFoundError: No module named 'notebook'

# ASDS 2

## Tutorial: PyTorch basics

Clara Vandeweerdt (based on work by Anna Rogers)

# Why PyTorch?

* TensorFlow (Google)   
* PyTorch (Facebook)  
* Keras (François Chollet)

![alternative text](figures/num_hf_models_2023.png)


![alternative text](figures/Fraction-of-Papers-Using-PyTorch-vs.-TensorFlow.png)


In [None]:
# package setup
!pip install sklearn numpy torch datasets

In [3]:
import torch
import torch.utils.data
import torch.nn.functional as F
import scipy
import pandas as pd
import numpy as np
from datasets import load_dataset, logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
# enabling inline plots in Jupyter
%matplotlib inline
# disabling verbose messages from dataset library
logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


# Introduction to PyTorch

Just like NumPy, PyTorch provides basic functions for creating tensors and common operations on them. Tensors (in this context) are just n-dimensional arrays of numbers. So they can contain vectors, matrices and so on.

In [32]:
# creating a tensor with numbers 1--5
a = torch.FloatTensor([1,2,3,4,5])
# same but in descending order
b = torch.FloatTensor([5,4,3,2,1])

In [33]:
a

tensor([1., 2., 3., 4., 5.])

In [34]:
b

tensor([5., 4., 3., 2., 1.])

In [35]:
# basic operations mostly work like in numpy
a + b

tensor([6., 6., 6., 6., 6.])

In [8]:
#we can also convert numpy arrays to torch tensors
c = np.array([1, 2, 3, 4])
print("type before conversion:", type(c))
c = torch.from_numpy(c)
print("type after conversion:", type(c))
print(c)

type before conversion: <class 'numpy.ndarray'>
type after conversion: <class 'torch.Tensor'>
tensor([1, 2, 3, 4])


## Loading the tweet_eval classification dataset

In [38]:
# load the 3-class sentiment classification model from tweet_eval
train = load_dataset('tweet_eval', 'sentiment', split='train')
val = load_dataset('tweet_eval', 'sentiment', split='validation')
test = load_dataset('tweet_eval', 'sentiment', split='test')

# vectorizing the data with TF-IDF corpus
vectorizer = TfidfVectorizer() # the default ngram range is (1,1)

train_corpus = [x["text"] for x in train][:10000]
train_labels = np.array([x["label"] for x in train][:10000])
train_features = vectorizer.fit_transform(train_corpus).toarray()

val_corpus = [x["text"] for x in val]
val_labels = np.array([x["label"] for x in val])
val_features = vectorizer.transform(val_corpus).toarray()

## Converting tweet_eval data to torch tensors

In this exercise, we will be working with a new HuggingFace dataset: `tweet_eval`. It contains tweets, hand-classified into three categories: negative, neutral, and positive. After loading the dataset (not shown), and turning the tweets into TF-IDF features, we need to convert the features and labels to torch tensors.


In [39]:
# torch.from_numpy turns these from np.arrays into torch.tensors
x_train = torch.from_numpy(train_features).float()
y_train = torch.from_numpy(train_labels)

In [11]:
#same for the test data
x_test = torch.from_numpy(val_features).float()
y_test = torch.from_numpy(val_labels)

# Datasets and a recap of classes

In PyTorch, we normally put data into objects of the [Dataset](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.Dataset) class. When we create PyTorch models, those models will also be classes. That's why we could use a recap of classes in Python.

## Classes in Python

Almost everything in Python is an object. Objects have classes that tell us what kind of objects they are. The class of an object determines what *attributes* it can have (for example, an object of class Pet can have its species as an attribute), and what *methods* we can apply to it (what it can do; for example, we can ask any Pet to tell us what species it is.)

In [43]:
class Pet:
 
    # What happens when we create an instance of Pet
    def __init__(self, pet_species):
        self.species = pet_species
 
    # A method for a Pet (something it can do)
    def whatareyou(self):
        print("I'm a", self.species)
 
 
# Object instantiation
Rodger = Pet("poodle")

# Using class attributes and methods
print(Rodger.species)

poodle


In [42]:
Rodger.whatareyou()

I'm a poodle


We can also create subclasses. Subclasses have all the same attributes and methods as their parent class, and then we can modify them or add some.

In [44]:
class Dog(Pet):
    
    # A method for a Dog (something it can do)
    def bark(self):
        print("woof woof")

# Object instantiation
Jazz = Dog("poodle")

# Using class attributes and methods
print(Jazz.species)
Jazz.whatareyou()
Jazz.bark()

poodle
I'm a poodle
woof woof


## The Dataset Class

We are going to define our own subclass `TweetEvalData` that is a special case of the `Dataset` class.

In [46]:
# Making a subclass of the torch Datasets class

class TweetEvalData(torch.utils.data.Dataset):

    def __init__(self, X, y): # describes how the dataset is initialized; the arguments (when initializing) are the features and labels
        self.X = X #any instance of the TweetEvalData class will have an attribute X that contains its features 
        self.y = y #same with an attribute y that contains its labels

    def __getitem__(self, index): # getitem allows to retrieve a datapoint from the dataset by its index.
        X = self.X[index] 
        y = self.y[index].unsqueeze(0) #tensor is unsqueezed to ensure correct shape for training
        return X, y # methods returns corresponding data point to input index by an x and y tensor 

    # a helper to check and return the size of the dataset
    def __len__(self):
        return len(self.y) # Returning the number of labels in the data


`TweetEvalData` is a class, so to use it you need to instantiate it.

In [15]:
# Initializing datasets

dataset_class_train = TweetEvalData(x_train, y_train) # initiating an instance of the class that contains the train data
dataset_class_val = TweetEvalData(x_test, y_test) #initating an instance of the class that contains the test data

We also need so-called `Dataloaders` for each of these data sets. A `Dataloader` allows us to easily iterate over samples of data and corresponding labels during training and evaluation of our model.

In [47]:
# Initiating dataloader

train_loader = torch.utils.data.DataLoader(dataset_class_train, batch_size = 64)
val_loader = torch.utils.data.DataLoader(dataset_class_val, batch_size = 64)

Batch size is the number of samples that are processed before the model is updated while training; each step of our gradient descent is based on one batch of data.

# Defining and training a model in PyTorch

## Basic model architecture: a linear classifier

All the models we will implement in PyTorch will be subclasses of the existing [`torch.nn.Module` class](https://pytorch.org/docs/stable/nn.html?highlight=module#torch.nn.Module). In the `__init__` method of your model should define all the layers you are going to use. The `forward()` method defines the order of the layers, and so, how the model should produce outputs given the inputs.

In [17]:
# define a class for a linear classifier
class LinearClassifier(torch.nn.Module):
    # initialization parameters
    def __init__ (self, n_features, n_classes):
        super().__init__()
        # we will have only one linear layer which takes the given number of features as its inputs,
        # and outputs a score for each of the given number of classes
        self.linear = torch.nn.Linear(n_features, n_classes)

    # you always need to define the forward() method which defines how your model performs
    # forward propagation
    def forward(self, x):
        linear_out = self.linear(x)
        return linear_out

## Setting things up for training

1) instantiating the model
2) choosing a loss function (e.g. [Mean Squared Error loss](https://pytorch.org/docs/stable/nn.html#torch.nn.MSELoss) or [Cross Entropy loss](https://pytorch.org/docs/stable/nn.html#torch.nn.CrossEntropyLoss)) and computing the loss
3) choosing an optimizer--we will be working with the Adam optimizer


In [24]:
# instantiate the model with the input size equal to the number of features in the data
myLC = LinearClassifier(n_features=18484, n_classes=3)

# setting up the loss function component 
# which will implicitly perform softmax on linear layer outputs
loss_function = torch.nn.CrossEntropyLoss()
# setting up the optimizer (stochastic gradient descent)
optimizer = torch.optim.Adam(myLC.parameters(), lr=0.001)

## The training loop

For every batch (subset of data points):

1) feed the inputs to the model to get predictions

2) compare to targets (labels) to compute the loss

3) compute the gradients based on this loss using `loss.backward()`. This is where PyTorch does magic with automatic differentiation

4) update the weights of the model using `optimizer.step()`

Repeat (go over the whole dataset in batches again) for as many "epochs" as you want.

In [25]:
for epoch in range(5):
    losses = []
    for batch_index, (inputs, targets) in enumerate(train_loader):    
    
        #backward function accumulates gradients by default
        # use .zero_grad() to start from scratch each time
        optimizer.zero_grad()

        # Forward pass
        outputs = myLC(inputs)
        # Compute Loss, given the true labels for the training data
        targets = torch.flatten(targets)
        targets = targets.type(torch.LongTensor) # Converting targets as required for loss function
        loss = loss_function(outputs, targets)
        
        # Backward pass
        # performs a parameter update based on the current gradient (stored in .grad attribute of a parameter)
        # and the update rule
        loss.backward()
        optimizer.step()
        
        # keeping track of the loss values
        losses.append(loss.item())
        
    print(np.mean(losses))

1.0561096501198544
0.9913876079449988
0.9516727871196285
0.9204518233135248
0.892812967300415


# Evaluating the trained model

Once the model is trained, we can get predictions for the test set and check accuracy. A key difference with training is that the backward pass is no longer needed.

In [52]:
# disabling gradient updates 
# this will reduce memory consumption, and is a good practice at inference time
predictions = []
with torch.no_grad():
    for batch_index, (inputs, targets) in enumerate(val_loader):
    
        # making predictions on the test set and evaluating the model
        outputs = myLC(inputs)
    
        # now we want accuracy, not loss. So we need an actual prediction:
        # we will predict the class with the highest score
        vals, indices = torch.max(outputs, 1)
        predictions += indices.tolist()

acc = accuracy_score(predictions, val_labels)
print(f'Model accuracy: {acc}')

Model accuracy: 0.58
