<a href="https://colab.research.google.com/github/Otobi1/Back-to-Basics-A-Refresher-/blob/master/Back_to_Basics_CNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
## Set up 

import numpy as np 
import pandas as pd
import random 
import torch 
import torch.nn as nn

In [3]:
SEED = 1234

In [4]:
def set_seeds(seed = 1234):
  """Seed every random number generator used in this notebook.

  Args:
    seed: integer passed unchanged to NumPy, Python's `random` and PyTorch.
  """
  seeders = (
      np.random.seed,             # NumPy global generator
      random.seed,                # Python standard library
      torch.manual_seed,          # PyTorch
      torch.cuda.manual_seed,     # PyTorch CUDA
      torch.cuda.manual_seed_all, # multi GPU
  )
  for apply_seed in seeders:
    apply_seed(seed)

In [5]:
# Set seed for reproducibility 

set_seeds(seed=SEED)

In [6]:
# Set device 

# Opt-in switch: set to False to force CPU execution even when a GPU is available.
cuda = True
# Use the GPU only when it is both requested (flag above) and actually present.
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
# Default tensor type is set to the CPU float type first ...
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
  # ... and switched to the CUDA float type when the GPU was selected.
  torch.set_default_tensor_type("torch.cuda.FloatTensor")
# NOTE(review): torch.set_default_tensor_type appears to be deprecated in recent
# PyTorch releases — confirm against the installed version before relying on it.
print (device)

cuda


In [7]:
# Load data 
# - corpus of news article from http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html 

url = "https://raw.githubusercontent.com/GokuMohandas/madewithml/main/datasets/news.csv"
df = pd.read_csv(url, header = 0) # load
df = df.sample(frac = 1).reset_index(drop = True) # shuffle
df.head()

Unnamed: 0,title,category
0,Sharon Accepts Plan to Reduce Gaza Army Operat...,World
1,Internet Key Battleground in Wildlife Crime Fight,Sci/Tech
2,July Durable Good Orders Rise 1.7 Percent,Business
3,Growing Signs of a Slowing on Wall Street,Business
4,The New Faces of Reality TV,World


In [8]:
# Preprocessing 
# - to clean up the data, convert to lower text, remove filler words and filter using regex.

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [9]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we']


In [10]:
def preprocess(text, stopwords = STOPWORDS):
  """Conditional preprocessing on our text unique to the task.

  Lower-cases the text, strips stopwords and parenthesised spans, then
  reduces everything that is not a letter or digit to single spaces.

  Args:
    text: raw input string.
    stopwords: iterable of words to drop (matched as whole words after
      lower-casing the text). An empty iterable disables stopword removal.

  Returns:
    The cleaned string, with no leading/trailing or repeated spaces.
  """
  # Lower
  text = text.lower()

  # Remove stopwords. Each word is escaped so that an entry containing regex
  # metacharacters is matched literally instead of corrupting the pattern.
  escaped_stopwords = [re.escape(word) for word in stopwords]
  if escaped_stopwords:
    pattern = re.compile(r"\b(" + r"|".join(escaped_stopwords) + r")\b\s*")
    text = pattern.sub(" ", text)

  # Remove words in parenthesis
  text = re.sub(r"\([^)]*\)", " ", text)

  # Spacing and filters
  text = re.sub(r"([-;.,!?<=>])", r" \1 ", text) # pad punctuation with spaces
  text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric characters
  text = re.sub(' +', ' ', text) # remove multiple spaces
  text = text.strip()

  return text

In [11]:
# Sample 

text = "Great week for the NYSE"
preprocess(text = text)

'great week nyse'

In [12]:
# Apply to dataframe

preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)
print (f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")

Sharon Accepts Plan to Reduce Gaza Army Operation, Haaretz Says

sharon accepts plan reduce gaza army operation haaretz says


In [13]:
# If you have preprocessing steps that are calculated from the data (e.g. standardisation), you need to separate the training and test sets first before applying those operations.
# This is because we must not accidentally apply any knowledge gained from the test set (data leak during preprocessing/training).
# - For global preprocessing steps like the function above, where we aren't learning anything from the data itself, we can perform them before splitting the data.

In [14]:
# Split the data 

import collections
from sklearn.model_selection import train_test_split

In [15]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [16]:
def train_val_test_split(X, y, train_size):
  """Split a dataset into stratified train, validation and test splits.

  Args:
    X: input samples.
    y: labels aligned with `X`; used to stratify both splits.
    train_size: proportion (or count) of samples assigned to the train split.
      The remainder is divided equally between validation and test.

  Returns:
    X_train, X_val, X_test, y_train, y_val, y_test
  """
  # Honour the caller's `train_size` rather than a module-level constant.
  X_train, X_, y_train, y_ = train_test_split(X, y, train_size = train_size, stratify = y)
  # Halve the held-out portion into validation and test.
  X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size = 0.5, stratify = y_)
  return X_train, X_val, X_test, y_train, y_val, y_test

In [17]:
# Data
X = preprocessed_df["title"].values 
y = preprocessed_df["category"].values

In [18]:
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X = X, y = y, train_size = TRAIN_SIZE)

# Report the size of every split; each line is labelled with the split it shows.
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} -> {y_train[0]}")


X_train: (84000,), y_train: (84000,)
X_val: (18000,), y_val: (18000,)
X_train: (18000,), y_test: (18000,)
Sample point: china battles north korea nuclear talks -> World


In [19]:
# Label Encoding 
# to encode the text labels into unique indices

import itertools

In [20]:
class LabelEncoder(object):
  """Label encoder for tag labels.

  Maps each class label to an integer index (`class_to_index`) and back
  (`index_to_class`); `classes` lists the known labels.
  """
  def __init__(self, class_to_index = None):
    """Create an encoder, optionally from an existing label -> index mapping."""
    # Copy into a fresh dict so instances never share state with each other
    # (the previous `{}` default was shared) or with the caller's mapping.
    self.class_to_index = dict(class_to_index) if class_to_index else {}
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())

  def __len__(self):
    """Number of known classes."""
    return len(self.class_to_index)

  def __str__(self):
    return f"<LabelEncoder(num_classes = {len(self)})>"

  def fit(self, y):
    """Learn the label -> index mapping from the labels in `y`.

    Indices follow the sorted order of the unique labels. Any previously
    held mapping is replaced. Returns `self` so calls can be chained.
    """
    # Use the argument (not a global), and convert NumPy scalars to native
    # Python values so the mapping stays JSON-serialisable in `save`.
    classes = np.unique(y).tolist()
    self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
    return self

  def encode(self, y):
    """Return an integer array with the index of every label in `y`.

    Raises KeyError for a label that was not seen during `fit`.
    """
    encoded = np.zeros((len(y)), dtype = int)
    for i, item in enumerate(y):
      encoded[i] = self.class_to_index[item]
    return encoded

  def decode(self, y):
    """Return the list of labels corresponding to the indices in `y`."""
    return [self.index_to_class[item] for item in y]

  def save(self, fp):
    """Write the label -> index mapping as JSON to the path `fp`."""
    with open(fp, "w") as fp:
      contents = {"class_to_index": self.class_to_index}
      json.dump(contents, fp, indent = 4, sort_keys = False)

  @classmethod
  def load(cls, fp):
    """Build an encoder from a JSON file previously written by `save`."""
    with open(fp, "r") as fp:
      kwargs = json.load(fp = fp)
    return cls(**kwargs)

In [21]:
# Encode 

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

In [22]:
# Converting labels to tokens 

print (f"y_train[0]: {y_train[0]}")

y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)

print (f"y_train[0]: {y_train[0]}")

y_train[0]: World
y_train[0]: 3


In [23]:
# Class weights 

counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"counts: {counts}\nweights: {class_weights}")

counts: [21000 21000 21000 21000]
weights: {0: 4.761904761904762e-05, 1: 4.761904761904762e-05, 2: 4.761904761904762e-05, 3: 4.761904761904762e-05}


In [24]:
# Tokenizer 

import json
from collections import Counter
from more_itertools import take

In [25]:
class Tokeniser(object):
  """Word- or character-level tokeniser mapping tokens to integer indices.

  Index 0 is reserved for the padding token and index 1 for the
  out-of-vocabulary token; fitted tokens are numbered from 2 onwards in
  order of decreasing frequency.
  """
  def __init__(self, char_level, num_tokens = None,
               pad_token = "<PAD>", oov_token = "<UNK>",
               token_to_index = None):
    """
    Args:
      char_level: True to tokenise per character, False to split on spaces.
      num_tokens: total vocabulary size including the two special tokens
        (None keeps every token seen during fitting).
      pad_token: token stored at index 0 of a fresh vocabulary.
      oov_token: token substituted for anything outside the vocabulary.
      token_to_index: existing vocabulary to reuse instead of a fresh one.
    """
    self.char_level = char_level
    # Characters are re-joined without a separator, words with a single space.
    self.separator = "" if self.char_level else " "
    if num_tokens: num_tokens -= 2 # pad + unk tokens
    self.num_tokens = num_tokens
    self.oov_token = oov_token
    if not token_to_index:
      # Build from the supplied special tokens so that a custom `oov_token`
      # is actually present for the lookups in `texts_to_sequence`.
      token_to_index = {pad_token: 0, oov_token: 1}
    self.token_to_index = token_to_index
    self.index_to_token = {v: k for k, v in self.token_to_index.items()}

  def __len__(self):
    """Vocabulary size, special tokens included."""
    return len(self.token_to_index)

  def __str__(self):
    return f"<Tokeniser(num_tokens = {len(self)})>"

  def fit_on_texts(self, texts):
    """Add the most frequent tokens of `texts` to the vocabulary.

    Also records `min_token_freq`, the frequency of the rarest token kept
    (0 when no token was counted). Returns `self`.
    """
    if self.char_level:
      all_tokens = [token for text in texts for token in text]
    else:
      all_tokens = [token for text in texts for token in text.split(" ")]
    counts = Counter(all_tokens).most_common(self.num_tokens)
    # Guard against an empty corpus / zero token budget.
    self.min_token_freq = counts[-1][1] if counts else 0
    for token, count in counts:
      # Never re-index a token that is already known (e.g. a special token
      # appearing literally in the text); that would corrupt both mappings.
      if token in self.token_to_index:
        continue
      index = len(self)
      self.token_to_index[token] = index
      self.index_to_token[index] = token
    return self

  def texts_to_sequence(self, texts):
    """Convert each text into a NumPy array of token indices.

    Tokens missing from the vocabulary map to the index of `oov_token`.
    """
    oov_index = self.token_to_index[self.oov_token]
    sequences = []
    for text in texts:
      if not self.char_level:
        text = text.split(' ')
      sequence = [self.token_to_index.get(token, oov_index) for token in text]
      sequences.append(np.asarray(sequence))
    return sequences

  def sequences_to_texts(self, sequences):
    """Convert sequences of indices back into strings.

    Unknown indices are rendered as `oov_token`.
    """
    texts = []
    for sequence in sequences:
      tokens = [self.index_to_token.get(index, self.oov_token) for index in sequence]
      texts.append(self.separator.join(tokens))
    return texts

  def save(self, fp):
    """Write the tokeniser configuration and vocabulary as JSON to path `fp`."""
    with open(fp, "w") as fp:
      contents = {
          "char_level": self.char_level,
          "oov_token": self.oov_token,
          "token_to_index": self.token_to_index
      }
      json.dump(contents, fp, indent = 4, sort_keys = False)

  @classmethod
  def load(cls, fp):
    """Build a tokeniser from a JSON file previously written by `save`."""
    with open(fp, "r") as fp:
      kwargs = json.load(fp = fp)
    return cls(**kwargs)

In [26]:
## - we will restrict the number of tokens in our tokenizer to the top 500 most frequent tokens (stop words already removed)
# -- because the full vocabulary (approx 30k) is too large to run on google colab

# ** it is important that we are only using the training data split because during inference, the model will not always know every token
# -- so it is important to replicate that scenario with the validation and test split. 

In [27]:
# Tokenise

tokeniser = Tokeniser(char_level = False, num_tokens = 500)
tokeniser.fit_on_texts(texts = X_train)
VOCAB_SIZE = len(tokeniser)

print (tokeniser)

<Tokeniser(num_tokens = 500)>


In [28]:
# Sample of tokens 

print (take(5, tokeniser.token_to_index.items()))
print (f"least freq tokens freq: {tokeniser.min_token_freq}") # use this to adjust num tokens

[('<PAD>', 0), ('<UNK>', 1), ('39', 2), ('b', 3), ('gt', 4)]
least freq tokens freq: 166


In [29]:
# Convert texts to sequences of indices 

X_train = tokeniser.texts_to_sequence(X_train)
X_val = tokeniser.texts_to_sequence(X_val)
X_test = tokeniser.texts_to_sequence(X_test)

preprocessed_text = tokeniser.sequences_to_texts([X_train[0]])[0]
print ("Text to indices: \n"
    f" (preprocessed) -> {preprocessed_text}\n"
    f" (tokenised) -> {X_train[0]}")

Text to indices: 
 (preprocessed) -> china <UNK> north korea nuclear talks
 (tokenised) -> [ 16   1 285 142 114  24]


In [30]:
# One-hot Encoding 

# - Creates a binary column for each unique value of each feature.
# -- All the values in a token's vector will be 0 except at the index of that specific token

In [31]:
def to_categorical(seq, num_classes):
  """Turn a sequence of token indices into a one-hot matrix.

  Args:
    seq: sequence of integer token indices.
    num_classes: width of each one-hot row.

  Returns:
    Float array of shape (len(seq), num_classes) with a single 1 per row.
  """
  encoded = np.zeros(shape = (len(seq), num_classes))
  row = 0
  for token_index in seq:
    # Switch on the column that belongs to this token.
    encoded[row, token_index] = 1.
    row += 1
  return encoded

In [32]:
# One-hot encoding 
print (X_train[0])
print (len(X_train[0]))
cat = to_categorical(seq = X_train[0], num_classes = len(tokeniser))

print (cat)
print (cat.shape)

[ 16   1 285 142 114  24]
6
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(6, 500)


In [33]:
# Convert tokens to one-hot

vocab_size = len(tokeniser)
X_train = [to_categorical(seq, num_classes = vocab_size) for seq in X_train]
X_val = [to_categorical(seq, num_classes = vocab_size) for seq in X_val]
X_test = [to_categorical(seq, num_classes = vocab_size) for seq in X_test]

In [34]:
# Padding 

# - all the inputs have varying lengths, but each batch needs to be uniformly shaped
# - we can use padding to make all the inputs in the batch the same length
# - the padding index will be 0

## ** one-hot encoding creates a batch of shape (N, max_seq_len, vocab_size) so we'll need to pad 3D sequences 

In [35]:
def pad_sequences(sequences, max_seq_len = 0):
  """Zero-pad 2D sequences into one 3D array of uniform length.

  Args:
    sequences: collection of arrays shaped (seq_len, num_classes).
    max_seq_len: minimum length to pad to; the longest sequence wins if larger.

  Returns:
    Array of shape (len(sequences), padded_len, num_classes).
  """
  longest = max(len(seq) for seq in sequences)
  target_len = max(max_seq_len, longest)
  width = sequences[0].shape[-1]
  padded = np.zeros((len(sequences), target_len, width))
  for row, seq in enumerate(sequences):
    # Copy the real rows; everything after them stays zero.
    padded[row, :len(seq)] = seq
  return padded

In [36]:
# 3D sequences 

print (X_train[0].shape, X_train[1].shape, X_train[2].shape)
padded = pad_sequences(X_train[0:3])
print (padded.shape)

(6, 500) (5, 500) (6, 500)
(3, 6, 500)


In [37]:
# Dataset
# - here we need to create datasets and dataloaders to be able to efficiently create batches with the data splits 

FILTER_SIZE = 1 # unigram

In [38]:
class Dataset(torch.utils.data.Dataset):
  """Pairs inputs with labels and knows how to batch them for a DataLoader."""

  def __init__(self, X, y, max_filter_size):
    self.X, self.y = X, y
    # Passed to `pad_sequences` as the minimum padded length of a batch.
    self.max_filter_size = max_filter_size

  def __len__(self):
    return len(self.y)

  def __str__(self):
    return f"<Dataset(N = {len(self)})>"

  def __getitem__(self, index):
    return [self.X[index], self.y[index]]

  def collate_fn(self, batch):
    """Pad the inputs of a batch to a common length and convert to tensors."""
    # Split the [X, y] pairs column-wise.
    samples = np.array(batch, dtype = object)
    inputs = samples[:, 0]
    targets = np.stack(samples[:, 1], axis = 0)

    # Every input in the batch gets the same sequence length.
    inputs = pad_sequences(inputs, max_seq_len = self.max_filter_size)

    # Float inputs for the model, long targets for the loss.
    inputs = torch.FloatTensor(inputs.astype(np.int32))
    targets = torch.LongTensor(targets.astype(np.int32))
    return inputs, targets

  def create_dataloader(self, batch_size, shuffle = False, drop_last = False):
    """Wrap this dataset in a DataLoader that batches with `collate_fn`."""
    loader_options = dict(
        dataset = self,
        batch_size = batch_size,
        collate_fn = self.collate_fn,
        shuffle = shuffle,
        drop_last = drop_last,
        pin_memory = True,
    )
    return torch.utils.data.DataLoader(**loader_options)

In [39]:
# Create datasets for embedding 

train_dataset = Dataset(X = X_train, y = y_train, max_filter_size = FILTER_SIZE)
val_dataset = Dataset(X = X_val, y = y_val, max_filter_size = FILTER_SIZE)
test_dataset = Dataset(X = X_test, y = y_test, max_filter_size = FILTER_SIZE)
print ("Datasets: \n"
    f" Train dataset: {train_dataset.__str__()}\n"
    f" Val dataset: {val_dataset.__str__()}\n"
    f" Test dataset: {test_dataset.__str__()}\n"
    "Sample Point: \n"
    f" X: {test_dataset[0][0]}\n"
    f" y: {test_dataset[0][1]}")

Datasets: 
 Train dataset: <Dataset(N = 84000)>
 Val dataset: <Dataset(N = 18000)>
 Test dataset: <Dataset(N = 18000)>
Sample Point: 
 X: [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
 y: 1


In [40]:
# Create dataloaders 

batch_size = 64
train_dataloader = train_dataset.create_dataloader(batch_size = batch_size)
val_dataloader = val_dataset.create_dataloader(batch_size = batch_size)
test_dataloader = test_dataset.create_dataloader(batch_size = batch_size)
batch_X, batch_y = next(iter(test_dataloader))
print ("Sample batch:\n"
    f" X: {list(batch_X.size())}\n"
    f" y: {list(batch_y.size())}\n"
    "Sample point:\n"
    f" X: {batch_X[0]}\n"
    f" y: {batch_y[0]}")

Sample batch:
 X: [64, 14, 500]
 y: [64]
Sample point:
 X: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cpu')
 y: 1


In [41]:
# CNN - Convolutional Neural Networks 

# - Here we will learn about CNNs by applying them on 1D text data
# In the example below, we have a batch of N samples where each sample has 8 characters and each char is represented by an array of 10 values (vocab size = 10)
# - this gives our inputs the size (N, 8, 10)

# -- with PyTorch, when dealing with convs, the inputs X need to have the channels as the second dimension, so our inputs will be (N,10,8)

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [42]:
# Assume all our inputs are padded to have the same number of words

batch_size = 64
max_seq_len = 8 # words per input
vocab_size = 10 # one hot size
x = torch.randn(batch_size, max_seq_len, vocab_size)
print (f" X:{x.shape}")
x = x.transpose(1, 2)
print (f" X:{x.shape}")

 X:torch.Size([64, 8, 10])
 X:torch.Size([64, 10, 8])


In [43]:
# At the core of CNNs are filters (also known as weights, kernels, etc.), which convolve (slide) across our input to extract relevant features.
# The filters are initialised randomly but learn to act as feature extractors via parameter sharing.

# We will use a conv1d layer to process our inputs

In [44]:
# Convolutional filters (Valid padding)

vocab_size = 10 # one-hot size (number of input channels)
num_filters = 50 # num of filters (number of output channels)
filter_size = 3 # each 1D filter spans 3 consecutive positions across all input channels
stride = 1
padding = 0 # valid padding (no padding)
conv1 = nn.Conv1d(in_channels = vocab_size, out_channels = num_filters, 
                  kernel_size = filter_size, stride = stride, 
                  padding = padding, padding_mode = "zeros")
# Weight shape is (num_filters, vocab_size, filter_size).
print ("conv: {}".format(conv1.weight.shape))

conv: torch.Size([50, 10, 3])


In [45]:
# Forward pass
z = conv1(x)
print (f"z: {z.shape}")

z: torch.Size([64, 50, 6])


In [46]:
# Now, we'll add padding so that the convolutional outputs are the same shape as our inputs. 
# - we want our output to have the same width as our input. 

In [47]:
# Convolutional filters (Same padding)

vocab_size = 10 # one-hot size (number of input channels)
num_filters = 50 # num filters (number of output channels)
filter_size = 3 # each 1D filter spans 3 consecutive positions across all input channels
stride = 1
# No `padding` argument is passed here; the input is padded explicitly with
# F.pad before the forward pass further below.
conv = nn.Conv1d(in_channels = vocab_size, out_channels = num_filters, 
                 kernel_size = filter_size, stride = stride)
print ("conv: {}".format(conv.weight.shape))

conv: torch.Size([50, 10, 3])


In [48]:
# Same padding 

# Total amount of padding needed for the output width to equal the input width.
total_padding = conv.stride[0] * (max_seq_len - 1) - max_seq_len + filter_size
# Split it across both sides; an odd remainder goes to the right-hand side.
padding_left = int(total_padding / 2)
padding_right = int(math.ceil(total_padding / 2))
print (f"padding: {(padding_left, padding_right)}")

padding: (1, 1)


In [49]:
# Forward pass 

z = conv(F.pad(x, (padding_left, padding_right)))
print (f"z: {z.shape}")

z: torch.Size([64, 50, 8])


In [51]:
# Pooling 

# - the result of convolving filters over an input is a feature map. Due to the nature of convolution and overlaps, our feature map will have lots of redundant information.
# - Pooling is a way to summarise a high-dimensional feature map into a lower dimensional one for simplified downstream computation. 
# -- the pooling operation can be the max value, average etc

# Max pooling 

pool_output = F.max_pool1d(z, z.size(2))
print ("Size: {}".format(pool_output.shape))

Size: torch.Size([64, 50, 1])


In [52]:
# Batch normalisation 

# - this is an operation that will standardise the activations from the previous layer 
# - recall that we've previously standardised our inputs so that the model can optimise quickly with larger learning rates
# - here we will use the same concept  but we will continue to maintain standardised values throughout the forward pass to further aid optimisation

In [53]:
batch_norm = nn.BatchNorm1d(num_features = num_filters)
z = batch_norm(conv(x)) # applied to activations (after conv layer & before pooling)
print (f"z: {z.shape}")

z: torch.Size([64, 50, 6])


In [54]:
# Mean and std before batchnorm
# Both statistics come from `conv`, the layer whose output is fed to
# `batch_norm`, and the activations are computed once.
pre_norm = conv(x)
print (f"mean: {torch.mean(pre_norm):.2f}, std: {torch.std(pre_norm):.2f}")

mean: 0.01, std: 0.57


In [55]:
# Mean and std after batchnorm
print (f"mean: {torch.mean(z):.2f}, std: {torch.std(z):.2f}")

mean: 0.00, std: 1.00
