# Hand-written digit recognition

In [244]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [245]:
mnist_train = datasets.MNIST(root='~/data',train=True, download=True)
mnist_test = datasets.MNIST(root='~/data',train=False, download=True)

In [247]:
len(mnist_train)

60000

In [246]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as opt
import torch.nn.functional as F
import string

In [None]:
class MNIST_data(Dataset):
    def __init__(self, data):
        self.data = data
        s
    
    def __len__(self):
        return len

# Data preprocessing

In [1]:
## The data is available in PTB format 
## We will first convert them into label,text format

In [172]:
from nltk.tree import Tree
from sklearn.metrics import accuracy_score

In [10]:
# If you are not familiar with parsing, just use this function as black-box
# It returns two lists - sentences - (the text to classify) and labels - (the corresponding sentiment labels 0/1)
def get_data(fname):
    labels = []
    sentences = []
    with open(fname, encoding='utf8') as fs:
        for line in fs:
            label = 0 if int(line[1])<2 else 1
            sentence = ' '.join(Tree.fromstring(line.strip()).leaves())
            labels.append(label)
            sentences.append(sentence)
    return sentences, labels

In [11]:
train_sents, train_labels = get_data('Data/SST_trees/train.txt')

In [12]:
test_sents, test_labels = get_data('Data/SST_trees/test.txt')

In [13]:
val_sents, val_labels = get_data('Data/SST_trees/dev.txt')

In [14]:
train_sents[0]

"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."

In [15]:
train_labels[0]

1

In [17]:
## We have to represent the words with some feature vectors, they cannot be directly used as input to the model
## There are a few ways of doing it - 
#1. Character-level encoding - represent each character by a 1-hot encoding and stack each character on top of the 
# other in order og appearence to create a 2d matrix
#2. Use some pretrained embedding of the words and then statck one over the other to form a 2d matrix as input
#3. Start with some random initialization of the embeddings and then learn the embeddings along the way 
# This works for both character and word-based ones. The embedding module in Pytorch comes in handy in this case

In [19]:
# we will deploy a character-level CNN
# We will also train in batches.. The problem with text is that not all datapoints will be of same length.. This would make 
# it difficult to fit them in batches due to size mismatch.
# What we will do is set a max-length in terms of number of characters for each datapoint
# If the text is shorter in length than this max-length we will pad with zeros
# For the text with longer in length, the characters exceeding this max-length will be removed
# The same techniqe could be adopted if you intend to deploy a word-level architecture.
# The max-length would then be defined in terms of number of words

In [26]:
string.printable #list of all characters which will form our vocabulary

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [204]:
class SST_data(Dataset):
    def __init__(self, train_sents, train_labels, max_length=512):
        self.data = train_sents
        self.labels = train_labels
        self.max_length = max_length
        self.length = len(train_labels)
        self.vocabulary = string.printable
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, item):
        text = self.data[item]
        label = self.labels[item]
        inp = [self.vocabulary.find(c) + 1 for c in text] 
        # replace each chacter in the text with
        # index 0 will be used as a special token for padding hence the indexing starts from 1
        if len(inp)<=self.max_length:
            inp.extend([0 for _ in range(self.max_length - len(inp))])
        else:
            inp = inp[:self.max_length]
        
        inp = torch.tensor(inp, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)
        return inp, label

In [167]:
train_data = SST_data(train_sents, train_labels)
test_data = SST_data(test_sents, test_labels)
val_data = SST_data(val_sents, val_labels)

# Architecture

<img src="Images/CNN_text_classification.png">

In [209]:
class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_size, filters=64, max_len=512):
        super(Classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)
        self.conv = nn.Conv2d(in_channels=1, out_channels=filters, kernel_size=(kernel_size, embedding_dim))
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(max_len-kernel_size+1)
        self.linear = nn.Linear(filters, output_size)
        self.sigmoid = nn.LogSigmoid()
    
    def forward(self, inp):
        x = self.embedding(inp)
        x = x.unsqueeze(1)
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x.squeeze(3))
        x = x.squeeze(2)
        return  self.sigmoid(self.linear(x))

In [191]:
clf = Classifier(100, 200, 2, 3)

# Training and evaluation

In [192]:
def train(clf, train_data, val_data, epochs=10, learning_rate=0.0001):
    optimizer = opt.Adam(clf.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    for _ in range(epochs):
        
        for data, labels in DataLoader(train_data, batch_size=32, shuffle=True):
            out = clf(data)
            loss = criterion(out, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        score = evaluate(clf, val_data)
        print(f"Validation accuracy at epoch {_}: {score}")    
            

In [193]:
train(clf, train_data, val_data)

Validation accuracy at epoch 0: 0.6148955495004541
Validation accuracy at epoch 1: 0.6403269754768393
Validation accuracy at epoch 2: 0.6512261580381471
Validation accuracy at epoch 3: 0.662125340599455
Validation accuracy at epoch 4: 0.6584922797456857
Validation accuracy at epoch 5: 0.659400544959128
Validation accuracy at epoch 6: 0.6793823796548593
Validation accuracy at epoch 7: 0.6821071752951862
Validation accuracy at epoch 8: 0.6911898274296094
Validation accuracy at epoch 9: 0.6920980926430518


In [184]:
def evaluate(clf, test_data):
    
    true_labels = []
    inf_labels = []
    
    for data, labels in DataLoader(test_data, batch_size=32):
        out = clf(data)
        cls = torch.argmax(F.softmax(out, dim=1), dim=1)
        inf_labels.extend(cls.detach().numpy().tolist())
        true_labels.extend(labels.numpy().tolist())
    
    return accuracy_score(true_labels, inf_labels)

In [194]:
evaluate(clf, test_data)

0.6705882352941176

# Architecture with different window sizes

In [195]:
from nltk.tokenize import word_tokenize

In [199]:
def create_vocab(train_sents): # if we encounter a new word in test/val set we will replace it by <unk> token
    # index 0 -> padding
    # index 1 -> <unk>
    w2i = {}
    w2i['UNK'] = 1 # mapping each word to a unique id
    
    index = 2
    
    for sent in train_sents:
        words = word_tokenize(sent)
        for w in words:
            if w not in w2i:
                w2i[w] = index
                index+=1
    return w2i            

In [200]:
w2i = create_vocab(train_sents)

In [202]:
vocab_size = len(w2i)+1

In [203]:
vocab_size

18269

In [206]:
class SST_data(Dataset):
    def __init__(self, train_sents, train_labels, max_length=128):
        self.data = train_sents
        self.labels = train_labels
        self.max_length = max_length
        self.length = len(train_labels)
        self.vocabulary = string.printable
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, item):
        text = self.data[item]
        label = self.labels[item]
        inp = [w2i[w] if w in w2i else w2i['UNK'] for w in word_tokenize(text)] 
        if len(inp)<=self.max_length:
            inp.extend([0 for _ in range(self.max_length - len(inp))])
        else:
            inp = inp[:self.max_length]
        
        inp = torch.tensor(inp, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)
        return inp, label

In [207]:
train_data = SST_data(train_sents, train_labels)
test_data = SST_data(test_sents, test_labels)
val_data = SST_data(val_sents, val_labels)

<img src="Images/CNN_text_classification_2.png">

In [240]:
class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_size = [2,3,4], filters=2, max_len=128):
        super(Classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=filters, kernel_size=(kernel_size[0], embedding_dim))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=filters, kernel_size=(kernel_size[1], embedding_dim))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=filters, kernel_size=(kernel_size[2], embedding_dim))
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(max_len-kernel_size[0]+1)
        self.pool2 = nn.MaxPool1d(max_len-kernel_size[1]+1)
        self.pool3 = nn.MaxPool1d(max_len-kernel_size[2]+1)
        self.linear = nn.Linear(len(kernel_size)*filters, output_size)
        self.sigmoid = nn.LogSigmoid()
    
    def forward(self, inp):
        
        x1 = self.pool1(self.relu(self.conv1(self.embedding(inp).unsqueeze(1))).squeeze(3)).squeeze(2)
        x2 = self.pool2(self.relu(self.conv2(self.embedding(inp).unsqueeze(1))).squeeze(3)).squeeze(2)
        x3 = self.pool3(self.relu(self.conv3(self.embedding(inp).unsqueeze(1))).squeeze(3)).squeeze(2)
        out = torch.cat((x1, x2, x3),dim=1)
        return  self.sigmoid(self.linear(out))

In [241]:
clf = Classifier(vocab_size, 200, 2)

In [243]:
train(clf, train_data, val_data)

Validation accuracy at epoch 0: 0.6049046321525886
Validation accuracy at epoch 1: 0.6103542234332425
Validation accuracy at epoch 2: 0.6121707538601272
Validation accuracy at epoch 3: 0.6176203451407811
Validation accuracy at epoch 4: 0.6221616712079927
Validation accuracy at epoch 5: 0.6330608537693007
Validation accuracy at epoch 6: 0.6412352406902816
Validation accuracy at epoch 7: 0.6421435059037239
Validation accuracy at epoch 8: 0.6412352406902816
Validation accuracy at epoch 9: 0.6485013623978202
