## Today we will speak about text classification

**We will detect the polarity of movie reviews: negative or positive.**

In [None]:
import requests
import urllib
from os.path import dirname, abspath, join, exists
import os
import tarfile
import argparse
from zipfile import ZipFile

In [None]:
# Download and unpack the dataset. The download is skipped when the archive
# is already on disk, so "Restart & Run All" does not hit the network again.
import urllib.request  # `import urllib` alone does not guarantee that the `request` submodule is loaded

dataset_dir = 'data/'
# makedirs(exist_ok=True) replaces the exists() + mkdir() pair: it also creates
# missing parent directories and does not fail if the directory already exists.
os.makedirs(dataset_dir, exist_ok=True)
dataset_url = 'https://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/rt-polaritydata.tar.gz'
tar_file = 'rt-polaritydata.tar.gz'
tar_filepath = join(dataset_dir, tar_file)
if not exists(tar_filepath):
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated archive under the final name.
    partial_filepath = tar_filepath + '.part'
    urllib.request.urlretrieve(dataset_url, filename=partial_filepath)
    os.replace(partial_filepath, tar_filepath)
with tarfile.open(tar_filepath, "r") as tar:
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it on archives from a source you trust.
    tar.extractall(dataset_dir)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Locations of the negative / positive review files inside the extracted archive.
polarity_dir = join(dataset_dir, 'rt-polaritydata')
neg_filepath = join(polarity_dir, 'rt-polarity.neg')
pos_filepath = join(polarity_dir, 'rt-polarity.pos')

In [None]:
import nltk
# Tokenizer used below to split every (lower-cased) review line into tokens.
tokenizer = nltk.tokenize.WordPunctTokenizer()

In [None]:
# Fetch the NLTK stop-word corpus. If this fails, you may need to change the
# permissions (chmod) of the nltk_data folder.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Keep the stop words in a set: clean_tokens() tests membership once per token,
# which is a constant-time lookup on a set but a linear scan on a list.
stop_words = set(stopwords.words('english'))
# Sorted so that the printed output is the same on every run.
print(sorted(stop_words))

In [None]:
def clean_tokens(tokens):
    """Return the tokens that are purely alphabetic and are not stop words.

    The relative order of the kept tokens is preserved and duplicates are
    not removed. Stop words are looked up in the module-level ``stop_words``.
    """
    return [word for word in tokens if word.isalpha() and word not in stop_words]

In [None]:
SEED = 42  # fixed seed so that the train / val / test split is identical on every run


def _read_labeled_sentences(filepath, label):
    """Read one review per line from `filepath`; return a list of (clean tokens, label) pairs."""
    labeled = []
    # errors='ignore' drops bytes that cannot be decoded instead of raising
    # (best-effort behaviour kept from the original code).
    with open(filepath, 'r', errors='ignore') as reviews:
        for line in reviews:
            words = clean_tokens(tokenizer.tokenize(line.lower()))
            labeled.append((words, label))
    return labeled


# Positive reviews get label 1, negative reviews label 0.
sentences = _read_labeled_sentences(pos_filepath, 1) + _read_labeled_sentences(neg_filepath, 0)

# Hold out 10% for testing, then 20% of the remainder for validation.
train_data, test_data = train_test_split(sentences, test_size=0.1, random_state=SEED)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=SEED)
n_classes = 2

In [None]:
# Number of examples in the training and test splits.
len(train_data), len(test_data)

In [None]:
# Peek at one training example: a (tokens, label) pair.
train_data[3]

**Task: check the balance of classes.**

Not all words are equally useful. Some of them are typos or rare words that appear only a few times.

Let's count how many times each word occurs in the data so that we can build a "white list" of known words.

In [None]:
from collections import Counter

In [None]:
# Flatten the token lists of all train and test examples into one long list.
# NOTE(review): test_data contributes to the vocabulary while val_data does
# not — confirm that this is intended.
word_list = []
for split in (train_data, test_data):
    for sent_tokens, _label in split:
        word_list.extend(sent_tokens)

In [None]:
# Total number of tokens (with repetitions) collected above.
len(word_list)

**Well, there are quite few words, so we do not need a complex model. In fact, statistical methods, for example TF-IDF, may work better than a neural network on this task.**

In [None]:
# Map every distinct token to the number of times it occurs in word_list.
token_counts = Counter(word_list)

In [None]:
# All (token, count) pairs ordered from most to least frequent.
ranked = token_counts.most_common()
head, tail = ranked[:5], ranked[-3:]

print("Total unique tokens :", len(token_counts))
print('\n'.join(str(pair) for pair in head))
print('...')
print('\n'.join(str(pair) for pair in tail))

In [None]:
# Histogram of per-token occurrence counts: how many tokens occur a given
# number of times. Only counts in [0, 100] are shown, in 50 bins, with a
# log-scaled count axis.
plt.hist(list(token_counts.values()), range=[0, 10**2], bins=50, log=True)
# The trailing ';' suppresses the textual repr of the returned object.
plt.xlabel("Word counts");

In [None]:
# Minimum number of occurrences a token needs in order to enter the vocabulary.
min_count = 5

# Keep only the tokens whose count in token_counts reaches min_count.
tokens = [word for word in token_counts if token_counts[word] >= min_count]

In [None]:
# Number of tokens that passed the min_count filter.
len(tokens)

In [None]:
# Add special tokens for unknown words (UNK) and padding (PAD).
UNK, PAD = "UNK", "PAD"
# Drop the special tokens before re-adding them, so that re-running this cell
# does not prepend them a second time and shift every token id. The corpus
# tokens are lower-cased, so this filter never removes a real word.
tokens = [UNK, PAD] + sorted(token for token in tokens if token not in (UNK, PAD))
print("Vocabulary size:", len(tokens))

In [None]:
# Map every vocabulary token to its position in `tokens`.
token_to_id = {token: index for index, token in enumerate(tokens)}

In [None]:
# Integer ids of the two special tokens.
UNK_IX = token_to_id.get(UNK)
PAD_IX = token_to_id.get(PAD)

def as_matrix(sequences, max_len=None):
    """Convert a batch of token sequences into a padded matrix of token ids.

    Args:
        sequences: sized collection of token sequences (each a sequence of str).
        max_len: optional cap on the number of columns; longer sequences are
            truncated. ``None`` (or 0) means "as wide as the longest sequence".

    Returns:
        An int32 array of shape ``(len(sequences), width)``. Tokens missing
        from ``token_to_id`` are encoded as ``UNK_IX``; positions past the end
        of a sequence hold ``PAD_IX``. An empty batch yields a ``(0, 0)`` array.
    """
    # default=0 keeps an empty batch from raising ValueError in max().
    longest = max(map(len, sequences), default=0)
    width = min(longest, max_len) if max_len else longest

    matrix = np.full((len(sequences), width), np.int32(PAD_IX))
    for i, seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:width]]
        # Only the leading len(row_ix) cells are overwritten; the rest stay PAD_IX.
        matrix[i, :len(row_ix)] = row_ix

    return matrix

In [None]:
# Split the (tokens, label) training pairs into two parallel lists.
train_sentences = [tokens_ for tokens_, _ in train_data]
train_targets = [label for _, label in train_data]

In [None]:
# Free memory: the pairs have been copied into train_sentences / train_targets.
# Note that the cell building those lists cannot be re-run after this without
# re-creating train_data first.
del train_data

In [None]:
# Split the (tokens, label) test pairs into two parallel lists.
test_sentences = [tokens_ for tokens_, _ in test_data]
test_targets = [label for _, label in test_data]

In [None]:
# Free memory: the pairs now live in test_sentences / test_targets.
del test_data

In [None]:
def make_batch(sentences, targets, max_len=None, word_dropout=0):
    """Pack sentences and their labels into a batch dictionary.

    Args:
        sentences: token sequences, forwarded to ``as_matrix``.
        targets: labels, one per sentence.
        max_len: forwarded to ``as_matrix`` as the column cap.
        word_dropout: accepted but currently not used.

    Returns:
        dict with key ``"text"`` (the output of ``as_matrix``) and key
        ``'target'`` (``targets`` converted with ``np.array``).
    """
    return {
        "text": as_matrix(sentences, max_len),
        'target': np.array(targets),
    }

In [None]:
# Sanity check: build a batch from the first 10 training examples, at most 10 columns wide.
make_batch(train_sentences[:10], train_targets[:10], max_len=10)

**Let's do the architecture**

In [None]:
import torch
from torch import nn

In [None]:
# Convolutional text classifier. This is an exercise template: the parts marked
# "YOUR CODE HERE" are intentionally left unfinished.
class ConvNet(nn.Module):
    # n_tokens defaults to the vocabulary size (len(tokens)).
    # emb_size / kernel_sizes are not used in the visible code yet — presumably
    # the embedding width and the convolution kernel widths; confirm when filling in.
    def __init__(self,n_tokens=len(tokens), emb_size=20, kernel_sizes=[3,4]):
        super().__init__()
        # The methods below read self.embeddings, self.conv_modules, self.drop,
        # self.linear and self.softmax, so they have to be created here.
        ### YOUR CODE HERE
    
    # Takes a batch dict with a 'text' matrix of token ids and returns the
    # output of self.linear.
    def forward(self,batch):
        # Look up the embeddings for the token ids in batch['text'].
        embeddings = self.embeddings(torch.LongTensor(batch['text']))
        embeddings = embeddings.transpose(1,2) # (batch_size, wordvec_size, sentence_length)
        
        # One entry per module in self.conv_modules is expected here; the
        # entries are concatenated along dim=1 below.
        feature_list = []
        for conv in self.conv_modules:
            ### YOUR CODE HERE
        ### YOUR CODE HERE
        features = torch.cat(feature_list, dim=1)
        features = self.drop(features)
        linear = self.linear(features)
        return linear
    
    # Applies self.softmax to the output of forward().
    def predict(self, batch):
        return self.softmax(self.forward(batch))

In [None]:
# Instantiate the network with its default constructor arguments.
model = ConvNet()

In [None]:
# Smoke test: run a forward pass on a batch built from the first 10 training examples.
model.forward(make_batch(train_sentences[:10], train_targets[:10], max_len=10))

In [None]:
# Adam over all model parameters; no hyperparameters are overridden.
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from tqdm import tqdm_notebook
import random
# Number of examples drawn per training step.
batch_size=25
# Indices of all training examples; mini-batches are sampled from this array.
dataset_arange = np.arange(len(train_sentences))
# Number of training steps.
num_iters = 2000

In [None]:
from torch.autograd import Variable

In [None]:
def _as_object_array(items):
    """Return a 1-D object array whose i-th element is items[i] itself."""
    out = np.empty(len(items), dtype=object)
    # Assign element by element so NumPy never tries to turn the nested token
    # lists into a multi-dimensional array.
    for position, item in enumerate(items):
        out[position] = item
    return out


# The sentences have different lengths, so np.array(train_sentences) raises
# ValueError on NumPy >= 1.24 (ragged input). An explicit object array keeps
# one token list per element and still supports indexing with an index array.
train_sentences = _as_object_array(train_sentences)
train_targets = np.array(train_targets)

In [None]:
# Number of training sentences.
train_sentences.shape[0]

In [None]:
# Loss applied to the output of model.forward and the integer class targets.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# The whole test set as a single batch (at most 10 columns wide), built once
# and reused for every evaluation during training.
test_batch = make_batch(test_sentences, test_targets, max_len=10)

In [None]:
# Train for num_iters steps; every 100 steps record the current train loss and
# the loss on the whole test batch, and every 500 steps print both.
losses_train = []
losses_test = []
for i in tqdm_notebook(range(num_iters)):
    
    # Back to training mode (the evaluation below switches to eval mode).
    model.train()
    optimizer.zero_grad()
    # Sample a mini-batch of indices (np.random.choice draws with replacement).
    index = np.random.choice(dataset_arange, size=batch_size)
    batch = make_batch(train_sentences[index], train_targets[index], max_len=10)
    ### YOUR CODE HERE
    loss.backward()
    if (i+10) % 100 == 0:
        losses_train.append(float(loss))
        # Evaluate in eval mode and without autograd: layers such as self.drop
        # must not perturb the test loss, and no graph needs to be built for
        # the whole test set.
        model.eval()
        with torch.no_grad():
            output = model.forward(test_batch)
            test_loss = loss_fn(output, torch.LongTensor(test_batch['target']))
        model.train()
        losses_test.append(float(test_loss))
    if (i+10) % 500 == 0:
        print("Train loss: ", losses_train[-1])
        print("Test loss: ", losses_test[-1])
    optimizer.step()

In [None]:
# Train and test loss at every recorded checkpoint (one point per 100 iterations).
# Labels and a legend make the two series distinguishable.
plt.scatter(np.arange(len(losses_train)), losses_train, label="train")
plt.scatter(np.arange(len(losses_test)), losses_test, label="test")
plt.xlabel("Checkpoint (every 100 iterations)")
plt.ylabel("Loss")
plt.legend();

In [None]:
# Switch to eval mode first so that train-only layers (such as self.drop) do
# not randomise the displayed predictions.
model.eval()
model.predict(make_batch(test_sentences[:10], test_targets[:10], max_len=10))

In [None]:
# Predict on the whole test set in eval mode and without building an autograd
# graph, so that train-only layers (such as self.drop) do not affect the result.
model.eval()
with torch.no_grad():
    predictions = model.predict(make_batch(test_sentences, test_targets, max_len=10))

In [None]:
# Turn the per-class scores into class ids: index of the largest score in each row.
# Note: this rebinds `predictions` from a tensor to an ndarray, so the cell
# cannot be run twice in a row.
class_scores = predictions.detach().numpy()
predictions = class_scores.argmax(axis=1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test examples whose predicted class equals the true label.
accuracy_score(test_targets, predictions)

**Well, this is bad: in the original paper about CNNs (https://arxiv.org/pdf/1408.5882.pdf) the author achieves nearly 80 percent accuracy.**

**Home task: achieve good accuracy with CNN models.**

## Homework, project part.

### I want you to choose one topic:
#### Each task should be implemented as a Python 3 class; you can choose all the other details yourself.

#### 1) Write a function that removes words one by one, measures how the metric changes after each removal, and marks the words accordingly.
#### 2) Finding inputs that maximize/minimize activation of some chosen neurons (_read more [on distill.pub](https://distill.pub/2018/building-blocks/)_)
#### 3) Building local linear approximations to your neural network: [article](https://arxiv.org/abs/1602.04938), [eli5 library](https://github.com/TeamHG-Memex/eli5/tree/master/eli5/formatters)

#### 4) Any article you would like from https://github.com/blackboxnlp/blackboxnlp.github.io/blob/master/program.md
#### 5) You can come up with your ideas too.

### Feel free to discuss your ideas or problems with me via email or telegram.

#### Almost all of this stuff already exists in code; you only need to understand it and adapt it to your needs.