# Text classification
## Recurrent Neural Networks
## LSTM
## Gated Recurrent Units
## biLSTM


## Embeddings

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim


In [47]:
cat_mat_embed = nn.Embedding(5,2)
cat_tensor = torch.LongTensor([1]) # position of word "cat" in dictionary
cat_mat_embed.forward(cat_tensor)


tensor([[ 0.0719, -1.2779]], grad_fn=<EmbeddingBackward0>)

## Tweets Sentiment Analysis

In [48]:
import pandas as pd

In [49]:
tweetsDF = pd.read_csv("tweet-data/training.1600000.processed.noemoticon.csv", engine="python", encoding="latin-1", header=None)

In [50]:
tweetsDF.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [51]:
tweetsDF[0].value_counts()

0
0    800000
4    800000
Name: count, dtype: int64

In [52]:
tweetsDF["sentiment_cat"] = tweetsDF[0].astype('category')

In [53]:
tweetsDF["sentiment"] = tweetsDF["sentiment_cat"].cat.codes # to make class e.g. (0,4,5) to (0,1,2)

In [54]:
tweetsDF.to_csv("tweet-data/train-processed.csv", header=None, index=None)


In [55]:
tweetsDF.sample(10000).to_csv("tweet-data/train-processed-sample.csv", header=None, index=None)


### torchtext

In [56]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

COLS = ["score", "id", "date", "query", "name", "tweet", "category", "label"]
df = pd.read_csv("tweet-data/train-processed.csv", header=None, names=COLS, encoding_errors="ignore")
df = df[["tweet", "label"]].dropna().reset_index(drop=True)

In [66]:
tokenizer = get_tokenizer('spacy')

class TweetDS(Dataset):
    def __init__(self, frame):
        self.frame = frame.reset_index(drop=True)
    def __len__(self):
        return len(self.frame)
    def __getitem__(self, idx):
        row = self.frame.iloc[idx]
        return str(row["tweet"]), int(row["label"])
        
full_ds = TweetDS(df)
N = len(full_ds)
n_train = int(0.8 * N)
n_valid = int(0.1 * N)
n_test = N - n_train - n_valid
train_ds, valid_ds, test_ds = random_split(full_ds, [n_train, n_valid, n_test])
len(train_ds), len(valid_ds), len(test_ds)





(1280000, 160000, 160000)

In [77]:
# tok("Hello World!") => ["hello", "world", "!"]
def tok(text):
    return [t.lower() for t in tokenizer(text)]
def yield_tokens(ds):
    for text, label in ds:
        yield tok(text)
# vocab: {'<unk>':0, '<pad>':1, 'hello':2, 'world':3, '!':4, 'i':5, 'am':6, 'hungry':7, ...}
vocab = build_vocab_from_iterator(yield_tokens(train_ds), specials=["<unk>", "<pad>"], max_tokens=20002)
vocab.set_default_index(vocab["<unk>"])


In [80]:
# text_pipeline("Hello World!") => [2,3,5] (index in dict)
def text_pipeline(x: str):
    return [vocab[token] for token in tok(x)]

# batch: [ ("Hello World!", 1), ("I am hungry", 0) ]
def collate(batch):
    xs, ys = zip(*batch) # xs: ("Hello World!", "I am hungry"); ys: (1,0); 
    xs = [torch.tensor(text_pipeline(x), dtype=torch.long) for x in xs] # [(2,4), (3,5,6)], 
    xpad = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=vocab["<pad>"]) # [[2,4,1],[3,5,6]] with 1=<pad>
    y = torch.tensor(ys, dtype=torch.long).view(-1)
    return xpad.to(device), y.to(device)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate)
valid_loader = DataLoader(valid_ds, batch_size=32, shuffle=False, collate_fn=collate)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate)

In [81]:
print("Vocab size:", len(vocab))

Vocab size: 20002


In [82]:
# Most common words
from collections import Counter 
counter = Counter()
for text, _ in train_ds:
    counter.update(tok(text))
print(counter.most_common(10))

[('i', 797378), ('!', 723441), ('.', 646538), (' ', 469854), ('to', 452934), ('the', 417901), (',', 386468), ('a', 304651), ('my', 252921), ('it', 242787)]


## Creating our model

In [83]:
# Classifying tweets
class OurFirstLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # [seq_len, vocab_size] => [seq_len, embedding_dim]
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.predictor = nn.Linear(hidden_size, 2)

    def forward(self, seq):
        output, (hidden, _) = self.encoder(self.embedding(seq)) # hidden: [1,batch,hidden]
        preds = self.predictor(hidden.squeeze(0)) # [hidden,] => [batch,2]
        return preds

device = "cuda" if torch.cuda.is_available() else "cpu"
model = OurFirstLSTM(100, 300, 20002)
model.to(device)

OurFirstLSTM(
  (embedding): Embedding(20002, 300)
  (encoder): LSTM(300, 100, batch_first=True)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

## Notice
### Binary: BCEWithLogits => our Model do NOT have sigmoid (it's inside Loss Function already)
### Binary: BCELoss => our Model DO HAVE sigmoid
### Multiclass: CrossEntropyLoss => our Model do NOT have log-softmax (it's inside the Loss Function already)
### Multiclass: NLLLoss => our Model MUST HAVE log-softmax

In [84]:
optimizer = optim.Adam(model.parameters(), lr=2e-2)
criterion = nn.CrossEntropyLoss()

def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
    for epoch in range(1, epochs+1):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch_idx, (features, label) in enumerate(train_iterator):
            optimizer.zero_grad()
            predict = model(features)
            loss = criterion(predict, label)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item() * label.size(0)
        training_loss /= len(train_iterator.dataset)

        model.eval()
        with torch.no_grad():
            for batch_idx, (features, label) in enumerate(valid_iterator):
                predict = model(features)
                loss = criterion(predict, label)
                valid_loss += loss.data.item() * features.size(0)
        valid_loss /= len(valid_iterator.dataset)

        print("Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}".format(epoch, training_loss, valid_loss))
        

In [85]:
def classify_tweet(tweet):
    model.eval()
    with torch.no_grad():
        ids = torch.tensor([text_pipeline(tweet)], dtype=torch.long, device=device)
        logits = model(ids)
        pred = int(torch.argmax(logits, dim=1).item())
    categories = {0: "Negative", 1: "Positive"}
    return categories[pred]

In [86]:
train(5, model, optimizer, criterion, train_loader, valid_loader)

Epoch: 1, Training Loss: 0.66, Validation Loss: 0.65
Epoch: 2, Training Loss: 0.66, Validation Loss: 0.65
Epoch: 3, Training Loss: 0.66, Validation Loss: 0.67
Epoch: 4, Training Loss: 0.67, Validation Loss: 0.65
Epoch: 5, Training Loss: 0.67, Validation Loss: 0.66


In [87]:
print(classify_tweet("I love this!"))

Positive


In [88]:
print(classify_tweet("This is terrible"))

Negative
