# Text classification
## Recurrent Neural Networks
## LSTM
## Gated Recurrent Units
## biLSTM


## Embeddings

In [1]:
import torch
import torch.nn as nn


In [2]:
# Toy embedding table: a 5-word dictionary, each word mapped to a 2-d vector.
cat_mat_embed = nn.Embedding(num_embeddings=5, embedding_dim=2)
# Position of the word "cat" in the dictionary (embedding lookups need int64 indices).
cat_tensor = torch.tensor([1], dtype=torch.long)
# Call the module itself rather than .forward() so registered hooks are honoured.
cat_mat_embed(cat_tensor)


tensor([[ 2.2104, -0.4286]], grad_fn=<EmbeddingBackward0>)

## Tweets

In [3]:
import pandas as pd

In [4]:
# Raw tweet dump: the file has no header row (columns are numbered 0..5)
# and is read as latin-1.
tweetsDF = pd.read_csv(
    "tweet-data/training.1600000.processed.noemoticon.csv",
    header=None,
    encoding="latin-1",
    engine="python",
)

In [7]:
# Peek at the first five rows of the raw frame.
tweetsDF.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# How many rows carry each distinct value of column 0.
tweetsDF.loc[:, 0].value_counts()

0
0    800000
4    800000
Name: count, dtype: int64

In [9]:
# Categorical view of column 0; its integer codes are derived from this column next.
tweetsDF["sentiment_cat"] = tweetsDF.loc[:, 0].astype("category")

In [10]:
# Replace arbitrary class values with consecutive integer codes,
# e.g. classes (0, 4, 5) become (0, 1, 2).
sentiment_categories = tweetsDF["sentiment_cat"]
tweetsDF["sentiment"] = sentiment_categories.cat.codes

In [11]:
# Persist the full frame (including the two new sentiment columns)
# without a header row and without the index column.
tweetsDF.to_csv(
    "tweet-data/train-processed.csv",
    header=False,
    index=False,
)


In [13]:
# Small 10,000-row sample for quick experiments. A fixed random_state makes the
# sample identical on every Restart & Run All; without it the file changes per run.
tweetsDF.sample(10000, random_state=42).to_csv("tweet-data/train-processed-sample.csv", header=None, index=None)


### torchtext

In [27]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Column names for the header-less processed CSV.
COLS = [
    "score", "id", "date", "query",
    "name", "tweet", "category", "label",
]

# Load the processed file, keep only the text and its label, drop rows
# where either is missing, and re-number the remaining rows.
df = (
    pd.read_csv(
        "tweet-data/train-processed.csv",
        header=None,
        names=COLS,
        encoding_errors="ignore",
    )
    .loc[:, ["tweet", "label"]]
    .dropna()
    .reset_index(drop=True)
)



In [30]:
# spaCy-backed tokenizer obtained through torchtext; used by tok() below.
tokenizer = get_tokenizer("spacy")

class TweetDS(Dataset):
    """Map-style dataset over a DataFrame with ``tweet`` and ``label`` columns.

    Each item is a ``(label, tweet)`` tuple: the label cast to ``int`` and
    the tweet cast to ``str``.
    """

    def __init__(self, frame):
        # Re-number rows 0..n-1 so positional lookups are independent of
        # whatever index the caller's frame carried.
        self.frame = frame.reset_index(drop=True)

    def __len__(self):
        return self.frame.shape[0]

    def __getitem__(self, idx):
        record = self.frame.iloc[idx]
        label = int(record["label"])
        tweet = str(record["tweet"])
        return label, tweet
        
# 80 / 10 / 10 train / validation / test split over the whole dataset.
full_ds = TweetDS(df)
N = len(full_ds)
n_train = int(0.8 * N)
n_valid = int(0.1 * N)
n_test = N - n_train - n_valid  # remainder, so the three sizes always sum to N

# Seed the split explicitly: the vocabulary is built from train_ds, so an
# unseeded split would change both the partitions and the vocab on every run.
split_generator = torch.Generator().manual_seed(42)
train_ds, valid_ds, test_ds = random_split(
    full_ds, [n_train, n_valid, n_test], generator=split_generator
)
len(train_ds), len(valid_ds), len(test_ds)



(1280000, 160000, 160000)

In [36]:
def tok(text):
    """Tokenize ``text`` with the module-level tokenizer and lowercase each token.

    Example: tok("Hello World!") => ["hello", "world", "!"]
    """
    lowered = []
    for piece in tokenizer(text):
        lowered.append(piece.lower())
    return lowered
def yield_tokens(ds):
    """Lazily yield the token list of every ``(label, text)`` sample in ``ds``."""
    yield from (tok(text) for _label, text in ds)
# vocab: {'<unk>':0, '<pad>':1, 'hello':2, 'world':3, '!':4, 'i':5, 'am':6, 'hungry':7, ...}
# Built from the training split only; max_tokens caps the size, specials included.
vocab = build_vocab_from_iterator(yield_tokens(train_ds), specials=["<unk>", "<pad>"], max_tokens=20002)
# Map every out-of-vocabulary token to <unk>. Without a default index,
# vocab[token] raises for any token that did not make it into the vocabulary.
vocab.set_default_index(vocab["<unk>"])


In [37]:
def text_pipeline(x: str):
    """Encode raw text as a list of vocabulary indices.

    Example, using the toy vocabulary {'hello':2, 'world':3, '!':4}:
    text_pipeline("Hello World!") => [2, 3, 4]
    """
    return vocab.lookup_indices(tok(x))

def collate(batch):
    """Collate ``(label, text)`` samples into a padded index tensor and a label tensor.

    Example batch: [(1, "Hello World!"), (0, "I am hungry")]
    Each text is encoded with text_pipeline, then shorter sequences are
    right-padded with the <pad> index so all rows share one length,
    e.g. [2, 3, 4] and [5, 6] become [[2, 3, 4], [5, 6, 1]] with 1 = <pad>.

    Returns ``(xpad, y)``, both moved to ``device``.
    NOTE(review): ``device`` is not defined in the cells shown here; it must
    exist as a global before any loader using this function is iterated.
    """
    labels = [label for label, _ in batch]
    encoded = [
        torch.tensor(text_pipeline(text), dtype=torch.long)
        for _, text in batch
    ]
    pad_index = vocab["<pad>"]
    padded = nn.utils.rnn.pad_sequence(encoded, batch_first=True, padding_value=pad_index)
    targets = torch.tensor(labels, dtype=torch.long)
    return padded.to(device), targets.to(device)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    collate_fn=collate,
)
valid_loader, test_loader = (
    DataLoader(held_out, batch_size=32, shuffle=False, collate_fn=collate)
    for held_out in (valid_ds, test_ds)
)

In [38]:
# Sanity check: number of entries in the vocabulary (specials included).
print("Vocab size:", len(vocab))

Vocab size: 20002


In [40]:
# Most common words
from collections import Counter 
# Count every lowercased token across the training split in a single pass,
# then show the ten most frequent.
counter = Counter(
    token
    for _label, text in train_ds
    for token in tok(text)
)
print(counter.most_common(10))

[('i', 797730), ('!', 723285), ('.', 646865), (' ', 470008), ('to', 452260), ('the', 417411), (',', 386025), ('a', 304317), ('my', 252864), ('it', 242787)]
