# RNN, Transformer & NLP

- Basics: word embeddings

    - Word2Vec, FastText, GloVe

- Sequence-to-sequence and autoregressive models

    - Classifying Names with a Character-Level RNN
    - Generating Names with a Character-Level RNN

- Self-attention and transformer models
    - Translation with a Sequence to Sequence Network and Attention
- Vision Transformers

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from collections import Counter
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from IPython.display import clear_output
import matplotlib.pyplot as plt
import numpy as np

## 1. Basics

### 1.1. Text preprocessing pipelines

- **分词 (Tokenization)**: 将文本拆分成单词或"词元"(**tokens**) 
- **子词分词器 (Subword tokenizers)**: 以子词单位进行灵活切分, 如 "unbelievability" 可切成 un、believ、abil、ity
- **词干提取 (Stemming)**: 对单词进行简单的还原到词干, 例如将 "the meeting" 变成 "the meet"
- **词形还原 (Lemmatization)**: NLP-based reduction, e.g. distinguishes between nouns and verbs (区分名词和动词等词性)
- **去除停用词 (Discard stop words)**: 去掉如 "the"、"an" 等常见但无实际意义的词
- **Useful libraries**: [nltk](https://www.nltk.org/), [spaCy](https://spacy.io/), [gensim](https://radimrehurek.com/gensim/), [HuggingFace tokenizers](https://huggingface.co/docs/tokenizers/index),...


### 1.2. Bag-of-words representation
- First, build a **vocabulary** of all occurring words, mapping every word to an index.
- Represent each document as an $N$ dimensional vector (top-$N$ most frequent words)
    - One-hot (sparse) encoding: 1 if the word occurs in the document
- Destroys the order of the words in the text (hence, a 'bag' of words)

<img src="https://raw.githubusercontent.com/ML-course/master/master/notebooks/images/bag_of_words.png" alt="ml" style="width: 60%"/>

#### 1.2.1. Neural networks on bag of words
- We can build neural networks on bag-of-words vectors
    - Do a one-hot-encoding with 10000 most frequent words
    - Simple model with 2 dense layers, ReLU activation, dropout
    - Using **IMDB dataset** of movie reviews (label is `'positive'` or `'negative'`)
    
``` python
self.model = nn.Sequential(
    nn.Linear(10000, 16),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(16, 16),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(16, 1)
)
```

In [None]:
# Fetch NLTK's 'stopwords' resource, then build the set of English stop words.
import nltk
nltk.download('stopwords')
# The corpus reader can only be used once the resource above is available locally.
from nltk.corpus import stopwords
# Stored as a set so membership tests during token filtering are cheap.
stop_words = set(stopwords.words('english'))


In [2]:
import string
# Use the nltk library to remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stored as a set so membership tests during token filtering are cheap.
stop_words = set(stopwords.words('english'))


# Load the IMDB dataset
# Root directory of the raw reviews, given relative to the working directory.
data_dir = '../Data/IMDB'

def read_imdb(data_dir, set_type='train'):
    """Load the raw IMDB reviews of one split from disk.

    The expected layout is ``<data_dir>/<set_type>/<pos|neg>/*.txt`` with one
    review per file; files without a ``.txt`` suffix are ignored.

    Args:
        data_dir: Root directory of the dataset.
        set_type: Name of the split sub-directory, e.g. ``'train'`` or
            ``'test'``.

    Returns:
        A ``(texts, labels)`` pair of equally long lists. The positive
        reviews (label 1) come first, followed by the negative ones
        (label 0); within each class files are read in sorted filename
        order.

    Raises:
        OSError: If a class directory or review file cannot be read.
    """
    texts = []
    labels = []
    for label_type in ['pos', 'neg']:
        label = 1 if label_type == 'pos' else 0
        dir_path = os.path.join(data_dir, set_type, label_type)
        # os.listdir yields entries in arbitrary, platform-dependent order;
        # sorting makes the loaded dataset reproducible across runs.
        for filename in sorted(os.listdir(dir_path)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label)
    return texts, labels

# read_imdb has no default for its first parameter, so the dataset root must
# be passed explicitly for both splits.
train_data, train_label = read_imdb(data_dir, set_type='train')
test_data, test_label = read_imdb(data_dir, set_type='test')

# Build the vocabulary

def build_vocab(max_vocab_size=None, train_text=None, ignored_words=None):
    """Return the most frequent tokens of a corpus, most frequent first.

    Each text is lower-cased, stripped of ASCII punctuation and split on
    whitespace; tokens found in ``ignored_words`` are not counted.

    Args:
        max_vocab_size: Maximum number of words to keep; ``None`` keeps every
            counted word.
        train_text: Iterable of raw text strings. When ``None``, the
            module-level ``train_data`` is used, looked up at call time.
        ignored_words: Container of tokens to skip. When ``None``, the
            module-level ``stop_words`` set is used.

    Returns:
        A list of words ordered by descending frequency.
    """
    # Resolve the fallbacks at call time: a default bound in the signature
    # would be frozen to whatever the global held when the function was
    # defined (and would fail if the data had not been loaded yet).
    if train_text is None:
        train_text = train_data
    if ignored_words is None:
        ignored_words = stop_words

    # The translation table is loop-invariant, so build it once.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    counter = Counter()  # word -> number of occurrences
    for text in train_text:
        tokens = text.lower().translate(strip_punctuation).split()
        counter.update(token for token in tokens if token not in ignored_words)
    return [word for word, _ in counter.most_common(max_vocab_size)]


# Keep at most the 10,000 most frequent non-stop-word tokens of the training reviews.
vocab = build_vocab(max_vocab_size=10000)

[nltk_data] Error loading stopwords: Remote end closed connection
[nltk_data]     without response


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\86188/nltk_data'
    - 'd:\\Anaconda\\envs\\dsml\\nltk_data'
    - 'd:\\Anaconda\\envs\\dsml\\share\\nltk_data'
    - 'd:\\Anaconda\\envs\\dsml\\lib\\nltk_data'
    - 'C:\\Users\\86188\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# Load data with top 10,000 words
# NOTE(review): `imdb` is not imported anywhere in the visible notebook. The
# call shape matches `keras.datasets.imdb.load_data` — confirm and add the
# import to the setup cell. This statement also rebinds `train_data` and
# `test_data`, which an earlier cell filled with raw review strings.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Turn sequences of word indices into multi-hot bag-of-words rows
def vectorize_sequences(sequences, dimension=10000):
    """Encode index sequences as a dense 0/1 matrix.

    Args:
        sequences: Sized iterable whose items are collections of integer
            word indices, each index smaller than ``dimension``.
        dimension: Width of the output rows.

    Returns:
        A float32 array of shape ``(len(sequences), dimension)`` in which
        entry ``[i, j]`` is 1.0 when index ``j`` occurs in ``sequences[i]``
        and 0.0 otherwise. Repeated indices still yield 1.0.
    """
    n_rows = len(sequences)
    encoded = np.zeros((n_rows, dimension), dtype=np.float32)
    for row, word_indices in enumerate(sequences):
        # Fancy-index assignment marks every listed column of this row at once.
        encoded[row, word_indices] = 1.0
    return encoded

# Encode the reviews as bag-of-words matrices and cast the labels to float32
x_train, x_test = (
    vectorize_sequences(split) for split in (train_data, test_data)
)
y_train, y_test = (
    np.asarray(split_labels).astype('float32')
    for split_labels in (train_labels, test_labels)
)

class IMDBVectorizedDataset(Dataset):
    """Map-style dataset pairing feature rows with their labels.

    Both inputs are converted to float32 tensors once, at construction time,
    and exposed as the ``x`` and ``y`` attributes.
    """

    def __init__(self, features, labels):
        as_float = {"dtype": torch.float32}
        self.x = torch.tensor(features, **as_float)
        self.y = torch.tensor(labels, **as_float)

    def __len__(self):
        # The number of samples is the length of the leading feature axis.
        return len(self.x)

    def __getitem__(self, idx):
        sample, target = self.x[idx], self.y[idx]
        return sample, target
    
# Keras-style hold-out: the first 10k training samples become the validation
# set, the remainder is what the model is actually trained on
x_val = x_train[:10000]
x_partial_train = x_train[10000:]
y_val = y_train[:10000]
y_partial_train = y_train[10000:]

# Wrap each split in a tensor-backed dataset
train_dataset = IMDBVectorizedDataset(x_partial_train, y_partial_train)
val_dataset = IMDBVectorizedDataset(x_val, y_val)
test_dataset = IMDBVectorizedDataset(x_test, y_test)

# Only the training loader shuffles; evaluation loaders keep dataset order
train_loader = DataLoader(
    train_dataset,
    batch_size=512,
    shuffle=True,
)
val_loader = DataLoader(val_dataset, batch_size=512)
test_loader = DataLoader(test_dataset, batch_size=512)

class LivePlotCallback(pl.Callback):
    """Callback that redraws loss and accuracy curves at the end of each epoch.

    The per-epoch histories of the ``train_loss``, ``train_acc``, ``val_loss``
    and ``val_acc`` metrics are kept on the instance, together with the best
    validation accuracy seen so far (``max_acc``).
    """

    def __init__(self):
        self.train_losses, self.train_accs = [], []
        self.val_losses, self.val_accs = [], []
        self.max_acc = 0

    def on_train_epoch_end(self, trainer, pl_module):
        """Record this epoch's metrics and refresh the plot."""
        logged = trainer.callback_metrics
        names = ("train_loss", "train_acc", "val_loss", "val_acc")
        tensors = [logged.get(name) for name in names]

        # Record nothing unless all four metrics were logged for this epoch.
        if any(t is None for t in tensors):
            return

        train_loss, train_acc, val_loss, val_acc = (t.item() for t in tensors)
        self.train_losses.append(train_loss)
        self.train_accs.append(train_acc)
        self.val_losses.append(val_loss)
        self.val_accs.append(val_acc)
        self.max_acc = max(self.max_acc, val_acc)

        # A curve needs at least two recorded epochs.
        if len(self.train_losses) <= 1:
            return

        clear_output(wait=True)
        epochs = np.arange(0, len(self.train_losses))
        plt.figure(figsize=(10, 4))
        # (history, legend label, colour, extra line options); validation
        # curves are dotted, training curves use the default solid line.
        curves = (
            (self.train_losses, 'train_loss', 'r', {}),
            (self.train_accs, 'train_acc', 'b', {}),
            (self.val_losses, 'val_loss', 'r', {"linestyle": ":"}),
            (self.val_accs, 'val_acc', 'b', {"linestyle": ":"}),
        )
        for history, label, colour, extra in curves:
            plt.plot(epochs, history, label=label, lw=2, c=colour, **extra)
        plt.title(f"Training Loss and Accuracy [Max Val Acc: {self.max_acc:.4f}]", fontsize=12)
        plt.xlabel("Epoch", fontsize=12)
        plt.ylabel("Loss / Accuracy", fontsize=12)
        plt.tick_params(axis='both', labelsize=12)
        plt.legend(fontsize=12)
        plt.grid(True)
        plt.show()
            
class IMDBClassifier(pl.LightningModule):
    """Binary classifier over 10,000-dimensional bag-of-words vectors.

    Architecture: Linear(10000, 16) -> ReLU -> Dropout(0.5) ->
    Linear(16, 16) -> ReLU -> Dropout(0.5) -> Linear(16, 1) -> sigmoid.
    Trained with binary cross-entropy and RMSprop at its default settings.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10000, 16)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(16, 16)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, x):
        """Return one probability in (0, 1) per input row.

        Args:
            x: Float tensor whose last dimension has size 10000.

        Returns:
            The sigmoid outputs with the trailing size-1 axis removed.
        """
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.sigmoid(self.fc3(x))
        # Remove only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis for a batch of one, producing a 0-d tensor
        # whose shape no longer matches the (1,) target in the loss.
        return x.squeeze(-1)

    def _loss_and_accuracy(self, batch):
        """Compute BCE loss and 0.5-threshold accuracy for one batch."""
        x, y = batch
        y_hat = self(x)
        loss = F.binary_cross_entropy(y_hat, y)
        acc = ((y_hat > 0.5) == y.bool()).float().mean()
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log epoch-level training loss/accuracy and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log epoch-level validation loss/accuracy."""
        val_loss, val_acc = self._loss_and_accuracy(batch)
        self.log("val_loss", val_loss, on_epoch=True, prog_bar=True)
        self.log("val_acc", val_acc, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """Use RMSprop with PyTorch's default hyper-parameters."""
        return torch.optim.RMSprop(self.parameters())
    
# Train for 15 epochs with live plotting; logging and checkpointing are disabled
model = IMDBClassifier()
trainer = pl.Trainer(
    max_epochs=15,
    callbacks=[LivePlotCallback()],
    logger=False,
    enable_checkpointing=False,
)
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)
