### Dataset
data目录下有两种数据集:

glue-sst2: 给定一个句子，判定该句子的情感是 positive 还是 negative

wikitext: 维基百科（Wikipedia）中的优质文章

### wikitext in LSTM

In [None]:
# The code was uploaded to Google Drive: mount the drive, then cd into the
# project directory and list its contents.
from google.colab import drive
drive.mount('/content/drive')
import os

%cd /content/drive/MyDrive/nlp-project/word_language_model/
%ll

Mounted at /content/drive
/content/drive/MyDrive/nlp-project/word_language_model
total 55258
drwx------ 2 root     4096 Jan  7 13:44 data/
-rw------- 1 root     2829 Jan 14 03:41 data.py
-rw------- 1 root   600536 Jan 14 10:51 id_mapping.pkl
-rw------- 1 root    10439 Jan 14 10:51 main.py
-rw------- 1 root 55955216 Jan 14 03:58 model.pt
-rw------- 1 root     2839 Jul 15  2022 model.py
drwx------ 2 root     4096 Jan  7 14:05 __pycache__/
-rw------- 1 root     2094 Jul 15  2022 README.md


#### 训练

In [None]:
# Train with the default parameters; only the data path, the CUDA flag and the
# number of epochs are passed on the command line.
%pwd
!python main.py --data './data/wikitext' --cuda --epochs 20

./data/wikitext/train.txt
| epoch   1 |   200/ 2983 batches | lr 20.00 | ms/batch 13.79 | loss  7.64 | ppl  2070.90
| epoch   1 |   400/ 2983 batches | lr 20.00 | ms/batch  7.77 | loss  6.86 | ppl   950.48
| epoch   1 |   600/ 2983 batches | lr 20.00 | ms/batch  7.71 | loss  6.49 | ppl   661.52
| epoch   1 |   800/ 2983 batches | lr 20.00 | ms/batch  7.73 | loss  6.30 | ppl   546.19
| epoch   1 |  1000/ 2983 batches | lr 20.00 | ms/batch  7.71 | loss  6.15 | ppl   468.81
| epoch   1 |  1200/ 2983 batches | lr 20.00 | ms/batch  7.75 | loss  6.07 | ppl   431.42
| epoch   1 |  1400/ 2983 batches | lr 20.00 | ms/batch  7.77 | loss  5.96 | ppl   387.33
| epoch   1 |  1600/ 2983 batches | lr 20.00 | ms/batch  7.74 | loss  5.96 | ppl   386.70
| epoch   1 |  1800/ 2983 batches | lr 20.00 | ms/batch  7.75 | loss  5.81 | ppl   332.83
| epoch   1 |  2000/ 2983 batches | lr 20.00 | ms/batch  7.75 | loss  5.78 | ppl   325.12
| epoch   1 |  2200/ 2983 batches | lr 20.00 | ms/batch  7.74 | loss  5.67

#### 测试模型的效果

In [2]:
# Mount the drive.
# The code was uploaded to Google Drive: mount the drive, then cd into the
# project directory and list its contents.
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/nlp-project/word_language_model/
%ll

Mounted at /content/drive
/content/drive/MyDrive/nlp-project/word_language_model
total 55258
drwx------ 2 root     4096 Jan  7 13:44 data/
-rw------- 1 root     2829 Jan 14 03:41 data.py
-rw------- 1 root   600536 Jan 14 10:53 id_mapping.pkl
-rw------- 1 root    10439 Jan 14 10:51 main.py
-rw------- 1 root 55955216 Jan 14 11:01 model.pt
-rw------- 1 root     2839 Jul 15  2022 model.py
drwx------ 2 root     4096 Jan  7 14:05 __pycache__/
-rw------- 1 root     2094 Jul 15  2022 README.md


In [11]:
# predict
import os
import pickle
import torch
import torch.nn.functional as F

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# cell also runs on runtimes without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    """Arrange a 1-D tensor into ``bsz`` columns and move it to ``device``.

    Trailing elements that do not fill a complete row across all ``bsz``
    columns are dropped. The remaining elements are laid out so that each
    column holds one contiguous slice of ``data``.

    Args:
        data: Tensor whose first dimension is split into columns.
        bsz: Number of columns (batch size).

    Returns:
        A contiguous tensor with ``bsz`` columns, placed on the module-level
        ``device``.
    """
    usable_len = (data.size(0) // bsz) * bsz
    # Keep only the prefix that divides evenly into bsz columns.
    trimmed = data.narrow(0, 0, usable_len)
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

def repackage_hidden(h):
    """Return a copy of the hidden state detached from its autograd history.

    A single tensor is detached directly; any other iterable is processed
    element by element (recursively) and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(repackage_hidden(item) for item in h)
    return h.detach()

def predict(input: str):
    """Print the most likely next words after the text in *input*.

    Loads the vocabulary object from ``./id_mapping.pkl`` and the trained
    model from ``model.pt`` (both relative to the current working directory),
    encodes the whitespace-separated words of *input* as vocabulary ids, runs
    the model once over the whole sequence and prints the words whose ids
    score highest at the last position, best first (up to 10 ids).

    Args:
        input: Prompt text. Words missing from the vocabulary are replaced by
            ``<unk>``. The parameter name shadows the builtin ``input`` but is
            kept so existing callers keep working.

    Returns:
        None. The predicted words are printed, one per line.
    """
    # NOTE(review): pickle.load and torch.load unpickle arbitrary objects;
    # only point them at files you produced yourself.
    with open('./id_mapping.pkl', 'rb') as f:
        dic = pickle.load(f)

    with open('model.pt', 'rb') as f:
        # map_location places the stored tensors directly on the target
        # device instead of the device they were saved from.
        model = torch.load(f, map_location=device)
    model.to(device)
    model.rnn.flatten_parameters()
    model.eval()

    # Encode the prompt. split() without an argument ignores repeated or
    # surrounding whitespace, so no empty "words" are turned into <unk>.
    batch_size = 1
    unk_id = dic.word2idx['<unk>']
    # A blank prompt degrades to a single <unk> token, as before.
    words = input.split() or ['<unk>']
    token_ids = [dic.word2idx.get(word, unk_id) for word in words]
    data = batchify(torch.tensor(token_ids), batch_size)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        hidden = model.init_hidden(batch_size)
        scores, _ = model(data, hidden)
        probs = F.softmax(scores, dim=1)

    # The last row holds the distribution for the word following the prompt.
    last_row = probs[-1, :]
    top_k = min(10, last_row.size(-1))
    _, top_indices = torch.topk(last_row, k=top_k)
    top_ids = top_indices.tolist()

    # Resolve ids back to words with a single pass over the vocabulary
    # instead of rescanning it for every predicted id.
    wanted = set(top_ids)
    words_by_id = {}
    for word, idx in dic.word2idx.items():
        if idx in wanted:
            words_by_id.setdefault(idx, []).append(word)

    for idx in top_ids:
        for word in words_by_id.get(idx, ()):
            print(word)

# Example: print the predicted next words for a short prompt.
# NOTE(review): the notebook-level name `input` shadows the builtin input()
# for every cell executed after this one.
input = "never too late to"
predict(input)

do
be
the
make
<unk>
play
reach
use
take
have


### glue-sst2 in LSTM

In [None]:
# A Tokenizer could be used to process the text.
# NOTE(review): this cell does not tokenize anything yet; it only creates and
# prints a random 3x5 tensor that tracks gradients.
import torch
input = torch.randn(3, 5, requires_grad=True)
print(input)

tensor([[ 1.1418, -0.0427,  0.3615, -0.1207, -1.2194],
        [-1.7356, -0.3106, -0.0264, -1.0470, -2.4602],
        [-1.5302, -0.4010,  0.0132, -1.1681,  0.2779]], requires_grad=True)
