### 作業目的: 熟練自定義collate_fn與sampler進行資料讀取

本此作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集利用Pytorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') #下載stopwords
nltk.download('punkt') #下載word_tokenize需要的corpus

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')

Mounted at /content/gdrive


### 探索資料與資料前處理
這份作業我們使用test資料中的pos與neg


In [3]:
# 讀取字典，這份字典為review內所有出現的字詞
df_vocab = pd.read_csv('./gdrive/My Drive/nlp2_colab/data/aclImdb/imdb.vocab', names=['voc'], encoding="latin")
vocab = df_vocab.voc.tolist()

# 以nltk stopwords移除贅字，過多的贅字無法提供有用的訊息，也可能影響模型的訓練
print(f"vocab length before removing stopwords: {len(vocab)}")
vocab=[word for word in vocab if not word in set(stopwords.words('english'))]
print(f"vocab length after removing stopwords: {len(vocab)}")

# 將字典轉換成dictionary
vocab_list = list(set(vocab))
vocab_list = [x for x in vocab_list if str(x) != 'nan']
vocab_dic = {}
for idx, word in enumerate(vocab_list):
  vocab_dic[word] = idx

vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89356


In [4]:
# 將資料打包成(x, y)配對，其中x為review的檔案路徑，y為正評(1)或負評(0)
# 這裡將x以檔案路徑代表的原因是讓同學練習不一次將資料全讀取進來，若電腦記憶體夠大(所有資料檔案沒有很大)
# 可以將資料全一次讀取，可以減少在訓練時I/O時間，增加訓練速度
path = './gdrive/My Drive/nlp2_colab/data/aclImdb/train/'
review_pos = glob.glob(path + "pos/*.txt")
review_neg = glob.glob(path + "neg/*.txt")
review_all = review_pos + review_neg
y = [1]*len(review_pos) + [0]*len(review_neg)

review_pairs = list(zip(review_all, y))
print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('./gdrive/My Drive/nlp2_colab/data/aclImdb/train/pos/11414_9.txt', 1), ('./gdrive/My Drive/nlp2_colab/data/aclImdb/train/pos/11609_10.txt', 1)]
Total reviews: 25000


### 建立Dataset, DataLoader, Sampler與Collate_fn讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量函式
(generate_vec)，注意這裡我們用來產生詞向量的方法是單純將文字tokenize(為了使產生的文本長度不同，而不使用BoW)

In [5]:
def load_review(review_path):
    
  with open(review_path, 'r') as f:
    review = f.read()
        
  #移除non-alphabet符號、贅字與tokenize
  review = re.sub('[^a-zA-Z]',' ',review)
  review = nltk.word_tokenize(review)
  review = list(set(review).difference(set(stopwords.words('english'))))
  
  return review
  
def generate_vec(review, vocab_dic):
  doc_vec = []
  for word in review:
    if vocab_dic.get(word):
      doc_vec.append(vocab_dic.get(word))
            
  return torch.tensor(doc_vec)

In [6]:
#建立客製化dataset

class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    def __init__(self, data_dir, vocab):
      self.data_dir = data_dir
      self.vocab = vocab

    def __len__(self):
      return len(self.data_dir)

    def __getitem__(self, idx):
      pair = self.data_dir[idx]
      review = pair[0]
      review = load_review(review)
      review = generate_vec(review, self.vocab)
        
      return review, pair[1]  

#建立客製化collate_fn，將長度不一的文本pad 0 變成相同長度
def collate_fn(batch):

  corpus, labels = zip(*batch) 
    
  ### create pads for corpus ###
  lengths = [len(x) for x in corpus]
  max_length = max(lengths)
    
  batch_corpus = []
    
  for i in range(len(corpus)):
    # pad corpus
    tmp_pads = torch.zeros(max_length)
    tmp_pads[:lengths[i]] = corpus[i]
    tmp_pads.view(-1, 1)
    batch_corpus.append(tmp_pads.view(1,-1))

  return torch.cat(batch_corpus,dim=0), torch.tensor(labels) , torch.tensor(lengths)

In [7]:
# 使用Pytorch的RandomSampler來進行indice讀取並建立dataloader
custom_dst = dataset(review_pairs, vocab_dic)
custom_dataloader = DataLoader(dataset=custom_dst, batch_size=4, shuffle=True, collate_fn=collate_fn)
next(iter(custom_dataloader))

(tensor([[16510., 70393.,  3910., 23364., 25536., 75150., 29338., 14753., 47747.,
          62957., 30014., 80573., 10593.,  1825., 79515., 10435., 32776., 36519.,
          41612., 85239., 27325., 76288., 61755., 77846., 72604., 49511., 30070.,
          39989., 35338., 88914., 18731., 75954., 89069., 61045., 10312., 65968.,
           8035., 65633., 85940.,  3460., 39830., 81911.,  4903., 73164., 87549.,
          27903., 66847., 68762., 16436., 33201.,  7130.,  5955.,  8604., 46500.,
          11386., 72019., 28425., 34523., 17049., 45585., 51196., 63782., 67075.,
          80352., 77760., 16287., 73197., 80162., 38852., 76388., 77422., 10398.,
            691.,     0.,     0.,     0.,     0.,     0.,     0.,     0.],
         [44233., 50721., 16903., 48090., 75150.,  3196., 44432., 48447.,  4299.,
          21009., 36184., 34948., 65417., 36519., 12499., 19252., 64602., 20318.,
          85407., 66270., 49307., 86429., 41613., 76288., 43934., 20505., 72604.,
          16393., 21862