### 作業目的: 熟練Pytorch Dataset與DataLoader進行資料讀取

本此作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集利用Pytorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') #下載stopwords
nltk.download('punkt') #下載word_tokenize需要的corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicole\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicole\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 探索資料與資料前處理
在train資料中，有分成pos(positive)與neg(negative)，分別為正評價與負評價，此評價即為label。

In [2]:
# 讀取字典，這份字典為review內所有出現的字詞
###<your code>###
with open('imdb.vocab','r',encoding='utf-8') as f:
    vocab=f.read()

vocab=vocab.strip()
# 以nltk stopwords移除贅字，過多的贅字無法提供有用的訊息，也可能影響模型的訓練
print(f"vocab length before removing stopwords: {len(vocab)}")
###<your code>###
vocab=list(set(vocab).difference(set(stopwords.words('english'))))
print(f"vocab length after removing stopwords: {len(vocab)}")

# # 將字典轉換成dictionary
# ### <your code>###
vocab_dic= dict(zip(vocab, range(len(vocab))))

vocab length before removing stopwords: 845294
vocab length after removing stopwords: 69


In [3]:
vocab_dic

{'ó': 0,
 'î': 1,
 'è': 2,
 'c': 3,
 'ã': 4,
 '8': 5,
 'k': 6,
 'r': 7,
 'â': 8,
 '(': 9,
 '}': 10,
 '?': 11,
 'ô': 12,
 'h': 13,
 ']': 14,
 'ß': 15,
 'x': 16,
 'à': 17,
 '½': 18,
 '!': 19,
 "'": 20,
 'û': 21,
 'ä': 22,
 'º': 23,
 'l': 24,
 '¾': 25,
 '\n': 26,
 'í': 27,
 'ü': 28,
 'ř': 29,
 'q': 30,
 'w': 31,
 'á': 32,
 'g': 33,
 '[': 34,
 'ï': 35,
 '=': 36,
 'ø': 37,
 'ı': 38,
 'ñ': 39,
 'ì': 40,
 'e': 41,
 'ö': 42,
 'č': 43,
 'p': 44,
 ';': 45,
 'ë': 46,
 'u': 47,
 'æ': 48,
 '-': 49,
 'ò': 50,
 'é': 51,
 'z': 52,
 'å': 53,
 'ê': 54,
 'ú': 55,
 'ù': 56,
 'n': 57,
 'b': 58,
 ':': 59,
 'õ': 60,
 'ç': 61,
 ')': 62,
 'v': 63,
 'f': 64,
 'ð': 65,
 'ý': 66,
 'j': 67,
 'ō': 68}

In [4]:
set(vocab)

{'\n',
 '!',
 "'",
 '(',
 ')',
 '-',
 '8',
 ':',
 ';',
 '=',
 '?',
 '[',
 ']',
 'b',
 'c',
 'e',
 'f',
 'g',
 'h',
 'j',
 'k',
 'l',
 'n',
 'p',
 'q',
 'r',
 'u',
 'v',
 'w',
 'x',
 'z',
 '}',
 'º',
 '½',
 '¾',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ð',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 'ø',
 'ù',
 'ú',
 'û',
 'ü',
 'ý',
 'č',
 'ı',
 'ō',
 'ř'}

In [22]:
vocab

['ó',
 'î',
 'è',
 'c',
 'ã',
 '8',
 'k',
 'r',
 'â',
 '(',
 '}',
 '?',
 'ô',
 'h',
 ']',
 'ß',
 'x',
 'à',
 '½',
 '!',
 "'",
 'û',
 'ä',
 'º',
 'l',
 '¾',
 '\n',
 'í',
 'ü',
 'ř',
 'q',
 'w',
 'á',
 'g',
 '[',
 'ï',
 '=',
 'ø',
 'ı',
 'ñ',
 'ì',
 'e',
 'ö',
 'č',
 'p',
 ';',
 'ë',
 'u',
 'æ',
 '-',
 'ò',
 'é',
 'z',
 'å',
 'ê',
 'ú',
 'ù',
 'n',
 'b',
 ':',
 'õ',
 'ç',
 ')',
 'v',
 'f',
 'ð',
 'ý',
 'j',
 'ō']

In [23]:
# 將資料打包成(x, y)配對，其中x為review的檔案路徑，y為正評(1)或負評(0)
# 這裡將x以檔案路徑代表的原因是讓同學練習不一次將資料全讀取進來，若電腦記憶體夠大(所有資料檔案沒有很大)
# 可以將資料全一次讀取，可以減少在訓練時I/O時間，增加訓練速度

y###<your code>###
review_pos=glob.glob("./aclImdb/train/pos/*.txt") #返回所有匹配的文件路徑表
review_neg=glob.glob("./aclImdb/test/neg/*.txt")
review_all=review_pos+[0]*len(review_neg)
y=[1]*len(review_pos)+[0]*len(review_neg)

review_pairs=list(zip(review_all,y))
print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('./aclImdb/train/pos\\0_9.txt', 1), ('./aclImdb/train/pos\\10000_8.txt', 1)]
Total reviews: 25000


In [18]:
review_all

['./aclImdb/train/pos\\0_9.txt',
 './aclImdb/train/pos\\10000_8.txt',
 './aclImdb/train/pos\\10001_10.txt',
 './aclImdb/train/pos\\10002_7.txt',
 './aclImdb/train/pos\\10003_8.txt',
 './aclImdb/train/pos\\10004_8.txt',
 './aclImdb/train/pos\\10005_7.txt',
 './aclImdb/train/pos\\10006_7.txt',
 './aclImdb/train/pos\\10007_7.txt',
 './aclImdb/train/pos\\10008_7.txt',
 './aclImdb/train/pos\\10009_9.txt',
 './aclImdb/train/pos\\1000_8.txt',
 './aclImdb/train/pos\\10010_7.txt',
 './aclImdb/train/pos\\10011_9.txt',
 './aclImdb/train/pos\\10012_8.txt',
 './aclImdb/train/pos\\10013_7.txt',
 './aclImdb/train/pos\\10014_8.txt',
 './aclImdb/train/pos\\10015_8.txt',
 './aclImdb/train/pos\\10016_8.txt',
 './aclImdb/train/pos\\10017_9.txt',
 './aclImdb/train/pos\\10018_8.txt',
 './aclImdb/train/pos\\10019_8.txt',
 './aclImdb/train/pos\\1001_8.txt',
 './aclImdb/train/pos\\10020_8.txt',
 './aclImdb/train/pos\\10021_8.txt',
 './aclImdb/train/pos\\10022_7.txt',
 './aclImdb/train/pos\\10023_9.txt',
 './ac

In [19]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [17]:
review_pairs

[('./aclImdb/train/pos\\0_9.txt', 1),
 ('./aclImdb/train/pos\\10000_8.txt', 1),
 ('./aclImdb/train/pos\\10001_10.txt', 1),
 ('./aclImdb/train/pos\\10002_7.txt', 1),
 ('./aclImdb/train/pos\\10003_8.txt', 1),
 ('./aclImdb/train/pos\\10004_8.txt', 1),
 ('./aclImdb/train/pos\\10005_7.txt', 1),
 ('./aclImdb/train/pos\\10006_7.txt', 1),
 ('./aclImdb/train/pos\\10007_7.txt', 1),
 ('./aclImdb/train/pos\\10008_7.txt', 1),
 ('./aclImdb/train/pos\\10009_9.txt', 1),
 ('./aclImdb/train/pos\\1000_8.txt', 1),
 ('./aclImdb/train/pos\\10010_7.txt', 1),
 ('./aclImdb/train/pos\\10011_9.txt', 1),
 ('./aclImdb/train/pos\\10012_8.txt', 1),
 ('./aclImdb/train/pos\\10013_7.txt', 1),
 ('./aclImdb/train/pos\\10014_8.txt', 1),
 ('./aclImdb/train/pos\\10015_8.txt', 1),
 ('./aclImdb/train/pos\\10016_8.txt', 1),
 ('./aclImdb/train/pos\\10017_9.txt', 1),
 ('./aclImdb/train/pos\\10018_8.txt', 1),
 ('./aclImdb/train/pos\\10019_8.txt', 1),
 ('./aclImdb/train/pos\\1001_8.txt', 1),
 ('./aclImdb/train/pos\\10020_8.txt', 1

In [8]:
review_neg

['./aclImdb/test/neg\\0_2.txt',
 './aclImdb/test/neg\\10000_4.txt',
 './aclImdb/test/neg\\10001_1.txt',
 './aclImdb/test/neg\\10002_3.txt',
 './aclImdb/test/neg\\10003_3.txt',
 './aclImdb/test/neg\\10004_2.txt',
 './aclImdb/test/neg\\10005_2.txt',
 './aclImdb/test/neg\\10006_2.txt',
 './aclImdb/test/neg\\10007_4.txt',
 './aclImdb/test/neg\\10008_4.txt',
 './aclImdb/test/neg\\10009_3.txt',
 './aclImdb/test/neg\\1000_3.txt',
 './aclImdb/test/neg\\10010_2.txt',
 './aclImdb/test/neg\\10011_1.txt',
 './aclImdb/test/neg\\10012_1.txt',
 './aclImdb/test/neg\\10013_4.txt',
 './aclImdb/test/neg\\10014_2.txt',
 './aclImdb/test/neg\\10015_4.txt',
 './aclImdb/test/neg\\10016_3.txt',
 './aclImdb/test/neg\\10017_1.txt',
 './aclImdb/test/neg\\10018_1.txt',
 './aclImdb/test/neg\\10019_1.txt',
 './aclImdb/test/neg\\1001_4.txt',
 './aclImdb/test/neg\\10020_1.txt',
 './aclImdb/test/neg\\10021_3.txt',
 './aclImdb/test/neg\\10022_4.txt',
 './aclImdb/test/neg\\10023_4.txt',
 './aclImdb/test/neg\\10024_3.txt'

### 建立Dataset與DataLoader讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量BoW的函式
(generate_bow)

In [43]:
def load_review(review_path):
    
    ###<your code>###
    with open(review_path, 'r') as f:
        review = f.read()
    #移除non-alphabet(非字母)符號、贅字與tokenize
    ###<your code>###
    review=re.sub('[^a-zA-Z]',' ',review)
    review=nltk.word_tokenize(review)
    review=list(set(review).difference(set(stopwords.words('english'))))
    
    
    return review

In [44]:
def generate_bow(review, vocab_dic):
    bag_vector = np.zeros(len(vocab_dic))
    for word in review:
        if vocab_dic.get(word):
            bag_vector[vocab_dic.get(word)] += 1
            
    return bag_vector

In [45]:
class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    def __init__(self, data_dirs, vocab):
        self.data_dirs=data_dirs
        self.vocab=vocab
        ###<your code>###

    def __len__(self):
        return len(self.data_dirs)
        ###<your code>###

    def __getitem__(self, idx):
        pair=self.data_dirs[idx]
        review=pair[0]
        review=load_review(review)
        review=generate_bow(review,self.vocab)
        
        
        return review,pair[1]
        ###<your code>###
        

In [46]:
# 建立客製化dataset
###<your code>###
custom_dst=dataset(review_pairs,vocab_dic)
custom_dst[10]

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]), 1)

In [47]:
# 建立dataloader
###<your code>###
custom_dataloader=DataLoader(custom_dst,batch_size=4,shuffle=True)
next(iter(custom_dataloader))

OSError: [WinError 6] 控制代碼無效。