In [1]:
# custom datset 만들기(easy.ver)

import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset1(Dataset):
    def __init__(self,t):
        self.t = t

    def __len__(self):
        return self.t

    def __getitem__(self,idx):
        return torch.LongTensor([idx])


if __name__ == '__main__':
    dataset = MyDataset1(t=5)
    print(len(dataset))
    it = iter(dataset)

    for i in range(10):
        print(i, next(it))


5
0 tensor([0])
1 tensor([1])
2 tensor([2])
3 tensor([3])
4 tensor([4])
5 tensor([5])
6 tensor([6])
7 tensor([7])
8 tensor([8])
9 tensor([9])


In [1]:
# custom datset 만들기2(easy.ver)

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# input_data = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
# input_data = np.array([[1,2],[3,4,5],[6,7]])
# input_data = np.arange(12).reshape(3,4)
input_data = np.random.randint(0, 250, (10,3,10,10))


class MyDataset1(Dataset):
    def __init__(self, data):
        self.data = data
        self.length = len(data)

    def __len__(self):
        return self.length

    def __getitem__(self, idx): # 한 개의 데이터에 대해서 적용할 
        x = np.array([idx])
        y = self.data[idx]
        return x, y


if __name__ == '__main__':
    dataset = MyDataset1(input_data)
    dataloader = DataLoader(dataset = dataset, batch_size=5, shuffle = True)
    for loop in range(2):
        print(loop)
        for batch in dataloader:
            data_index, output_data = batch
            print(f'index : {data_index.shape}')
            print(f'data : {output_data.shape}')
            # print('---')

0
index : torch.Size([5, 1])
data : torch.Size([5, 3, 10, 10])
index : torch.Size([5, 1])
data : torch.Size([5, 3, 10, 10])
1
index : torch.Size([5, 1])
data : torch.Size([5, 3, 10, 10])
index : torch.Size([5, 1])
data : torch.Size([5, 3, 10, 10])


### DataLoader의 batch_size와 Dataset의 index 사이의 관계
- batch_size만큼 dataset에서 데이터를 가져온다. batch_size만큼 임의의 인덱스를 뽑아주면 그 인덱스 번호에 맞는 데이터를 가져온다. shuffle을 True로 설정하면 데이터가 섞여서 가져오지만 False로 설정하면 데이터를 순서대로 가져온다.
    - ex)  
        data = [1,2,3,4,5,6,7,8,9,10]  
        batch_size = 3 -> 임의의 번호 3개(index)를 선정 => (0,1,4), (3,5,2), (9,7,8), (6)  
        data에서 첫번째로 (0,1,4)번에 해당하는 데이터 (1,2,5)을 가져옴  
        두번째로 (3,5,2)번에 해당하는 데이터 (4,6,3)을 가져옴  
        세번째로 (9,7,8)번에 해당하는 데이터 (10,8,9)을 가져옴  
        마지막으로 (6)번에 해당하는 데이터 (7)을 가져옴

- DataLoader가 Dataset의 len을 먼저 호출하여 전체 데이터 개수를 확인 -> 전체 데이터 개수를 확인후 그 안에서 인덱스 번호를 임의로 선정

- batch에 들어가는 데이터의 size는 동일해야 한다. 만약 동일하지 않은 경우는 어떻게 해야 할까??
    - 텍스트 데이터로 구현해보기

In [2]:
# custom datset 만들기2(easy.ver)

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# input_data = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
input_data = np.arange(12).reshape(3,4)


class MyDataset1(Dataset):
    def __init__(self, data):
        self.data = data
        self.length = len(data)

    def __len__(self):
        return self.length

    def __getitem__(self, idx): # 한 개의 데이터에 대해서 적용할 
        x = np.array([idx])
        y = self.data[idx]
        return x, y


if __name__ == '__main__':
    dataset = MyDataset1(input_data)
    dataloader = DataLoader(dataset = dataset, batch_size=5, shuffle = True)
    for loop in range(2):
        print(loop)
        for batch in dataloader:
            data_index, output_data = batch
            print(f'index : {data_index}')
            print(f'data : {output_data}')
            print('---')

0
index : tensor([[0],
        [2],
        [1]])
data : tensor([[ 0,  1,  2,  3],
        [ 8,  9, 10, 11],
        [ 4,  5,  6,  7]])
---
1
index : tensor([[1],
        [2],
        [0]])
data : tensor([[ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [ 0,  1,  2,  3]])
---


### 데이터의 size가 다른 경우 

In [79]:
sentences = [
            'I like apple',
            'I dont like apple',
            'My name is Tony Stark',
            'I love you',
            'I am hungry',
            'What is your name'
]

labels = [
       1,
       2,
       3,
       4,
       5,
       6
]



In [60]:
# 토큰화 방법1

tokenized = []

for sentence in sentences:
    tokenized.append(sentence.split())

print(tokenized)

[['I', 'like', 'apple'], ['I', 'dont', 'like', 'apple'], ['My', 'name', 'is', 'Tony', 'Stark'], ['I', 'love', 'you'], ['I', 'am', 'hungry'], ['What', 'is', 'your', 'name']]


In [9]:
# 빈도수별로 단어장 형성

from nltk import FreqDist

vocab = FreqDist(np.hstack(tokenized))
print(len(vocab))

9


In [61]:
# 단어장 형성2(numpy 활용)

token_word = set(np.hstack(tokenized))
token_word

{'I',
 'My',
 'Stark',
 'Tony',
 'What',
 'am',
 'apple',
 'dont',
 'hungry',
 'is',
 'like',
 'love',
 'name',
 'you',
 'your'}

In [62]:
# 단어별 인덱스 설정

word_to_index = {}

for index, word in enumerate(token_word):
    word_to_index[word] = index + 2
word_to_index['pad'] = 1
word_to_index['unk'] = 0

word_to_index


{'I': 13,
 'My': 4,
 'Stark': 5,
 'Tony': 3,
 'What': 8,
 'am': 10,
 'apple': 11,
 'dont': 12,
 'hungry': 9,
 'is': 14,
 'like': 2,
 'love': 6,
 'name': 16,
 'pad': 1,
 'unk': 0,
 'you': 7,
 'your': 15}

In [63]:
# 단어를 인덱스로 바꿔주기(인코딩)

encoded = []
for line in tokenized:
    temp = []
    for w in line:
        try:
            temp.append(word_to_index[w])
        except:
            temp.append(word_to_index['unk'])

    encoded.append(temp)

encoded

[[13, 2, 11],
 [13, 12, 2, 11],
 [4, 16, 14, 3, 5],
 [13, 6, 7],
 [13, 10, 9],
 [8, 14, 15, 16]]

In [31]:
# 최대 길이 확인

max_len = max(len(l) for l in encoded)
max_len

5

In [32]:
# 일반적인 padding. 가장 긴 문자열을 기준으로 나머지를 padding 값으로 채워준다.
# 단점 : 메모리의 낭비가 심하다.

for line in encoded:
    if len(line) < max_len:
        line += [word_to_index['pad']] * (max_len - len(line))

encoded

[[13, 2, 11, 1, 1],
 [13, 12, 2, 11, 1],
 [4, 16, 14, 3, 5],
 [13, 6, 7, 1, 1],
 [13, 10, 9, 1, 1],
 [8, 14, 15, 16, 1]]

In [72]:
# 데이터의 size가 다른 경우 batch size마다 가장 긴 문자열을 기준으로 padding을 적용

from torch.nn.utils.rnn import pad_sequence

class MyDataset2(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.labels = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        
        return [self.data[index], self.labels[index]]

In [76]:
class CustomCollate(object):
    def __init__(self):
        pass

    def __call__(self, batch_data):
        inputs = [torch.tensor(item[0]) for item in batch_data]
        labels = torch.tensor([item[1] for item in batch_data])

        inputs = pad_sequence(inputs, batch_first=True)

        return [inputs, labels]


In [84]:
dataset = MyDataset2(encoded, labels)
collate = CustomCollate()
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate)

for epochs in range(3):
    print('epochs :', epochs)
    print()
    for data, label in loader:
        print(data)
        print(label)
        print()

epochs : 0

tensor([[13,  2, 11],
        [13,  6,  7]])
tensor([1, 4])

tensor([[13, 12,  2, 11,  0],
        [ 4, 16, 14,  3,  5]])
tensor([2, 3])

tensor([[ 8, 14, 15, 16],
        [13, 10,  9,  0]])
tensor([6, 5])

epochs : 1

tensor([[13,  2, 11],
        [13,  6,  7]])
tensor([1, 4])

tensor([[ 4, 16, 14,  3,  5],
        [13, 12,  2, 11,  0]])
tensor([3, 2])

tensor([[13, 10,  9,  0],
        [ 8, 14, 15, 16]])
tensor([5, 6])

epochs : 2

tensor([[13, 12,  2, 11],
        [ 8, 14, 15, 16]])
tensor([2, 6])

tensor([[13,  2, 11],
        [13,  6,  7]])
tensor([1, 4])

tensor([[13, 10,  9,  0,  0],
        [ 4, 16, 14,  3,  5]])
tensor([5, 3])



### 의문?
- 커스텀한 collate_fn을 사용하면 __getitem__은 따로 작성하지 않아도 작동하는 건가?
    - __getitem__을 빼고 만들었는데도 동작하네..?

### size가 다른 데이터의 경우 dataloader를 이용해 불러오기(실습)

In [25]:
# IMDB 데이터 가져오기

import urllib.request
import pandas as pd

urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x7fb65400d1d0>)

In [39]:
# 데이터를 데이터프레임으로 만들기
df = pd.read_csv('IMDb_Reviews.csv')
df

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0
...,...,...
49995,the people who came up with this are SICK AND ...,0
49996,"The script is so so laughable... this in turn,...",0
49997,"""So there's this bride, you see, and she gets ...",0
49998,Your mind will not be satisfied by this nobud...,0


In [6]:
# 전체 데이터의 개수
len(df)

50000

In [40]:
# 샘플용 데이터로 100개만 사용
sample = df[:100]
sample

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0
...,...,...
95,I can honestly tell you that this movie is the...,1
96,I first came across this film when I read a bo...,1
97,I don't think I will include any spoilers but ...,1
98,"a real hoot, unintentionally. sidney portier's...",0


In [41]:
# sample 전처리. 불필요한 문자 제거

sample['review'] = sample['review'].str.replace('[^a-zA-Z0-9 ]', '')
sample.head()

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,Believe it or not this was at one time the wor...,0
2,After some internet surfing I found the Homefr...,0
3,One of the most unheralded great works of anim...,1
4,It was the Sixties and anyone with long hair a...,0


In [None]:
# 토큰화

tokenized = []
for sentence in sample['review']:
    tokenized.append(sentence.split())

tokenized

In [11]:
from nltk import FreqDist


voca = FreqDist(np.hstack(tokenized))
len(voca)

5483

In [None]:
# 단어장 만들기

voca = list(set(np.hstack(tokenized)))
print(voca)
print(len(voca))

In [None]:
# 단어마다 인덱스 정해주기

word_to_index = {v : idx+2 for idx, v in enumerate(voca)}
word_to_index['unk'] = 1
word_to_index['pad'] = 0

word_to_index

In [None]:
# 인코더 데이터 만들기

encoded = []

for line in tokenized:
    temp = []
    for w in line:
        try:
            temp.append(word_to_index[w])
        except:
            temp.append(word_to_index['unk'])
    encoded.append(temp)

encoded

In [51]:
max_len = max(len(line) for line in encoded)
min_len = min(len(line) for line in encoded)
print(max_len, min_len)

864 28


In [16]:
# 일반적인 padding 방법. 가장 큰 문자열을 기준으로 padding 적용

# for line in encoded:
#     if len(line) < max_len:
#         line += [word_to_index['pad']] * (max_len - len(line))

In [17]:
# max_len = max(len(line) for line in encoded)
# min_len = min(len(line) for line in encoded)
# print(max_len, min_len)

864 864


In [56]:
# dataset 만들기

# class MyDataset3(Dataset):

#     def __init__(self, data):
#         self.data = data

#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, index):
        

def custom_collate2(data):
    input_line = [torch.tensor(line) for line in data]
    input_pad = pad_sequence(input_line, batch_first=True)
    return input_pad

if __name__ == '__main__':
    # dataset = encoded
    dataloader = DataLoader(encoded, 5, shuffle = True, collate_fn=custom_collate2)
    
    for epoch in range(5):
        print(f'epochs : {epoch + 1}')
        for batch in dataloader:
            print(batch.shape)
    

epochs : 1
torch.Size([5, 196])
torch.Size([5, 724])
torch.Size([5, 760])
torch.Size([5, 317])
torch.Size([5, 427])
torch.Size([5, 352])
torch.Size([5, 649])
torch.Size([5, 719])
torch.Size([5, 523])
torch.Size([5, 288])
torch.Size([5, 360])
torch.Size([5, 810])
torch.Size([5, 603])
torch.Size([5, 545])
torch.Size([5, 227])
torch.Size([5, 206])
torch.Size([5, 370])
torch.Size([5, 391])
torch.Size([5, 235])
torch.Size([5, 864])
epochs : 2
torch.Size([5, 719])
torch.Size([5, 204])
torch.Size([5, 810])
torch.Size([5, 317])
torch.Size([5, 496])
torch.Size([5, 420])
torch.Size([5, 760])
torch.Size([5, 291])
torch.Size([5, 590])
torch.Size([5, 370])
torch.Size([5, 308])
torch.Size([5, 864])
torch.Size([5, 352])
torch.Size([5, 603])
torch.Size([5, 523])
torch.Size([5, 182])
torch.Size([5, 545])
torch.Size([5, 724])
torch.Size([5, 427])
torch.Size([5, 317])
epochs : 3
torch.Size([5, 427])
torch.Size([5, 864])
torch.Size([5, 192])
torch.Size([5, 225])
torch.Size([5, 760])
torch.Size([5, 291])
t

In [26]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate(data):
    inputs = [torch.tensor(line) for line in data]
    inputs = pad_sequence(inputs, batch_first=True)

    return inputs

if __name__ == '__main__':
    dataloader = DataLoader(encoded, 5, shuffle = True, collate_fn=custom_collate)
    
    for epoch in range(5):
        print(f'epochs : {epoch + 1}')
        for batch in dataloader:
            print(batch.shape)



epochs : 1
torch.Size([5, 427])
torch.Size([5, 724])
torch.Size([5, 545])
torch.Size([5, 350])
torch.Size([5, 523])
torch.Size([5, 590])
torch.Size([5, 288])
torch.Size([5, 203])
torch.Size([5, 370])
torch.Size([5, 239])
torch.Size([5, 496])
torch.Size([5, 760])
torch.Size([5, 810])
torch.Size([5, 603])
torch.Size([5, 864])
torch.Size([5, 420])
torch.Size([5, 360])
torch.Size([5, 719])
torch.Size([5, 196])
torch.Size([5, 649])
epochs : 2
torch.Size([5, 288])
torch.Size([5, 724])
torch.Size([5, 810])
torch.Size([5, 370])
torch.Size([5, 263])
torch.Size([5, 523])
torch.Size([5, 719])
torch.Size([5, 317])
torch.Size([5, 864])
torch.Size([5, 150])
torch.Size([5, 427])
torch.Size([5, 545])
torch.Size([5, 220])
torch.Size([5, 359])
torch.Size([5, 420])
torch.Size([5, 590])
torch.Size([5, 603])
torch.Size([5, 308])
torch.Size([5, 649])
torch.Size([5, 203])
epochs : 3
torch.Size([5, 350])
torch.Size([5, 810])
torch.Size([5, 206])
torch.Size([5, 590])
torch.Size([5, 864])
torch.Size([5, 391])
t