## RNN

In [1]:
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
rnn = nn.RNN(10,20,2)# (input_size, hidden_size, num_layers)
input =torch.randn(5,3,10) # 5 blood tests # (seq_len,batch_size,input_size) 
h0 = torch.randn(2,3,20) #(num_layers*num_directions, batch,hidden_size)

output,hn = rnn(input,h0) 

In [4]:
output.shape # (seq_len,batch,num_directions *hidden_size)

torch.Size([5, 3, 20])

In [5]:
hn.shape # (num_layers*num_directions, batch, hidden_size)

torch.Size([2, 3, 20])

## Embeddings

In [6]:
embedding = nn.Embedding(10,3) #num_embeddings, embedding_dim
# size of dictionary of embeddings , size of the each embedding vecotr

# a batch of 2 samples of 4 indices each
input = torch.LongTensor([[1,2,4,5],[4,4,1,9]])
embedding(input)

tensor([[[-0.1134,  0.3053,  1.6914],
         [-1.1581, -0.5896,  1.1041],
         [-0.8190, -0.5953,  0.5450],
         [ 1.1254,  0.6481, -0.5454]],

        [[-0.8190, -0.5953,  0.5450],
         [-0.8190, -0.5953,  0.5450],
         [-0.1134,  0.3053,  1.6914],
         [-0.0785, -0.3750,  1.2156]]], grad_fn=<EmbeddingBackward0>)

# Text Classification


In [2]:
import torch
#!pip install --upgrade torchtext

from torchtext.datasets import AG_NEWS

import random
SEED =5
random.seed(SEED)
torch.manual_seed(SEED)

train_data,test_data = AG_NEWS()

In [3]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

train_iter = AG_NEWS(split='train')

x,y = next(iter(train_iter))
print(x,y)
print("")
print(tokenizer(y))


3 Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']


In [7]:
from collections import Counter

train_iter = AG_NEWS(split='train')
counter =Counter()
MAX_LEN =0
labels=[]

for (label,line) in train_iter:
    tokens = tokenizer(line)
    counter.update(tokens) #dictionary로 word 세서 반환해주네 
    MAX_LEN =max(MAX_LEN, len(tokens))
    labels.append(label)
    
num_class = len(set(labels))


In [48]:
counter

Counter({'wall': 1395,
         'st': 1409,
         '.': 225971,
         'bears': 399,
         'claw': 17,
         'back': 4123,
         'into': 6637,
         'the': 203843,
         'black': 761,
         '(': 41106,
         'reuters': 19310,
         ')': 40787,
         '-': 39206,
         'short-sellers': 2,
         ',': 165685,
         'street': 1581,
         "'": 32235,
         's': 61724,
         'dwindling\\band': 1,
         'of': 97909,
         'ultra-cynics': 2,
         'are': 9723,
         'seeing': 135,
         'green': 828,
         'again': 1758,
         'carlyle': 15,
         'looks': 600,
         'toward': 758,
         'commercial': 490,
         'aerospace': 124,
         'private': 696,
         'investment': 809,
         'firm': 1776,
         'group': 4676,
         '\\which': 5,
         'has': 18945,
         'a': 110153,
         'reputation': 117,
         'for': 50186,
         'making': 1114,
         'well-timed': 2,
         'and': 688

In [55]:
from torchtext.vocab import Vocab

In [8]:
from torchtext.vocab import vocab
vocabs = vocab(counter, min_freq=10,specials = ["<pad>","<unk>"],)

PAD =vocabs["<pad>"]


TypeError: __init__() got an unexpected keyword argument 'min_freq'

In [50]:
import torchtext
torchtext.__version__

'0.13.1'

In [57]:
vocabs = vocab(counter, min_freq=10,specials = ["<pad>","<unk>"])
vocabs.load_state_dict(

In [54]:
vocabs["and"]

38

In [13]:

print(len(vocabs))

20645


In [14]:
type(vocabs)

torchtext.vocab.vocab.Vocab

In [15]:
[vocabs[token] for token in ['here', 'is', 'an', 'example','<pad>']]

[797, 163, 81, 2186, 0]

In [16]:
text_pipeline = lambda x:[vocabs[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x)-1

In [19]:
print(text_pipeline('here is an example'))


[797, 163, 81, 2186]


In [20]:
#데이터 배치와 반복자 생성하기

from torch.utils.data import DataLoader 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

<torchtext._torchtext.Vocab at 0x2a4f280e8f0>

In [21]:
def collate_batch(batch):
    label_list, text_list =[],[]
    max_len=0
    for (_label,_text) in batch:
        label_list.append(label_pipeline(_label))
        token_ids = text_pipeline(_text)
        text_list.append(token_ids)
        max_len = max(max_len,len(token_ids))
    
    text_list = [tokens +[PAD] * (max_len - len(tokens)) for tokens in text_list]
    text_list = torch.tensor(text_list, dtype =torch.int64)
    label_list = torch.tensor(label_list,dtype = torch.int64)

    return label_list.to(device), text_list.to(device)

In [58]:
#Define the model

from torchtext.vocab import vectors
from torch import nn

class TextClassification(nn.Module):
    def __init__(self, vocab,num_class, embed_dim,hidden_dim,num_rnn_layers=1):
        super(TextClassification,self).__init__()
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors)
        #self.embedding = nn.Embedding(len(vocab),MAX_LEN)
        
        self.rnn =nn.RNN(embed_dim,hidden_dim,num_layers = num_rnn_layers,batch_first =True)
        self.fc = nn.Linear(hidden_dim,num_class)
        
    def forward(self, input_text):
        embedded =self.embedding(input_text)
        output, _ =self.rnn(embedded)
        h_n = output[:,-1,:]
        pred = self.fc(h_n)
        return pred
        
        

In [None]:
nn.Embedding.from_pretrained()

In [64]:
vocabs.load_vectors("glove.6B.100d")

AttributeError: 'Vocab' object has no attribute 'load_vectors'

In [59]:

model = TextClassification(vocabs,num_class,100,128,num_rnn_layers=1)

AttributeError: 'Vocab' object has no attribute 'vectors'

In [21]:
import time

def train(dataloader, model,criterion,optimizer):
    
    model.train()
    total_acc, total_count = 0,0
    log_interval =500
    strat_time = time.time()
    
    loss_cum = 0.0
    
    for idx, (label,text) in enumerate(dataloader):
        
        optimizer.zero_grad()

        predicted_label = model(text)
        
        loss = criterion(predicted_label,label)
        loss.backward()
        loss_cum += loss.item()
        optimizer.step()
        
        total_acc +=(predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| loss{:8.3f} | accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              loss_cum/total_count, total_acc/total_count))
            total_acc, total_count,loss_cum = 0, 0, 0.0
            start_time = time.time()

def evaluate(dataloader,model):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count
        
        

In [22]:
LR = 1e-4
EPOCHS =5
BATCH_SIZE =8
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=LR)

In [23]:
from torch.utils.data.dataset import random_split

train_iter, test_iter = AG_NEWS()
train_dataset = list(train_iter)
test_dataset =  list(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

'''
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader,model,criterion,optimizer)
    accu_val = evaluate(valid_dataloader)
    
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)
'''    

"\nfor epoch in range(1, EPOCHS + 1):\n    epoch_start_time = time.time()\n    train(train_dataloader,model,criterion,optimizer)\n    accu_val = evaluate(valid_dataloader)\n    \n    print('-' * 59)\n    print('| end of epoch {:3d} | time: {:5.2f}s | '\n          'valid accuracy {:8.3f} '.format(epoch,\n                                           time.time() - epoch_start_time,\n                                           accu_val))\n    print('-' * 59)\n"

In [None]:
for idx, (label,text) in enumerate(train_dataloader):
    print(text)
    break

In [1]:
train_iter, test_iter = AG_NEWS()
test_dataset =  list(test_iter)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)
accu_val = evaluate(test_dataloader,model)
print(f"Test accuracy{accu_val:8.3f}")

NameError: name 'AG_NEWS' is not defined