In [2]:
import random
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.vocab import vocab
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


multi30k = load_dataset("bentrevett/multi30k")
train, test = multi30k['train'], multi30k['test']

In [3]:
train[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [4]:
ger_counter = Counter()
eng_counter = Counter()
for data in tqdm(train):
    ger_counter.update(tokenizer_ger(data['de'].lower()))
    eng_counter.update(tokenizer_eng(data['en'].lower()))   

100%|███████████████████████████████████| 29000/29000 [00:22<00:00, 1269.94it/s]


In [5]:
ger_vocab = vocab(ger_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))
eng_vocab = vocab(eng_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))
ger_vocab.set_default_index(ger_vocab["<unk>"])
eng_vocab.set_default_index(eng_vocab["<unk>"])
print(f"Size of German Vocab : {len(ger_vocab)}\n Size of English Vocab : {len(eng_vocab)}")

Size of German Vocab : 7853
 Size of English Vocab : 5893


In [6]:
text_transform_eng = lambda x: [eng_vocab['<sos>']] + [eng_vocab[token.lower()] for token in tokenizer_eng(x)] + [eng_vocab['<eos>']]
text_transform_ger = lambda x: [ger_vocab['<sos>']] + [ger_vocab[token.lower()] for token in tokenizer_ger(x)] + [ger_vocab['<eos>']]

In [7]:
import torch
def collate_batch(batch):
    src_list, tgt_list = [], []
    for data in batch:
        src_list.append(torch.tensor(text_transform_ger(data['de'])))
        tgt_list.append(torch.tensor(text_transform_eng(data['en'])))

    src_list = pad_sequence(src_list, padding_value=eng_vocab['<pad>']).T
    tgt_list = pad_sequence(tgt_list, padding_value=ger_vocab['<pad>']).T
    
    inp = {
        "src": src_list,
        "tgt": tgt_list
    }

    return inp

In [8]:
from transformer_from_scratch_v1 import Transformer
import torch
import math
import torch.nn as nn

num_epochs = 30
batch_size = 16
learning_rate = 1e-3
weight_decay = 0.001
writer = SummaryWriter(f"runs/loss")

train_dataloader = DataLoader(
    train,
    collate_fn = collate_batch,
    shuffle = True,
    batch_size = batch_size,
    pin_memory = True
)

test_dataloader = DataLoader(
    test,
    collate_fn = collate_batch,
    shuffle = False,
    batch_size = batch_size,
    pin_memory = True
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer_model = Transformer(
    src_mask_ind = ger_vocab['<pad>'],
    trg_mask_ind = eng_vocab['<pad>'],
    src_vocab_size = len(eng_vocab),
    trg_vocab_size = len(ger_vocab),
    device = device
).to(device)

In [9]:
total_steps = num_epochs*math.ceil(len(train)/batch_size)
optimizer = torch.optim.Adam(transformer_model.parameters(), lr = learning_rate)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr = learning_rate,
    total_steps = total_steps,
    pct_start = 0.33,
    div_factor = 1e3,
    final_div_factor = 1e2
)
criterion = nn.CrossEntropyLoss(ignore_index = eng_vocab['<pad>'])

In [10]:
ex = collate_batch(train)

In [11]:
ex['src'].shape

torch.Size([29000, 42])

In [24]:
def translate_seq(model, src, device,max_len = 50):
    model.eval()
    trg_indexes = [ger_vocab["<sos>"]]
    for i in range(max_len):
        trg = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        with torch.no_grad():
            output = transformer_model(src,trg)
        output[:,:,:2] = float("-1e20")
        output = output[:,-1,:]
        output = torch.softmax(output, dim =-1)
        pred_token = output.argmax(-1).item()
        trg_indexes.append(pred_token)
        if pred_token == eng_vocab["<eos>"]:
            break
    return trg_indexes
        

In [25]:
ex = test[random.randint(0, len(test))]
sentence = ex['de']
src_indexes = torch.tensor(text_transform_eng(sentence)).unsqueeze(0).to(device)
translated_sentence_idx = translate_seq(transformer_model, src_indexes, device=device, max_len=30)
translated_sentence = [eng_vocab.get_itos()[i] for i in translated_sentence_idx]
print(f"Generated Translation : \n {' '.join(translated_sentence[1:-1])}\n")
print(f"Original Translation : \n{ex['en']}\n")


IndexError: list index out of range

In [17]:
def transform_temperature_data(temperature_data):
    result = {}
    
    for date, temperatures in temperature_data.items():
        max_temp = max(temperatures)
        min_temp = min(temperatures)
        average_temp = sum(temperatures) / len(temperatures)
        
        result[date] = {
            'max_temp': max_temp,
            'min_temp': min_temp,
            'average_temp': round(average_temp, 2)
        }
    
    return result

# Sample input for testing
temperature_data = {
    '2023-01-01': [25, 28, 20, 18, 30, 22],
    '2023-01-02': [22, 24, 26, 20, 28, 30]
}

# Call the function with the sample input
result = transform_temperature_data(temperature_data)
print(result)  # This will print the transformed temperature data

{'2023-01-01': {'max_temp': 30, 'min_temp': 18, 'average_temp': 23.83}, '2023-01-02': {'max_temp': 30, 'min_temp': 20, 'average_temp': 25.0}}


In [19]:
ger_vocab["<unk>"]

0