In [2]:
# Based on this paper - https://arxiv.org/pdf/1706.03762.pdf
# Might want to move layer norm inside the residual block - https://arxiv.org/pdf/2002.04745.pdf
# Layer normalization - https://arxiv.org/pdf/1607.06450.pdf
# TODO: Investigate learning rate warmup - https://arxiv.org/abs/2002.04745
#!pip install torch torchtext sentencepiece datasets wandb

In [3]:
import numpy as np
import torch
from torch import nn
import sys
import os
import math
import einops
import torch.nn.functional as F

sys.path.append(os.path.abspath("../../data"))
sys.path.append(os.path.abspath("../../nnets"))
from net_utils import get_module_list

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
SP_VOCAB_SIZE = 5000
TRAIN_SIZE = 5000

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from text_data import CNNDatasetDecoderOnly

class Wrapper(CNNDatasetDecoderOnly):
    split_lengths = [TRAIN_SIZE, math.floor(TRAIN_SIZE * .1), 100]
    x_length = 15
    target_length = 15

wrapper = Wrapper(SP_VOCAB_SIZE)
datasets = wrapper.generate_datasets(BATCH_SIZE)
train = datasets["train"]
valid = datasets["validation"]

Found cached dataset cnn_dailymail (/Users/vik/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:00<00:00, 50.36it/s]
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=tokens.txt --model_prefix=cnn_dailymail --vocab_size=5000 --model_type=unigram
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: tokens.txt
  input_format: 
  model_prefix: cnn_dailymail
  model_type: UNIGRAM
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piec

In [5]:
# Add in ROPE embedding
class ROPE(nn.Module):
    def __init__(self, embedding_dim, seq_len):
        super(ROPE, self).__init__()
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len

        self.cos_embeds = torch.zeros(seq_len, embedding_dim, device=DEVICE)
        self.sin_embeds = torch.zeros(seq_len, embedding_dim, device=DEVICE)

        embed_pos = 10000 ** (-2 * torch.ceil((torch.arange(0, embedding_dim) + 1) / 2) / embedding_dim)
        for i in range(0, seq_len):
            self.cos_embeds[i] = torch.cos((i + 1) * embed_pos)
            self.sin_embeds[i] = torch.sin((i + 1) * embed_pos)

        self.indices = torch.zeros(self.embedding_dim, device=DEVICE, dtype=torch.long)
        self.mask = torch.zeros(self.embedding_dim, device=DEVICE, dtype=torch.int)
        for i in range(0, embedding_dim, 2):
            self.indices[i] = i + 1
            self.indices[i+1] = i

            self.mask[i] = -1
            self.mask[i+1] = 1


    def rotate(self, x):
        return x[...,self.indices] * self.mask

    def forward(self, x):
        current_val = x * self.cos_embeds[:x.shape[-2],:]
        next_val = self.rotate(x) * self.sin_embeds[:x.shape[-2],:]
        return current_val + next_val


class MultiHeadAttention(nn.Module):
    def __init__(self, input_units, attention_heads, mask=False):
        super(MultiHeadAttention, self).__init__()
        self.input_units = input_units
        self.attention_heads = attention_heads
        self.head_units = int(input_units/attention_heads)
        self.mask = mask

        k = math.sqrt(1/self.input_units)
        # Drop bias
        # Single kv head
        self.kv_proj_weight = nn.Parameter(torch.rand(2, input_units, self.head_units) * 2 * k - k)
        self.q_proj_weight = nn.Parameter(torch.rand(input_units, self.attention_heads * self.head_units) * 2 * k - k)
        self.out_proj_weight = nn.Parameter(torch.rand(self.attention_heads * self.head_units, input_units) * 2 * k - k)

        # 1024 is max sequence length
        self.rope = ROPE(self.head_units, 1024)

    def forward(self, queries, keys, values):
        # convert to 4d tensor with batch_size, attn_heads, seq_len, embedding_dim
        proj_queries = torch.einsum("...se, eo->...so", queries, self.q_proj_weight)
        proj_queries = proj_queries.view(queries.shape[0], queries.shape[1], self.attention_heads, self.head_units).swapaxes(1,2)
        proj_queries = self.rope(proj_queries)

        proj_keys = torch.einsum("...se, eo->...so", keys, self.kv_proj_weight[0])
        proj_keys = proj_keys.view(keys.shape[0], keys.shape[1], self.head_units)
        proj_keys = self.rope(proj_keys)

        proj_values = torch.einsum("...se, eo->...so", values, self.kv_proj_weight[0])
        proj_values = proj_values.view(values.shape[0], values.shape[1], self.head_units)

        attention = torch.einsum("baqh, bhk->baqk", proj_queries, torch.transpose(proj_keys, -1, -2)) / np.sqrt(proj_keys.shape[-1])
        if self.mask:
            # Prevent decoder queries from looking at tokens that come after
            # Do this by setting attention to negative infinity, so it is softmaxed to zero in the next step
            mask = torch.full((attention.shape[-2], attention.shape[-1]), -torch.inf, device=DEVICE)
            attention += torch.triu(mask, diagonal=1)

        # Softmax on last dimension
        # Sequence-wise softmax, so attention between one sequence and other sequences sums to 1
        attention = torch.softmax(attention, dim=-1)
        weighted_values = torch.einsum("baqk, bke->baqe", attention, proj_values)

        # Swap attention head and sequence axis, then reshape to batch, seq, embedding
        weighted_values = weighted_values.swapaxes(1,2).reshape(queries.shape[0], queries.shape[1], -1)
        weighted_values = torch.einsum("...se, eo->...so", weighted_values, self.out_proj_weight)
        return weighted_values

In [6]:
class SwiGLU(nn.Module):
    def __init__(self, input_units, hidden_units):
        super(SwiGLU, self).__init__()
        self.linear1 = nn.Linear(input_units, hidden_units, bias=False)
        self.linear2 = nn.Linear(input_units, hidden_units, bias=False)
        self.linear3 = nn.Linear(hidden_units, input_units, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x1 = self.linear1(x)
        swish = self.sigmoid(x1) * x1
        x2 = self.linear2(x)
        swiglu = self.linear3(swish * x2)
        return swiglu

class DecoderBlock(nn.Module):
    def __init__(self, input_units, attention_heads, hidden_units=2048, dropout_p=.1):
        super(DecoderBlock, self).__init__()
        self.in_attn = MultiHeadAttention(input_units, attention_heads, mask=True)
        self.dropouts = get_module_list(2, nn.Dropout, dropout_p)
        # Drop bias
        self.lns = get_module_list(2, nn.LayerNorm, input_units)
        # Switch to swiglu from two linear layers
        self.swiglu = SwiGLU(input_units, hidden_units)

    def forward(self, x):
        weighted_values = self.dropouts[0](self.in_attn(x, x, x))
        # Pre normalization
        x = x + self.lns[0](weighted_values)

        reprojected = self.dropouts[1](self.swiglu(x))
        # Pre normalization
        x = x + self.lns[1](reprojected)
        return x

In [7]:
class Transformer(nn.Module):
    def __init__(self, input_units, hidden_units, attention_heads, max_len=256, blocks=1):
        super(Transformer, self).__init__()
        self.blocks = blocks
        self.dropouts = get_module_list(2, nn.Dropout, .1)
        self.decoders = get_module_list(blocks, DecoderBlock, hidden_units, attention_heads)

        self.embedding = nn.Parameter(torch.empty(input_units, hidden_units))
        nn.init.xavier_uniform_(self.embedding)
        self.input_units = input_units

    # Tie input output weights
    def embed(self, x, reverse=False):
        if reverse:
            return x @ self.embedding.T
        else:
            embedded = self.embedding[x.to(torch.long).view(-1)]
            return embedded.view(x.shape[0], x.shape[1], -1)


    def forward(self, x):
        dec_outputs = self.dropouts[1](self.embed(x))
        for i in range(self.blocks):
            dec_outputs = self.decoders[i](dec_outputs)

        token_vectors = self.embed(dec_outputs, reverse=True)
        return token_vectors

In [8]:
def generate(sequence, pred, target, wrapper):
    prompts = wrapper.decode_batch(sequence[:,:wrapper.x_length].cpu())
    texts = wrapper.decode_batch(torch.argmax(pred[:,wrapper.x_length:], dim=2).cpu())
    correct_texts = wrapper.decode_batch(target[:,wrapper.x_length:].cpu())

    displays = []
    for p, t, ct in zip(prompts, texts, correct_texts):
        displays.append(f"{p} | {ct} | {t}")
    return displays

In [9]:
from tqdm.auto import tqdm
import wandb
import time

wandb.init(project="decoder-only", notes="Split embedding weights", name="split-embed")

# TODO: Profile and improve perf - https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html
model = Transformer(wrapper.vocab_size, 512, 8, blocks=6).to(DEVICE)
loss_fn = nn.CrossEntropyLoss(ignore_index=wrapper.pad_token)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
wandb.watch(model, log_freq=100)

[34m[1mwandb[0m: Currently logged in as: [33mvikp[0m. Use [1m`wandb login --relogin`[0m to force relogin


[]

In [10]:
EPOCHS = 100
DISPLAY_BATCHES = 2
OUT_SEQUENCE_LEN = wrapper.y_length
PRINT_VALID = True
ACCUMULATE_STEPS = 1

for epoch in range(EPOCHS):
    # Run over the training examples
    train_loss = 0
    match_pct = 0
    optimizer.zero_grad(set_to_none=True)
    start = time.time()
    for batch, (sequence, target, prev_target) in tqdm(enumerate(train)):
        pred = model(sequence.to(DEVICE))

        # If you use a batch, need to reshape pred to be batch * sequence, embedding_len to be compatible
        # Similar reshape with target to be batch * sequence vector of class indices
        cpred = pred[:, wrapper.x_length:]
        ctarget = target[:, wrapper.x_length:]
        loss = loss_fn(cpred.reshape(-1, cpred.shape[-1]), ctarget.reshape(-1).to(DEVICE))
        loss.backward()
        train_loss += loss.item()

        # Accumulate gradients
        # This seems to perform worse than no accumulation over a
        # small data set.  Test with larger set.
        if batch % ACCUMULATE_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
    end = time.time()

    with torch.no_grad():
        mean_loss = train_loss / len(train) / BATCH_SIZE
        wandb.log({"loss": mean_loss, "epoch_time": end - start})
        print(f"Epoch {epoch} train loss: {mean_loss}")
        sents = generate(sequence, pred, target, wrapper)
        for sent in sents[:DISPLAY_BATCHES]:
            print(sent)

        if PRINT_VALID and epoch % 10 ==0:
            # Compute validation loss.  Unless you have a lot of training data, the validation loss won't decrease.
            valid_loss = 0
            # Deactivate dropout layers
            model.eval()
            for batch, (sequence, target, prev_target) in tqdm(enumerate(valid)):
                # Inference token by tokens
                outputs = sequence[:,:(wrapper.x_length + 1)].to(DEVICE)
                # TODO: Investigate memory leak with valid generation
                for i in range(OUT_SEQUENCE_LEN):
                    pred = model(outputs)
                    last_output = torch.argmax(pred, dim=2)
                    outputs = torch.cat((outputs, last_output[:,-1:]), dim=1)

                cpred = pred[:, wrapper.x_length:]
                ctarget = target[:, wrapper.x_length:]
                loss = loss_fn(cpred.reshape(-1, cpred.shape[-1]), ctarget.reshape(-1).to(DEVICE))
                valid_loss += loss.item()
            mean_loss = valid_loss / len(valid) / BATCH_SIZE
            wandb.log({"valid_loss": mean_loss})
            print(f"Valid loss: {mean_loss}")
            sents = generate(sequence, pred, target, wrapper)
            for sent in sents[:DISPLAY_BATCHES]:
                print(sent)
            # Reactivate dropout
            model.train()

145it [00:45,  3.20it/s]


Epoch 0 train loss: 0.22401335331900366
Javon Thompson is believed to have died in December 2006 | . One Mind Ministries members prayed for his re | .ssedssssss tos . .
Mom discovers young boy's guinea pig has died su | ddenly . She decides to keep it a secret because he | . . . .,sedsssssss to


2it [00:02,  1.25s/it]


Valid loss: 0.20363730937242508
Smartwatches have hidden, darker side the companies selling | them are unlikely to talk about, says Andre Spicer . | .sssssssssssss
Bafetimbi Gomis collapses within 10 minutes of | kickoff at Tottenham . But he reportedly left the  | sss .sssssssssss


145it [00:43,  3.30it/s]


Epoch 1 train loss: 0.1914054452345289
Residents in Huntington, Virginia, voluntari | ly evacuating because of flooding . FEMA says no states | . .,ssssss The " "sss
96 Liverpool fans died as a result of the Hillsborough disaster on | April 15, 1989 . Fans were crushed against sta | . . . . .s Ass as a a a a


145it [00:45,  3.21it/s]


Epoch 2 train loss: 0.16946742514084126
Woman travels from Kenya to India for a tricky third kidn | ey transplant . Lorna Irungu suffers from l | ss C::ssalesingmal
British graphic artist's identity remains a mystery desp | ite huge popularity . Feted by the art world and Hollywood | essss . . The.s the the s the 


145it [00:45,  3.19it/s]


Epoch 3 train loss: 0.14704467232885032
Seminole Tribe owns Hard Rock properties, look | s to help unify Native Americans . The Native American Group | s of Mo coach coach coach mu . . . The Thes  are
Site provided names, IP and e-mail addresses | of offenders it found, AG says . Similar inform | ing youthing . . . . . says . Twoinginging Ma


145it [00:45,  3.17it/s]


Epoch 4 train loss: 0.12437003239475447
NEW: Friends, relatives of three missing men to organize private search | . Coast Guard suspends search off Florida coast at sundown Tuesday . | . NEW Guard rescueds rescued, rescuedton search Floridas Florida . .
The Beatles' remastered catalogue will be released in September . | Release includes all 12 Beatles albums and two later works . Beatles | rotess,,sss, Beatles,,,, Beatles


145it [00:44,  3.26it/s]


Epoch 5 train loss: 0.10707830098168604
Boy, 12, found .25-caliber gun in box in | closet . Police have not charged anyone in relation to incident . | closet . closet charge charge charged toone relat Nataed to incident .
Britain is ready to increase troop numbers in Afghanistan, arm | y chief says . Richard Dannatt told the Times up | y chiefy . Richard Richardish Dan Danish a a at a


145it [00:44,  3.25it/s]


Epoch 6 train loss: 0.09431973294965153
Obama sends nuclear agreement with the United Arab Emirates to the | Senate . Recently released video shows member of UAE royal | lyelylylylylylys tos U.EE
Russian gas giant Gazprom says it will cut gas supply | 10 a.m. Thursday  | 10 10 10....m...m...


145it [00:44,  3.29it/s]


Epoch 7 train loss: 0.0822273473801284
Congressman from Rhode Island seeks help against substance abuse . | Rep. Patrick Kennedy, 41, acknowledges long struggle with addiction | Rep: acknowledgek acknowledge Kennedy acknowledge, acknowledges need,d addiction
Kellogg's donated two tons of cer | eal to the San Francisco Food Bank . Company dumped the c | alalal San San Sanco Food Bank . Company dumped the c


145it [00:43,  3.35it/s]


Epoch 8 train loss: 0.07359302128697263
Paul McCartney, Ringo Starr say their music sound | s better . The former Beatles talk to CNN's Larry | story better . The 70former Photo talk of be's Larry
Russian gas giant Gazprom says it will cut gas supply | 10 a.m. Thursday  | a a a a. a am ammm.mm


145it [00:42,  3.39it/s]


Epoch 9 train loss: 0.06514370634124197
H.W. Brands: Roosevelt-Washington | White House dinner was controversial . He says the the opposition was about | dinner dinner dinner . dinner . controvers He says controvers controvers controverspposition was seek
Mark Penn is CEO of public relations giant Burson-Marsteller | . Penn met with Colombia ambassador over trade pact . Colombia fire | . Colombia met a Colombia ofmbassador a its Colombiaador of Colombian


145it [00:42,  3.38it/s]


Epoch 10 train loss: 0.060922711291189854
Barcelona are ready to let Brazilian international Ronaldinho leave the club | . Club president Joan Laporta says that Ronaldinho needs | . Hevent 1 saysa saystoers says that Ronalda thats
Essence magazine interviews senator, wife and children in Chicago | home . Editor says candidate, Michelle Obama try to keep daughter | follow . Cofftor says Michel, Michel,le should to keeple


2it [00:02,  1.23s/it]


Valid loss: 0.3429071456193924
Jina Krause-Vilmar: On International Women' | s Day, focus on helping refugee women adapt to new | day . North economy, 27, CESis, Council and
Parents of teens should have a criminal defense lawyer in mind  | -- just in case, says author . Author Lisa Green calls this the | . Pilots, 90ron, but was the the airc,


145it [00:43,  3.36it/s]


Epoch 11 train loss: 0.050278863978796995
At least 98 dead, many more missing, from flash | floods in Jakarta . Flood began after heavy rain | ining in 11karta . Froundus imping after rain rain
Pakistani President insists his state's nuclear arsenal | is safe, despite Taliban gains . Recent gains by the Taliban | deal safe deal CNNite desp .s . Recent dealed whe the U


145it [00:43,  3.35it/s]


Epoch 12 train loss: 0.04485224521365659
NEW: Sen. Obama says any recover plan must help workers and home | owners . McCain repeats misleading charge against Obama on taxes | owners . McCain repeats mist iting mordst it on taxes
International Criminal Court: Sudan "taken no steps" to | arrest Darfur war criminals . One suspect now in charge of humanitarian | charge shooting suspect .s . One suspect now charge charge of chargearian


145it [00:42,  3.38it/s]


Epoch 13 train loss: 0.0406720953640239
Troops to stay until 2011, with the stipulation that NATO | contribute more forces . Most of Canada's 2,500 troops | se more forces . weatherst of Canada's a,500 troops
"Several heavy metals" found in levels abo | ve safe drinking-water standards . TVA pledges cleanup; | ve drinking drinkingwaterwater standard . . TV . pledges cleanup;


145it [00:43,  3.37it/s]


Epoch 14 train loss: 0.03537089057780545
Iran set to send female athlete to next year's Winter Olympics for | the first time . One female skier, three male skiers will be | Darfur time time . One, skier, " male "ers, be
Ex-PM Thaksin Shinawatra convicted of corruption, sentenced | to jail . Court rules Thaksin facilitated wife's purchase | . jail . Court rules servein facilitdated wife ins purchase


145it [00:42,  3.42it/s]


Epoch 15 train loss: 0.03182108311046814
NEW: California Public Utilities Commission passed ban Thursday . | Phone records show engineer texting during work hours on | Phone records shows texts onring t ons on
The Rev. Jeremiah Wright seeks to explain the | ology of black church . Wright says criticisms come from those | ology of black church . Wright says criticisms know from hose


145it [00:42,  3.43it/s]


Epoch 16 train loss: 0.02937748265163652
 | new Terminal 5 . The new building took more than 15 | er Terminal 5 . The building building took more than 15
NEW: Gunfire from pirates forces sailors, who did not return | fire, to turn back . NEW: FBI launches criminal investigation into hijack | to, to turn back . NEW: FBI launches criminal investigation into hijack


145it [00:42,  3.39it/s]


Epoch 17 train loss: 0.025108223151544044
NEW: Blagojevich declines comment, says "I'm enjoying Dis | ney World with my kids" Blagojevich faces up to 20 years in prison | s World my my heart" Bara faces World to 20 prison to prison
NEW: All vehicles banned in city of Amara through Thursday, | provincial governor says . Death toll continues to rise as In | provincial governor says by Death toll continues toll for In


145it [00:42,  3.45it/s]


Epoch 18 train loss: 0.02279952210855895
David Cook became the seventh "American Idol" in the spring . | Cook wrote or co-wrote most songs on debut, even | Cook wrote or co-wroterote songs, debut on debut
Celebrated Nigerian author has been resident in New York for over 20 years | . Most famous work, "Things Fall Apart" tra | countr Most famous A, "This Fall Fall Fallpart Fall out


145it [00:42,  3.42it/s]


Epoch 19 train loss: 0.028387771970753012
Fighting comes day after Rwanda arrests Tutsi rebel leader, Lauren | t Nkunda . Neighboring nations have been on different | t .k N . Neighboring mans have been Shi different
Maryland couple operates Rude Ranch Animal Rescue out | of their home . Bob and Katherine Rude currently care | of their home . People and Kathdeine keepde inly in


145it [00:42,  3.40it/s]


Epoch 20 train loss: 0.022372872343864935
1 person dead, 116 ill in northeastern part of the state . | Focus of investigation is restaurant in Locust Gro | Fcc0pa investigation under under about Louump Gro
Heavyweight champ Jack Johnson was arrested around 1908 . His offense: | Transporting a white woman across state lines for "immoral | Transporting a white woman a line other lines for 40immoral


2it [00:02,  1.17s/it]


Valid loss: 0.43674537539482117
Actor Ashton Kutcher complained on Facebook that men' | s rooms don't have diapering tables . | ra hundreds out of the theft as and why inflation .
The abducted workers were seized in an attack on an oil field last | week . They are from the Philippines, Bangladesh, Austria, the | "gs "gight" Auduned to goet,"


145it [00:42,  3.44it/s]


Epoch 21 train loss: 0.01858798562838086
Ex-presidents of Mexico, Brazil and Colombia urge decriminal | ization of marijuana . Leaders urge treatment for addicts, | ization would marijuana . Leaders should treatment for addiction,
Organ shortage in rich states has created a trade from poorer | countries . "Transplant tourists" travel to poor countr | countries . "Transplant tourists" travel to poor countr


145it [00:42,  3.38it/s]


Epoch 22 train loss: 0.01736916198298849
Passenger questioned about large amount of cash . He says the money was | from sale of political merchandise . ACLU | from sale of political pmerleandisisis ACLU
Swat Valley region used to be a popular destination for tourists and skiers . | Taliban are imposing their strict brand of Islamic law in | Taliban are imposing their strict brand of Islamicc in


145it [00:42,  3.38it/s]


Epoch 23 train loss: 0.01890337569317941
Brittanee Drexel disappears on trip to Myrtle Beach, South Carolina | . Mom says she spoke to daughter but didn't know she was | . Mom says she spoke to daughter but didn't know she was
Nicole Suveges, 38, was part of team of a | cademic embeds advising military in Iraq . The political scientist | cademic embss advis military by killed Iraq . The political leader


145it [00:43,  3.37it/s]


Epoch 24 train loss: 0.018238013468939683
U.S. calls remarks vile, hateful, inc | iteful, praises U.N. condemnation . Dozens go | iteful, praises U.N. praiseation says Dozens go
Former FBI agent Bob Levinson disappeared in Iran in March 2007 . His | wife says she has done everything to draw attention to Levinson's | wife says she has a everything to draw attention to Levinson Jos


145it [00:42,  3.37it/s]


Epoch 25 train loss: 0.016980656300639285
Large Hadron Collider will have first attempt at circulating a | beam September 10 . It's the largest particle ac | beam September 10 . It's large largest bearticle ac
Randomly chosen, undecided Democrats will | watch debate while turning dials . Dials will rate | watch debate Howhile turning dial . . Dial poor will rate


145it [00:42,  3.39it/s]


Epoch 26 train loss: 0.01775279429195256
Cirque Du Soleil's "Kooza" | went looking for juggler with 11 world records | 1,went looker for juggler with Friday world records
Tamil rebels say Sri Lankan forces ignoring orders to | end military operations Sri Lanka: Forces ordered to cease | end military operations:ri Lanka: Forceseded message cease


145it [00:42,  3.38it/s]


Epoch 27 train loss: 0.012975201640149642
Celtic defeat rivals Rangers 2-0 to win Scottish League | Cup final at Hampden . Extra-time goals by | Cup final at Hampshir . Extra-time goals a
New digital topographic map reveals more of Earth than | ever before . Images were taken by Japanese imaging | ever before . Images were taken by Japanese imaging


145it [00:43,  3.35it/s]


Epoch 28 train loss: 0.010615310782630895
Pakistani Taliban's interpretation of sharia includes banning | girls from school . Deal with the Taliban comes after a visit | girl school schoolm school . Deal with " Taliban,s fro a visit
48-year-old Boy George on trial in London in November . Per | former faces charges of false imprisonment relating to 2007 incident . Georg | former faces imprison of false imprisonment .ing to 2007 incident . Georg


145it [00:43,  3.36it/s]


Epoch 29 train loss: 0.009465205194107417
Most common car safety system is ABS or anti-lock | brake system . EBD: subsystem of | brake system . EB:: subure systeme system of
More than 600 supporters watch their team suffer its sixth los | s in seven games . Energie Cottbus are second | s in seven games . Energie Cottbusive second


145it [00:42,  3.38it/s]


Epoch 30 train loss: 0.009462838530026633
Liu Yan is paralyzed while rehearsing her | solo dance for the Olympics' opening . Doctors say she will not | solo dance for the Olympics'sing . Doctors say she had be
Second-placed Ajax held 2-2 at home by V | itesse Arnhem in Dutch Eredivisie . | itesee Arnmm Dutch Dutch Eredivisie .


2it [00:02,  1.23s/it]


Valid loss: 0.5029298365116119
Red Bull's No.1 driver Daniel Ricciardo says Form | ula One is a "crueler sport" Last year | ation . Man was amost drop to those shot Monday, spokesman,
Jina Krause-Vilmar: On International Women' | s Day, focus on helping refugee women adapt to new | on curtenies, Chond . In0 have presented the


145it [00:43,  3.37it/s]


Epoch 31 train loss: 0.012501134479354168
NEW: Paper says rejection part of standard back-and-forth | procedure . John McCain writes essay in defending his Iraq policy | procedure . John McCain's essay  defending his Iraq policy
Suit alleges crew sent into pirate-infested waters without protection . | Representatives of ship's owners have yet to comment . | Representatives of ship's owners have yet to comment .


145it [00:42,  3.40it/s]


Epoch 32 train loss: 0.011527195906844633
Ruben Navarrette: McCain tried to get under Obama's skin with attack | s . Obama remained cool and wound up ending strongly in the | s . Obama remained for thing "ed ended strongs in
Lawyer who blamed Guatemala's president for two slayings was killed Sunday | . In video released after his death, lawyer says to blame president if | . In video,, his wife, lawyers he blame president if


145it [00:42,  3.41it/s]


Epoch 33 train loss: 0.01121688674621541
Pregnant woman reportedly found husband, other man dead in a | partment . Slain students were international Ph.D. candidate | partment . Slain .s were international Ph.D. candidate
Trump advises to take advantage of low prices, get seller | to do the financing . Trump: Obama rebuilding U. | to do the fillivo . Trump: Obama rebuilding U.


145it [00:42,  3.38it/s]


Epoch 34 train loss: 0.009480949645412379
Poland was ruled by Soviet-backed regime after the Second World War | . Solidarity movement became a key factor in the fall of | . Solidarity movement became a key factor in the fall of
Adam Sandler wore cast for part of "Bedtime S | tories" filming . Some scenes required a body double after star | tories" filming . Some scenes required a body double after star


145it [00:43,  3.35it/s]


Epoch 35 train loss: 0.012617782914432985
A good corn-gluten mix cures brown patch | es and weakens most fungi . Brown patches are often | es and weakens most fungi . Brown patches are often
"Slumdog Millionaire" wins eight Oscars, including best picture | and director . Sean Penn wins best actor for "Milk" Kat | and director . Sean Penn wins Rock actors "Myk" Kat


145it [00:42,  3.41it/s]


Epoch 36 train loss: 0.013486170826544021
Thousands of items sent from around globe are housed on Tech | campus . Items range from letters from kids | campus . Items range from letters from kids
NEW: Suspected spree killer described as meth addict | . Nicholas T. Sheley, 28, did not enter a | . Nichola Green T. Sheley due did with did not enter a


145it [00:42,  3.39it/s]


Epoch 37 train loss: 0.012369921341024596
Fire reported as plane goes off runway in freezing rain at Lub | bock, Texas, airport . Two crew members hospitalized with apparent | bock, Texas, Texas . Two crew members hospitalized with apparent
Rodent the size of small car discovered in Urugua | y . Scientists say the rodent must have weighed 1,000 | y . Scientists say the extagent must have weighed 1,000


145it [00:42,  3.42it/s]


Epoch 38 train loss: 0.008941682017055051
NEW: Workers release one of five people being held hostage at Ca | terpillar factory . Workers angry that Caterpillar | terpillter factory . Workers angry that Caterpillra
Authorities order evacuation of 8,000 people who live near Galeras volcano | . No injuries reported; volcano erupts for second time in less than a | . No injuries says; volcano erupts for second time in less than battle


145it [00:43,  3.37it/s]


Epoch 39 train loss: 0.01142408332048819
NEW: First United Nations group to address piracy meets in New York | . Two or three speedboats were chasing the Dutch contain | . Two or three speedboats were chasing the air contain
Director Antoine Fuqua is filmmaker behind CNN's "From | MLK to Today" Fuqua discusses life, experiences with | breaLK to Today" Fuqua discusses life, experiences with


145it [00:42,  3.38it/s]


Epoch 40 train loss: 0.009240936725560962
Tom Wilkerson: Pirates based in Somalia represent threat to crew | s and cargo . He says U.S. strategy so far has | s and cargo . He says U.S. far so far has
French FM Kouchner has told France to prepare for po | ssibility of war with Iran . Was a surprise appointment | ssualbility of war with Iran . Wasual surprisedment


2it [00:02,  1.27s/it]


Valid loss: 0.5354069173336029
Pope has talked of retirement before, but this time he says he | think papacy will end after no more than five years . Francis | 's family . Gen. Citepew about self,
Two American women arrested for carving initials into a Coloss | eum wall . Meanwhile, Egypt investigating Russian porn | dam Hand of an end . Oneic land in an au


145it [00:42,  3.39it/s]


Epoch 41 train loss: 0.008090863280512136
New Orleans elbowed New York aside as the | best city for fine dining . Portland, Oregon, swe | best city for fine dining . Portland, Oregon, swe
London football club Chelsea appoint Carlo Ancelotti as their | new manager . Ancelotti has been in charge of Italian giant | new manager . Ancels has been in charge of Italian giant


145it [00:42,  3.38it/s]


Epoch 42 train loss: 0.00644427841240219
Isaac Toussie was involved in mortgage scheme | in New York . Answers wanted about how thorough of an | in New York . Answers wanted about how thorough of an
Kabul hospital's operating budget is less than $1,200 a month . | Government pays salaries, but even basic supplies | Government pays salarar, but evenicbasic supplies


145it [00:42,  3.38it/s]


Epoch 43 train loss: 0.005888602222669227
Jessica Cox, 25, was born with no arms . Accusto | med to not having arms, she decided against using | med to not having arms, she decided against noting
Mexican media says local chief put under house arrest for 45 days . With Francis | co Velasco Delgado's removal, | co Velasco engagegad M's removal,


145it [00:42,  3.38it/s]


Epoch 44 train loss: 0.007377512388509409
Women older than 55 make up the fastest-growing age group | on Facebook . Expert says the site has hit a "tipp | on Facebook . Expert sai the site has hit a "Wive
Presley earned more than Justin Timberlake  | 44M | 44M,, Madonna proud$40M last last year . Cartoon


145it [00:42,  3.40it/s]


Epoch 45 train loss: 0.010811761856592935
NEW: The woman became ill on January 5 and died on Saturday . | NEW: Two tests on the woman are positive for H5N1  | NEW: Two tests on the woman are positive for H5N1 .
Attorney general: Deputy shot himself twice in the chin, once | in side of the head . Forensic examination on de | in side of the head . Forensic examin' on de


145it [00:42,  3.41it/s]


Epoch 46 train loss: 0.007335285280413668
Gulfport, Mississippi, mayor "will continue working to rebuild  | our city" Mayor Gregory Brent Warr, says he pleaded | our city" Mayor Gregory Brent Warr says says he pleaded
Chris Nowinski played football at Harvard, then wrestle | d professionally . Still suffers effects of six concus | d professionally . Still suffers effects of six conacceptus


145it [00:42,  3.40it/s]


Epoch 47 train loss: 0.0054973444869292195
Former FBI agent Bob Levinson disappeared in Iran in March 2007 . His | wife says she has done everything to draw attention to Levinson's | wife says she has done everything to draw attention to Levinson's
Palin meets with world leaders on sidelines of U.N. world | summit . Meeting with Kissinger stretches into an hour | summit . Meeting with Kisseer intoes into an hour


145it [00:42,  3.40it/s]


Epoch 48 train loss: 0.009034820230966755
NEW: Two Canadians, yachting tycoon, sister of | Bollywood actor among dead . Rabbi Gavri | Bolly, actor among dead in Rabbb Glyri
NEW: Barack Obama, John McCain lay roses at ground zer | o . New polls show both candidates with slim lead | o . New polls show both candidates in s inm lead


145it [00:42,  3.38it/s]


Epoch 49 train loss: 0.012979863185820908
A McDonald's in Washington goes silent as patro | ns watch inaugural . "This is America happening," | ns watch inaugural . "This is America happening""
Shaun Gopaul, Alex Jimenez served together in military | starting in 2005 . Gopaul: "He saved our | starting the 2005 . 48-pa says: "He saved theour


145it [00:42,  3.41it/s]


Epoch 50 train loss: 0.021682912869186236
Opposition parties seek to oust Prime Minister Stephen Harper's | government . Harper's Tories gained seats in Canada's | government . Harper's gainies gained seats in Canada's
Expert: Trial of Nazi war crimes suspect John Demjanjuk could be last of its | kind . Leading Nazis prosecuted at Nur | kind . Leading delays pros shouldcuted at Nur


2it [00:02,  1.23s/it]


Valid loss: 0.5322788059711456
A Civil War ironclad must be moved so the shipp | ing channel can be deepened . Wreckage lies a couple miles | . The practices that Manchester tied Trafford club for Argentin
Bafetimbi Gomis collapses within 10 minutes of | kickoff at Tottenham . But he reportedly left the  | y in Merrill Lynch politics . Worked by Japan to 27 coins


145it [00:59,  2.45it/s]


Epoch 51 train loss: 0.014196024886493025
Spain commemorates fifth anniversary of Madr | id train bombings . March 11, 2004, attacks killed 191 | demic train bombings . March 11, 2004, attack, 19 191
Bollywood superstar makes sporting debut as owner of Kolk | ata Knight Riders . Shah Rukh Khan still wildly | atu Knight wildiders . Shah Ruk wild Khan still wildly


145it [16:31,  6.84s/it]


Epoch 52 train loss: 0.006850670865769016
Rafael Nadal defeats David Ferrer in stra | ight sets to lift Barcelona Open title . World number one wins 6-2 | ight sets to lift Barcelona Open title . World number one of 6-2
Prince Harry describes Princess Diana as "the best mother in the | world" He asks for her to be remembered as "fun- | world" He asks for her to be remembered out "fun-


145it [15:46,  6.53s/it] 


Epoch 53 train loss: 0.004562510073120738
NEW: Opposition accuses PM Harper of putting his job ahead of Canada | 's interests . Move postpones opposition parties' | 's interests . Move postpones opposition parties'
Survey: 68 percent of U.S. hotels said they had energy | -efficient lights . InterContinental aims to cut energy | -efficient lights . InterContinental aims to cut energy


145it [00:42,  3.41it/s]


Epoch 54 train loss: 0.003947840365260069
White House says Obama is not going to make any decision that imper | ils safety . Senate votes 90-6 on a measure to | ils safety . Justicee votes 90-6 only measures
Analyst: Corporate lobbyists will not be only ones | heard by Obama's transition staff . Transition team' | heard by Obama's transition staff . Transition team'


145it [18:10,  7.52s/it]


Epoch 55 train loss: 0.004128364022371584
CNN's MainSail host Shirley Robertson is a double Olympic gold | medalist . Robertson has never climbed a big mast before | medalist . Indianson has never climbed a big mast before
Hillary Clinton owes $2.3 million in campaign debt from | failed 2008 presidential bid . Her campaign committee reported having $2.6 | failed 2008 presidential bid . Her campaign committee reported having $2.6


145it [16:44,  6.93s/it]


Epoch 56 train loss: 0.004081584033074564
Nations hoping for deal in which Libya would compensate terrorism victims . | Eights acts would be covered in possible agreement . Libya has | Eights acts would be covered in possible agreement . Libya has
Debra Lafave's probation forbids her | to have contact with anyone under 18 . Florida's Corrections Department | to have contact under anyone under 18 . Florida's Corrections Department


145it [00:44,  3.23it/s]


Epoch 57 train loss: 0.004743582007057708
White House spokeswoman Dana Perino gave last news briefing Friday . | Perino, 36, succeeded Tony Snow, is only second | Perino, 36, succeeded Tony Snow, is only second
Government may use eminent domain to seize land needed for | 9 | 91111 memorial . Hijacked United Flight 93 crashed


145it [14:50,  6.14s/it]


Epoch 58 train loss: 0.006751429034268548
Tino Schaedler, colleagues talk about NAU, | a remote collaboration project . They discuss the concept of | TV remote collaboration project . They discuss the concept of
Some 200 Palestinian refugees from Iraq will go to Iceland, Sweden . | 2,300 Palestinians "are living in desperate | 2,300ars "are living in d this


145it [00:55,  2.63it/s]


Epoch 59 train loss: 0.006731619769386177
Harry serves in British Army and spent 10 weeks in Afghanistan this | year . Decision was made to pull prince from Afghanistan a | year . Decision made made to pullcknit from Afghanistan a
Indian army troops use helicopters, boats to deliver supplies, rescue village | rs . NEW: Survivor says he lost wife, children, parents, nin | rs . NEW: Survivor says he lost wife, children, children, nin


145it [00:45,  3.18it/s]


Epoch 60 train loss: 0.0060081466004766266
China extends deadline for Internet filtering software to accompan | y PCs . International backlash, unreadiness of | s PCs . International backlash, unreadiness of
Old dance hall one of few underground gay clubs in Sha | nghai for older generation . Caters for a differen | nghai for older physicalration . Caters for differen differen


2it [00:02,  1.21s/it]


Valid loss: 0.5874077379703522
Media coverage of ISIS could spur "real | wave of Islamophobia," report's edit | , for a grown from tran's debate . Mayor Lev
Interim report exposes delays and inaction after MH3 | 70 disappeared . Flight carrying 239 people and crew has | medale on six . Cupuritbur scored by


145it [03:43,  1.54s/it]


Epoch 61 train loss: 0.006889753351951467
Government gives First Secretary Mark Sullivan 48 hours to | leave the country . Sullivan is accused of meddling in | rican the country . Sullivan is accused Somali medd in
A car with a cheaper sticker price can often cost consumers | more in the long run . Report based on comparison | Hillsmore in Mogadishu long run . Report based on comparison


145it [16:23,  6.78s/it] 


Epoch 62 train loss: 0.013508581334789252
Businessman Blake Jones seeks to develop renewable energy sources . Jones | ' passion for solar sparked a radical career shift during | ' passion for solar sparked a radical career shift during
Sixteen U.S. embassies in Europe receive mail containing white powder | . Tests show powder harmless in 15 cases; results pen | . Washingtons show powder harmless in 15 cases; results pen


145it [18:02,  7.47s/it] 


Epoch 63 train loss: 0.010697818528218515
Peter Bregman: In the boom, people sacrific | ed their joys for more money . He says bey | ed the jo joys for more money . He says ty
Jackson family has been in spotlight along with Michael . Father  | Joseph was a sometimes rough taskmaster . Sibling | Joseph was a sometimes rough taskmaster . Sibling


145it [17:38,  7.30s/it] 


Epoch 64 train loss: 0.009684301604484689
University of Memphis basketball has brought together whites, | blacks, fan says . Memphis team has a chance to do | blacks, fan says . Memphist team has a full to do
Axed Chrysler dealership holds central place in tiny | Georgia crossroads . Generations of families have purchased vehicle | Georgia crossroads . Lerations of families have purchased vehicle


145it [00:44,  3.28it/s]


Epoch 65 train loss: 0.00993766696352897
Immigration raid led to detention of 28 people in Washington state . | Raid was first of its kind under the new Obama administration . Homeland | Raid was first of its kind under the Obama Mar administration . Homeland
New Yorker article says Congress authorized up to $400 million for cover | t ops in Iran . Journalist Seymour Hersh says | tsifs in Iran . Journalist Sey Womanour Her hav says


145it [00:57,  2.53it/s]


Epoch 66 train loss: 0.007584977988153696
NEW: British counter-terrorism experts re-in | spect Bhutto's vehicle . NEW: Bhutto's assassination was her own | spect Bhutto's vehicleing NEW: Bhutto wass assassination was her own
NEW: "There's no cause of panic," says New Del | hi police spokesman . Security raised at all the nation's airports | hi police spokesman . Security raised at all the nation's airports


145it [16:49,  6.96s/it]


Epoch 67 train loss: 0.006443165361495882
Newer Acura MD | SUV, but it's a deluxe version . BM | SUV, but it's a deluxe version . BM
Aung San Suu Kyi says she's committed to pursuing | a dialogue with the ruling junta . Suu Kyi meets with three | a dialogue with the ruling junta . Suu Kyi meets with three


145it [17:25,  7.21s/it]


Epoch 68 train loss: 0.004655806566909726
Seizures in teenagers can be caused by dozens of disorders | , genetics . Seizure said to have caused the death of | , genetics . Seizure said to have caused the death of
CNN journalist in Denver, Colorado, weighs in on how economy affect | ing city . UPS store owner says business down at least 25 | ing city . UPS store owners business down at least 25


145it [18:20,  7.59s/it]


Epoch 69 train loss: 0.006930545105695211
Michael Lamar strikes an Obama-like figure in ads for | a bank in Turkey . Lamar worked for JP Morgan Chas | a bank in Turkey . Lamar worked for JP Morgan Chas
Heidi Newfield is up for five Academy of Country Music A | wards . Newfield pursuing solo career after being lead singer | wards . Newfield pursuing solo career after leading lead singer


145it [18:23,  7.61s/it]


Epoch 70 train loss: 0.00577883155342063
Australian Tim Cahill denies Liverpool win with late goal as Ever | ton draw 1-1 . Liverpool had grabbed the lead when Steven | ton draw 1-1 . Liverpool had grabbed the lead when Steven
More than 340 people quarantined in Hong Kong follow | ing single case of H1N1 . Those isolated due to | ing single case of H1N1 . Hohose isolated due to


2it [00:02,  1.18s/it]


Valid loss: 0.6070117354393005
Smartwatches have hidden, darker side the companies selling | them are unlikely to talk about, says Andre Spicer . | s . Car-Rahalons infects in UA
Timothy Stanley: GOP senators' letter to Iranian leaders seem | s extraordinary . But undermining a president's foreign policy | some their police . Court: Police: Ga police the two politician politician


145it [16:31,  6.84s/it]


Epoch 71 train loss: 0.007406085699090156
Mom thinks girl was abused while in the care of a baby | sitter, attorney says . Mother had no idea daughter had been abuse | sitter, attorney says . Mother had no idea daughter had been abuse
Expert: Many looking for happiness don't realize they | already have it . Funeral director finds happiness in job | already have it . Funeral director finds happiness in job


145it [18:08,  7.50s/it] 


Epoch 72 train loss: 0.006344721427765386
Chris Shurn served four years in San  | . Shurn earned a GED and nearly completed | . Hehurn earned a fullE complete and nearly completed
Scotland Yard releases report into assassination of Benazir Bhutto . On | ly apparent injury was a major trauma to the right side of the | ly apparent injury was a major traums to the right side of the


145it [18:25,  7.62s/it] 


Epoch 73 train loss: 0.005207455572511615
David Hawkins: Admission tests are wrongly used to | rank college quality . Hawkins says Baylor University's in | rank college quality . Hawkins says Baylor University's in
Officials, academics gather to discuss North Korea's energy need | s . Diplomacy was part of the discussion at Georgia | s . Diplomacy was part of the discussion at Georgia


145it [00:42,  3.40it/s]


Epoch 74 train loss: 0.005376541828094371
Begala: McCain's VP choice unqualified to be heart | beat from the presidency . Choice of Alaska Gov. | beat from the presidency . Choice su Alaska Gov.
Pakistan: Fighting began after Indian soldiers crossed the Line of Con | trol . India accused Pakistan of attacking one of its patrol | trol . India accused Pakistan of attacking one of its patrol


145it [15:44,  6.51s/it]


Epoch 75 train loss: 0.004152925698696797
Sheikha Lubna was first female minister in the United Arab Emirates . | Openness to foreign ownership is the "natural path | Openness to foreign ownership is the "natural path


145it [11:42,  4.85s/it]


Epoch 76 train loss: 0.004000388553912013
NEW: Stephen Morgan, 29, arrested in university student's | shooting death . Suspect's sister earlier urged him to turn himself | shooting death . E's sister earlier urged him to turn himself
University students at ETH Zurich have designed and built a | robotic boat . The boat needs no sailors and uses  | robotic boat . The boat needs no sailors and uses 


145it [16:09,  6.68s/it]


Epoch 77 train loss: 0.005044551196122735
Bernard Kerik is accused of failing to report more than $500 | ,000 in income . Kerik is the former New York City police | ,000 in in Afghanistan . Kerik is the former New York City police
Sri Lankan team agreed to replace India in Pakistan after Mumbai attack | s . International teams have long expressed concern about security in Pakistan . | s . International teams have long expressed concern about security in Pakistan .


145it [16:21,  6.77s/it]


Epoch 78 train loss: 0.0038044377398709283
Justice Ruth Bader Ginsburg expects to return to the be | nch by February 23 . The only woman on the Supreme Court | nch by life 23 . Command only woman on the Supreme Court
Italian mountaineer Reinhold Messner has inspired others to seek | adventure . Psychologist Dr. James Thompson says adventurers tend | adventure . Psychologist Dr. James Thompson says adventurers tend


145it [00:42,  3.40it/s]


Epoch 79 train loss: 0.002624312272809189
Richard Cooey, set to die Tuesday, has exhausted | most state, federal appeals . Justices are expected to decide | most state, federal appeals . Justices are expected to decide
Analysts say the race will be tight . Some say the Hezbollah- | dominated alliance may win a parliamentary majority . Turn | dominated alliance may win a parliamentary majority . "urn


145it [18:19,  7.58s/it]


Epoch 80 train loss: 0.005865099048241973
NEW: NTSB says "very little" remains found in wreckage . NTSB | : It could take "weeks, perhaps months" to | : It could take "weeks,hahaps months" to
No one submitted minimum $3.2 million bid for Michael | Vick's mansion . Vick, former quarterback of the | Vick's mansion . Vick, married quarterback of the


2it [00:02,  1.22s/it]


Valid loss: 0.6297488808631897
Parents of teens should have a criminal defense lawyer in mind  | -- just in case, says author . Author Lisa Green calls this the | C . Ecks, who was in exposure also and accompani
Media coverage of ISIS could spur "real | wave of Islamophobia," report's edit | ," for December . Judge he could lifts join spread its


145it [17:13,  7.13s/it]


Epoch 81 train loss: 0.0051807226275960945
We ask, should humanity expand into space | Space movement say space is humanity's future . But others say | Space movement say space is humanity's future . But others say
Terkel won Pulitzer Prize in 1985 for book about World War | II, "The Good War" Son: "My dad led | II, "The Good War" Son: "My dad led


145it [00:45,  3.22it/s]


Epoch 82 train loss: 0.00429867510202116
President Bush plans to ask Congress to lift offshore drilling ban Wednesday | . McCain says he opposes ban; states should decide . Curr | . McCain oppose he opposes ban; states should decide . Curr
"Watchmen" premiered in more theaters than any other R | -rated movie in history . Tyler Perry's "Mad | -rated movie in history . Tyler Perry "s "Mad early


145it [15:56,  6.60s/it]


Epoch 83 train loss: 0.004926049444374853
Actress Natasha Richardson fell on a beginners' trail in  |  | ueuebec, Canada . Actress had by "visible
Anh "Joseph" Cao is first Vietnamese-American elected to | the U.S. House . Cao, of Louisiana, defeated | the U.S. House . Cad, of Louisiana, defeated


145it [00:45,  3.19it/s]


Epoch 84 train loss: 0.004546508877056426
Most common car safety system is ABS or anti-lock | brake system . EBD: subsystem of | brake system . EBD: subsystemial
When a business scam fails, it tends to fail in | rather grand fashion . 360 B.C. scam: S | rather grand fashion . 360 .. scam. scam: S


145it [16:10,  6.69s/it]


Epoch 85 train loss: 0.005084355458906242
Ukrainian-born John Demjanjuk loses an appeal to avoid deportation . German | authorities seek him for alleged involvement in Nazi camp killings . The retire | authorities seek him for alleged involvement in Nazi to ins . The murder
London football club Chelsea appoint Carlo Ancelotti as their | new manager . Ancelotti has been in charge of Italian giant | new manager . Anselotti has been in charge of Italian giant


145it [16:22,  6.78s/it] 


Epoch 86 train loss: 0.007442781810873541
Jet's left engine fell off as plane traveled between 25 | 0 and 300 kmh, reports say . No traces of | 0 and 300 kmh, reports say . No traces of
Laborers in Palisades Park say two days of | work at $90 | work at $90mostday is a  . . They hope


145it [03:48,  1.58s/it]


Epoch 87 train loss: 0.008169963888438611
Thousands of items sent from around globe are housed on Tech | campus . Items range from letters from kids | campus . Items range from letters from kids
A special memorial has marked the 20th anniversary of Hills | borough tragedy . 96 Liverpool supporters died in crush at | borough orough . 96 Liverpool supporters died in crush at


145it [15:53,  6.58s/it] 


Epoch 88 train loss: 0.005615780794800356
Woman travels from Kenya to India for a tricky third kidn | ey transplant . Lorna Irungu suffers from l | ey transplant . Lorna Irungu suffers from l
Richardson's death raises question: When should you go to E | R after head injury | R after head injury,zzzziness, vomitings


145it [00:43,  3.35it/s]


Epoch 89 train loss: 0.004985615005716681
Casey Anthony's defense team has big challenge, experts | say . Lack of cause of death, physical evidence could hinder prosecution | say . Lack of cause of death, physical evidence could hinder prosecution
FIFA will announce its Player of the Year for 2008 in Zurich Monday evening | . Cristiano Ronaldo has been tipped as a favorite to win | . Cristiano Ronaldo has been tipped as a favorite to win


145it [16:24,  6.79s/it]


Epoch 90 train loss: 0.007083743043115427
Carly Fiorina: It's understandable that people are outraged | at Wall St. pay . Fiorina says government shouldn't set | at Wall St. pay . Fiorina says government shouldn't set
Ten-minute message delivered by Ayman al-Zawahiri, a | native of Egypt . In address, al-Zawahiri says violence a " | native of Egypt . Inever, who-Zawahiri a violence a "


2it [00:02,  1.26s/it]


Valid loss: 0.644891619682312
Smartwatches have hidden, darker side the companies selling | them are unlikely to talk about, says Andre Spicer . | s . White House: President Bush-goal Presidenth re politically
Indian broadcaster Puthiya Thalaimurai d | rew protests for a show about traditional necklaces worn | . don't, his mother, don't, his mother .


145it [16:24,  6.79s/it]


Epoch 91 train loss: 0.006986500144582884
Habitual felon likely to avoid charges in presumed murders | of four people . Scott Kimball's plea deal includes revealing | of four people . Scott Kimball's plea deal includes revealing
Janelle Monae popping up on commercial, on tour | , on TV . Singer has alter ego: Cindi Maywea | on on TV ship Singer has alter ne: Cind: Maycrafta


145it [17:07,  7.09s/it]


Epoch 92 train loss: 0.005264191368016704
President Cristina Fernandez de Kirchner says addicts should not | be persecuted . Argentine government wants Congress to pass | be persecuted . Argentine government wants Congress to pass
Anti-election demonstrators, Indian security forces clash during state | elections . Police say they used batons to restore order, and | elections . S say they used batons to Islamore orders and


145it [15:51,  6.56s/it]


Epoch 93 train loss: 0.004917647458356002
Vimlendu takes a group of children to see a village | in Uttaranchal . His aim was to show them the differen | in Uttaranchal . His aim was to show them the differen
Eight Florida teens to be tried as adults in videotaped beating case | . Video shows 16-year-old girl punched by | . Video shows 16-year-old girl punched by


145it [12:34,  5.20s/it] 


Epoch 94 train loss: 0.00309387323735603
Alleged pirate known in official documents as "Pirate Defendant" Alleg | ed pirate brought to Djibouti aboard the USNS | ed pirate brought to Djibouti aboard the USNS
Larry Zeiger moved from Brooklyn to Miami | in 1957 in pursuit of a radio hosting gig . | in 1957 in pursuo of a radio .ing gig .


145it [00:45,  3.20it/s]


Epoch 95 train loss: 0.0036947479054074864
"Flashpackers" are professionals older than 30 who | prefer hostels to hotels . Hostels in New York, | preser hostels to hotels . Hostels in New York,
Harrison's star next to Hollywood headquarters of Capitol | Records . Harrison, who died of cancer in 2001, is second | Records . Harrison, who died of cancer in 2001, is second


145it [00:43,  3.37it/s]


Epoch 96 train loss: 0.002400477565343267
Andy Roddick forced to retire from his  | 's Club semifinal with ankle injury . The injury means | 's Club semifinal with ankle injury . The injury means
First Principal Voices round table debate takes place in Doha | ,  | , thatar . Olafur Grimsson states "it


145it [00:43,  3.30it/s]


Epoch 97 train loss: 0.002656286178509994
Compact-camera makers are testing the waters with high- | end features like GPS . Premium features like high-speed | end features like GPS . Premier features like high-speed
The company has become a huge name in communications in just 20 | years .  | years . moreualcomm has a portfolio 10 app


145it [00:42,  3.41it/s]


Epoch 98 train loss: 0.0026482165935609874
Dharavi, in the heart of Mumbai, is one of the | biggest slums in the world . The massive redev | biggest s s in the world . The massive redev
Justice Department must decide whether to pursue investigation . Former agent | John Kiriakou talked about interrogation technique on TV | John Kiriakou talked about interrogation technique on TV


145it [00:42,  3.42it/s]

Epoch 99 train loss: 0.004156963520779692
Ruben Navarrette: Hate crimes should be punished more severely . | He says hate crimes terrorize society as a whole . Navarrette | He says hate a terrorize society as a who call . Navarrette
Toobin: Ruling falls in line with recent court decision | s on death penalty . Justice Kennedy, in decision, said | s on death penalty . Justice Kennedy, in decision, said





In [11]:

from torchinfo import summary

print(summary(model))

Layer (type:depth-idx)                   Param #
Transformer                              5,121,024
├─ModuleList: 1-1                        --
│    └─Dropout: 2-1                      --
│    └─Dropout: 2-2                      --
├─ModuleList: 1-2                        --
│    └─DecoderBlock: 2-3                 --
│    │    └─MultiHeadAttention: 3-1      589,824
│    │    └─ModuleList: 3-2              --
│    │    └─ModuleList: 3-3              2,048
│    │    └─SwiGLU: 3-4                  3,145,728
│    └─DecoderBlock: 2-4                 --
│    │    └─MultiHeadAttention: 3-5      589,824
│    │    └─ModuleList: 3-6              --
│    │    └─ModuleList: 3-7              2,048
│    │    └─SwiGLU: 3-8                  3,145,728
│    └─DecoderBlock: 2-5                 --
│    │    └─MultiHeadAttention: 3-9      589,824
│    │    └─ModuleList: 3-10             --
│    │    └─ModuleList: 3-11             2,048
│    │    └─SwiGLU: 3-12                 3,145,728
│    └─DecoderBlock

In [12]:
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU], record_shapes=True, ) as prof:
    model(sequence.to(DEVICE), prev_target.to(DEVICE))

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

STAGE:2023-01-30 07:12:17 1952:4665970 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-01-30 07:12:17 1952:4665970 ActivityProfilerController.cpp:300] Completed Stage: Collection


TypeError: Transformer.forward() takes 2 positional arguments but 3 were given

In [None]:
from einops import rearrange

x = torch.rand(4, 3)
x[torch.tensor([0,1,0])]