In [1]:
# Install Lightning into the kernel's environment; it provides the `lightning` package
# imported further down. `-q` keeps pip's output quiet.
%pip install lightning -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch to the project folder on Drive. The relative paths used later in this notebook
# ("out", "data/...", "./checkpoints/...") are resolved against this directory.
import os
os.chdir('/content/drive/MyDrive/S22')

In [4]:
# Report whether PyTorch can see a CUDA device (the value is shown as the cell output).
import torch
torch.cuda.is_available()

True

In [5]:
import glob
import math
import sys
import time
from pathlib import Path
from typing import Optional, Tuple, Union

import lightning as L
import torch
from lightning.fabric.loggers import CSVLogger
from lightning.fabric.strategies import FSDPStrategy
from torch.utils.data import DataLoader

# # support running without installing as a package
# wd = Path(__file__).parent.parent.resolve()
# sys.path.append(str(wd))

from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset
from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops
from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor
from tsai_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters, load_checkpoint

In [6]:
# Run configuration.
model_name = "pythia-160m"  # passed to Config.from_name() to select the architecture
name = "redpajama"  # experiment name: used for the logger and the output sub-directory
out_dir = Path("out") / name  # checkpoints are written to (and resumed from) this directory
save_interval = 1000  # save a checkpoint every N optimizer steps (compared against step_count)
eval_interval = 1000  # run validation every N optimizer steps (compared against step_count)
eval_iters = 100  # size of the per-batch loss buffer allocated in validate()
log_interval = 100  # print training status every N iterations; also the CSV logger flush interval

In [7]:
# Hyperparameters
learning_rate = 6e-3
batch_size = 8 #32
micro_batch_size = 4 #8
gradient_accumulation_steps = batch_size // micro_batch_size
assert gradient_accumulation_steps > 0
#max_iters = 600000  # num_epochs * (epoch_size // micro_batch_size) // devices
max_iters = 60000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0
decay_lr = True
warmup_iters = 2000
lr_decay_iters = max_iters
min_lr = 6e-6

In [8]:
# Data proportions from https://arxiv.org/pdf/2302.13971.pdf Table 1
# Each entry is (filename prefix, sampling weight). create_dataloader() globs "<prefix>*"
# inside the data directory for every entry and normalizes the weights so they sum to 1.
data_config = [
    ("arxiv", 2.5),
    ("book", 4.5),
    ("c4", 15.0),
    ("cc", 67.0),
    ("github", 4.5),
    ("stackexchange", 2.0),
    ("wikipedia", 4.5),
]

In [7]:
# Snapshot of every int/float/str variable currently defined in this namespace whose name does
# not start with "_" (bools are ints, so flags such as decay_lr are captured too).
# The result depends on which cells have already been executed when this line runs.
hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")}
# CSV logger handed to Fabric in setup(); flushes every `log_interval` steps.
logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval)


def setup(
    devices: int = 4,
    train_data_dir: Path = Path("data/redpajama_sample"),
    val_data_dir: Optional[Path] = None,
    precision: Optional[str] = None,
    resume: Union[bool, Path] = True,
) -> None:
    """Create the Fabric object and launch `main` with it.

    Args:
        devices: Number of devices handed to Fabric. More than one selects an FSDP
            strategy wrapping `Block` modules; otherwise the strategy is "auto".
        train_data_dir: Directory with the training data, forwarded to `main`.
        val_data_dir: Optional directory with validation data, forwarded to `main`.
        precision: Fabric precision setting. When falsy, the value returned by
            `get_default_supported_precision(training=True)` is used.
        resume: Forwarded to `main`; True or a checkpoint path triggers resuming.
    """
    if not precision:
        precision = get_default_supported_precision(training=True)

    # Single-device runs let Fabric pick the strategy; multi-device runs shard with FSDP.
    strategy = "auto"
    if devices > 1:
        strategy = FSDPStrategy(
            auto_wrap_policy={Block},
            activation_checkpointing_policy={Block},
            state_dict_type="full",
            limit_all_gathers=True,
            cpu_offload=False,
        )

    fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger)
    fabric.print(hparams)
    fabric.launch(main, train_data_dir, val_data_dir, resume)

In [10]:
# Module-level placeholder that main() declares `global`; the only assignment to it inside
# main() is currently commented out, so it stays None.
model_copy = None

In [11]:
def main(fabric: L.Fabric, train_data_dir: Path, val_data_dir: Path, resume: Union[bool, Path]) -> None:
    """Build data loaders, model and optimizer, optionally resume, then run `train`.

    Args:
        fabric: The Fabric instance created in `setup`.
        train_data_dir: Directory holding the training data.
        val_data_dir: Directory holding validation data; a falsy value disables validation
            (create_dataloaders then returns None for the validation loader).
        resume: True picks the checkpoint in `out_dir` with the highest iteration number
            in its file name; a path resumes from that file; a falsy value skips resuming.
    """
    # Declared but never assigned in this function (the assignment below is commented out).
    global model_copy
    speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds")

    # Only rank 0 creates the output directory.
    if fabric.global_rank == 0:
        out_dir.mkdir(parents=True, exist_ok=True)

    config = Config.from_name(model_name)

    # The data seed is offset by the global rank, so each process gets a different seed.
    train_dataloader, val_dataloader = create_dataloaders(
        batch_size=micro_batch_size,
        block_size=config.block_size,
        fabric=fabric,
        train_data_dir=train_data_dir,
        val_data_dir=val_data_dir,
        seed=(1337 + fabric.global_rank),
    )
    if val_dataloader is None:
        train_dataloader = fabric.setup_dataloaders(train_dataloader)
    else:
        train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader)

    fabric.seed_everything(1337)  # same seed for every process to init model (FSDP)

    fabric.print(f"Loading model with {config.__dict__}")
    t0 = time.perf_counter()
    # Function-local imports; `nn` is not imported at module level in this notebook.
    import torch
    import torch.nn as nn
    def _init_weights(module: nn.Module) -> None:
            """Meant to be used with `gpt.apply(gpt._init_weights)`."""
            # Linear and Embedding weights ~ N(0, 0.02); Linear biases are zeroed.
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # NOTE(review): _init_weights is applied twice, once inside init_module(empty_init=True)
    # and once after it. Presumably the first call has no effect under empty_init and the
    # second one performs the real initialization -- confirm against the Fabric docs.
    with fabric.init_module(empty_init=True):
        model = GPT(config)
        model.apply(_init_weights)
    model.apply(_init_weights)


    # NOTE(review): hard-coded checkpoint. Its weights are loaded unconditionally here, and
    # are loaded over again below whenever `resume` is truthy.
    checkpoint_path = Path("out/redpajama/iter-023999-ckpt.pth")

    load_checkpoint(fabric, model, checkpoint_path)

    # Debug output: plain print() (not fabric.print) of one MLP weight matrix.
    print(model.transformer.h[0].mlp.fc.weight)

    fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.")
    fabric.print(f"Total parameters {num_parameters(model):,}")

    model = fabric.setup(model)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), foreach=False
    )

    # model_copy = model

    optimizer = fabric.setup_optimizers(optimizer)

    # Everything in this dict is saved to / restored from checkpoints by train() and fabric.load().
    state = {"model": model, "optimizer": optimizer, "hparams": hparams, "iter_num": 0, "step_count": 0}

    if resume is True:
        # Pick the *.pth file whose second "-"-separated name field (the iteration number) is
        # largest. max() raises ValueError when out_dir contains no *.pth file.
        resume = max(out_dir.glob("*.pth"), key=lambda p: int(p.name.split("-")[1]))
    if resume:
        fabric.print(f"Resuming training from {resume}")
        fabric.load(resume, state)

    train_time = time.perf_counter()
    train(fabric, state, train_dataloader, val_dataloader, speed_monitor)
    fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s")
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")



In [12]:
def train(
    fabric: L.Fabric,
    state: dict,
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
    speed_monitor: SpeedMonitorBase,
) -> None:
    """Run the training loop until the loader is exhausted or `max_iters` is reached.

    Args:
        fabric: Fabric instance used for backward, gradient clipping, printing and saving.
        state: Dict with at least "model", "optimizer", "iter_num" and "step_count".
            "iter_num" and "step_count" are updated in place; the whole dict is what
            gets written to checkpoints.
        train_dataloader: Yields token batches; each row must hold at least
            `model.max_seq_length + 1` tokens so inputs and shifted targets can be sliced.
        val_dataloader: Optional validation loader; None disables validation.
        speed_monitor: Receives throughput statistics after every iteration.
    """
    model = state["model"]
    optimizer = state["optimizer"]

    if val_dataloader is not None:
        validate(fabric, model, val_dataloader)  # sanity check

    # FLOP accounting is done on a second model instance created under the "meta" device.
    with torch.device("meta"):
        meta_model = GPT(model.config)
        # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild.
        # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs,
        # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead
        estimated_flops = estimate_flops(meta_model) * micro_batch_size
        fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}")
        # randint(0, 1, ...) yields all zeros; only the shape matters for the measurement.
        x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length))
        measured_flops = measure_flops(meta_model, x)
        fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}")
        del meta_model, x

    total_lengths = 0
    total_t0 = time.perf_counter()

    # The loop counter is written straight into state["iter_num"], starting from the value
    # already stored there (0 for a fresh run, the restored value after a resume).
    for state["iter_num"], train_data in enumerate(train_dataloader, state["iter_num"]):
        # Budget exhausted: write a final checkpoint and stop.
        if state["iter_num"] >= max_iters:
            checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth"
            fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}")
            fabric.save(checkpoint_path, state)
            break

        # determine and set the learning rate for this iteration
        lr = get_lr(state["iter_num"]) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr

        iter_t0 = time.perf_counter()

        # Targets are the inputs shifted left by one token (next-token prediction).
        input_ids = train_data[:, 0 : model.max_seq_length].contiguous()
        targets = train_data[:, 1 : model.max_seq_length + 1].contiguous()

        # True on every iteration except the last one of each accumulation window.
        is_accumulating = (state["iter_num"] + 1) % gradient_accumulation_steps != 0
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            logits = model(input_ids)
            loss = chunked_cross_entropy(logits, targets, chunk_size=0)
            # Scale so the accumulated gradient is the average over the window.
            fabric.backward(loss / gradient_accumulation_steps)

        # return

        # End of an accumulation window: clip, step, reset gradients, count the optimizer step.
        if not is_accumulating:
            fabric.clip_gradients(model, optimizer, max_norm=grad_clip, error_if_nonfinite=False)
            optimizer.step()
            optimizer.zero_grad()
            state["step_count"] += 1

        t1 = time.perf_counter()
        total_lengths += input_ids.size(1)
        speed_monitor.on_train_batch_end(
            (state["iter_num"] + 1) * micro_batch_size,
            t1 - total_t0,
            # this assumes that device FLOPs are the same and that all devices have the same batch size
            fabric.world_size,
            flops_per_batch=measured_flops,
            lengths=total_lengths,
        )
        if state["iter_num"] % log_interval == 0:
            fabric.print(
                f"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, LR: {lr:.6f}, iter time:"
                f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}"
            )

        # Validation and checkpointing are keyed on optimizer steps, not iterations.
        if val_dataloader is not None and not is_accumulating and state["step_count"] % eval_interval == 0:
            t0 = time.perf_counter()
            val_loss = validate(fabric, model, val_dataloader)
            t1 = time.perf_counter() - t0
            speed_monitor.eval_end(t1)
            # NOTE(review): the message is labelled "step" but prints iter_num.
            fabric.print(f"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms")
            fabric.barrier()
        if not is_accumulating and state["step_count"] % save_interval == 0:
            checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth"
            fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}")
            fabric.save(checkpoint_path, state)

In [13]:
@torch.inference_mode()
def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader) -> torch.Tensor:
    """Return the mean loss over at most `eval_iters` validation batches.

    The model is switched to eval mode for the duration of the call and back to
    train mode before returning.

    Args:
        fabric: Fabric instance; supplies the device for the loss buffer and rank-aware printing.
        model: Model exposing `max_seq_length`; called with the input token ids.
        val_dataloader: Yields token batches with at least `max_seq_length + 1` tokens per row.

    Returns:
        A scalar tensor with the mean loss over the evaluated batches. If the loader
        yields no batch at all, the mean of the zero-filled buffer (0.0) is returned.
    """
    fabric.print("Validating ...")
    model.eval()

    losses = torch.zeros(eval_iters, device=fabric.device)
    evaluated = 0
    for k, val_data in enumerate(val_dataloader):
        # The buffer only has `eval_iters` slots; stop before indexing past its end.
        if k >= eval_iters:
            break
        input_ids = val_data[:, 0 : model.max_seq_length].contiguous()
        targets = val_data[:, 1 : model.max_seq_length + 1].contiguous()
        logits = model(input_ids)
        losses[k] = chunked_cross_entropy(logits, targets, chunk_size=0)
        evaluated = k + 1

    # Average only the filled slots so a short loader does not pull the mean towards zero.
    out = losses[:evaluated].mean() if evaluated else losses.mean()

    model.train()
    return out

In [14]:
def create_dataloader(
    batch_size: int, block_size: int, data_dir: Path, fabric: L.Fabric, shuffle: bool = True, seed: int = 12345
) -> DataLoader:
    """Build a DataLoader over the weighted mix of datasets listed in `data_config`.

    One `PackedDataset` is created per `(prefix, weight)` entry of `data_config` from the
    files in `data_dir` whose names start with that prefix. The datasets are combined in a
    `CombinedDataset` using the weights normalized to sum to 1.

    Args:
        batch_size: Batch size of the returned DataLoader.
        block_size: Number of tokens per sample, passed to every `PackedDataset`.
        data_dir: Directory containing the prepared data files.
        fabric: Supplies `world_size` and `global_rank` for the per-process dataset setup.
        shuffle: Forwarded to every `PackedDataset`.
        seed: Seed forwarded to the datasets and to the combined dataset.

    Returns:
        A DataLoader over the combined dataset (loader-level shuffling is disabled).

    Raises:
        RuntimeError: If no file in `data_dir` matches any prefix of `data_config`.
    """
    datasets = []
    found_any_file = False
    for prefix, _ in data_config:
        # glob returns matches in arbitrary order; sort so the file list is deterministic.
        filenames = sorted(glob.glob(str(data_dir / f"{prefix}*")))
        found_any_file = found_any_file or bool(filenames)
        dataset = PackedDataset(
            filenames,
            n_chunks=4,
            block_size=block_size,
            shuffle=shuffle,
            seed=seed,
            num_processes=fabric.world_size,
            process_rank=fabric.global_rank,
        )
        datasets.append(dataset)

    # A dataset is appended for every prefix, so `datasets` itself is never empty when
    # data_config has entries; the meaningful check is whether any file was found.
    if not found_any_file:
        raise RuntimeError(
            f"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset."
        )

    weights = [weight for _, weight in data_config]
    sum_weights = sum(weights)
    weights = [el / sum_weights for el in weights]

    combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)

    return DataLoader(combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


In [15]:
def create_dataloaders(
    batch_size: int,
    block_size: int,
    fabric: L.Fabric,
    train_data_dir: Path = Path("data/redpajama_sample"),
    val_data_dir: Optional[Path] = None,
    seed: int = 12345,
) -> Tuple[DataLoader, DataLoader]:
    """Create the training loader and, if a validation directory is given, a validation loader.

    Both loaders use samples of `block_size + 1` tokens, because the token following each
    input position is needed as its target.

    Returns:
        `(train_dataloader, val_dataloader)`; the second element is None when
        `val_data_dir` is falsy.
    """
    # Arguments shared by both loaders; the extra token provides the next-token target.
    shared_kwargs = dict(
        batch_size=batch_size,
        block_size=block_size + 1,
        fabric=fabric,
        seed=seed,
    )

    train_loader = create_dataloader(data_dir=train_data_dir, shuffle=True, **shared_kwargs)

    val_loader = None
    if val_data_dir:
        val_loader = create_dataloader(data_dir=val_data_dir, shuffle=False, **shared_kwargs)

    return train_loader, val_loader

In [16]:
def get_lr(it: int) -> float:
    """Learning rate for iteration `it`: linear warmup, cosine decay, then a constant floor.

    * `it < warmup_iters`: rises linearly from 0 towards `learning_rate`.
    * `warmup_iters <= it <= lr_decay_iters`: cosine decay from `learning_rate` to `min_lr`.
    * `it > lr_decay_iters`: `min_lr`.
    """
    in_warmup = it < warmup_iters
    if in_warmup:
        return learning_rate * it / warmup_iters

    past_decay = it > lr_decay_iters
    if past_decay:
        return min_lr

    # Fraction of the decay phase completed so far, in [0, 1].
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    # Cosine weight goes from 1 (start of decay) to 0 (end of decay).
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + cosine_weight * (learning_rate - min_lr)

In [17]:
# Set PyTorch's float32 matmul precision mode to "medium" before training starts.
torch.set_float32_matmul_precision("medium")
# Launch training on a single device (setup() then uses strategy "auto" instead of FSDP).
# `resume` keeps its default of True, so main() resumes from a checkpoint in out_dir.
setup(
    devices=1,
    train_data_dir=Path("data/lit-redpajama-sample")
)

INFO: Using 16-bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16-bit Automatic Mixed Precision (AMP)


{'model_name': 'pythia-160m', 'name': 'redpajama', 'save_interval': 1000, 'eval_interval': 1000, 'eval_iters': 100, 'log_interval': 100, 'learning_rate': 0.006, 'batch_size': 8, 'micro_batch_size': 4, 'gradient_accumulation_steps': 2, 'max_iters': 60000, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'decay_lr': True, 'warmup_iters': 2000, 'lr_decay_iters': 60000, 'min_lr': 6e-06}


INFO: Seed set to 1337
INFO:lightning.fabric.utilities.seed:Seed set to 1337


Loading model with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m-deduped'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}
Parameter containing:
tensor([[ 0.2005, -0.1592,  0.0184,  ..., -0.0983, -0.0924,  0.2875],
        [ 0.0377, -0.3791,  0.2150,  ..., -0.1997, -0.1751,  0.2146],
        [ 0.0358,  0.1995,  0.1779,  ...,  0.0920, -0.1158,  0.0171],
        ...,
        [ 0.0291, -0.0804,  0.2391,  ...,  0.1027, -0.0795, -0.2908],
        [ 0.2615, -0.1459, -0.2251,  ..., -0.0171,  0.1957,  0.3512],
        [ 

KeyboardInterrupt: ignored

In [8]:
# Inference setup: a fresh single-device Fabric and a model restored from a saved checkpoint.
# The module-level `model` and `config` created here are what tsaigpt()/generate() use later.
fabric = L.Fabric(devices=1, strategy='auto', precision=None, loggers=logger)
fabric.print(hparams)
# Hard-coded checkpoint produced by the training run above.
checkpoint_path = Path("out/redpajama/iter-031997-ckpt.pth")
config = Config.from_name(model_name)
model = GPT(config)

# Load the checkpoint weights into `model`.
load_checkpoint(fabric, model, checkpoint_path)

#print(model.transformer.h[0].mlp.fc.weight)

{'model_name': 'pythia-160m', 'name': 'redpajama', 'save_interval': 1000, 'eval_interval': 1000, 'eval_iters': 100, 'log_interval': 100}


In [9]:
def generate(model, config, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Autoregressively extend a prompt by `max_new_tokens` sampled tokens.

    Take a conditioning sequence of indices `idx` and complete the sequence
    `max_new_tokens` times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.

    Args:
        model: Callable mapping a (b, t) LongTensor of token ids to logits of shape
            (b, t, vocab_size).
        config: Object exposing `block_size`, the maximum context length fed to the model.
        idx: Prompt token ids, either a single sequence of shape (t,) or a batch of
            shape (b, t).
        max_new_tokens: Number of tokens to sample and append.
        temperature: Divisor applied to the final-step logits before sampling.
        top_k: If given, only the `top_k` highest logits stay eligible for sampling.

    Returns:
        LongTensor of shape (b, t + max_new_tokens) holding the prompt followed by the
        sampled tokens (b is 1 for an un-batched prompt).
    """
    # A single un-batched prompt gets a batch dimension; batched prompts are used as-is.
    if idx.dim() == 1:
        idx = idx.unsqueeze(dim=0)

    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(1) <= config.block_size else idx[:, -config.block_size:]
        # forward the model to get the logits for the index in the sequence
        logits = model(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [10]:
# Extra packages installed before the tokenizer is imported below
# (presumably required by tsai_gpt.tokenizer -- confirm). `-q` keeps pip's output quiet.
%pip install huggingface_hub sentencepiece -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m0.9/1.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import os
import pickle
from contextlib import nullcontext
import torch
#import tiktoken
import glob
import math
import sys
import time
from pathlib import Path
from typing import Optional, Tuple, Union
import typing_extensions
import lightning as L
from lightning.fabric.loggers import CSVLogger
from lightning.fabric.strategies import FSDPStrategy
from torch.utils.data import DataLoader
import torch.nn.functional as F

# # support running without installing as a package
# wd = Path(__file__).parent.parent.resolve()
# sys.path.append(str(wd))

from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset
from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops
from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor
from tsai_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters, load_checkpoint
#import gradio as gr

from tsai_gpt.tokenizer import Tokenizer
# Directory the tokenizer is loaded from.
# NOTE(review): this is a Llama-2 checkpoint directory while the model config is pythia-160m;
# confirm this is the tokenizer the training data was prepared with.
checkpoint_dir = Path('./checkpoints/meta-llama/Llama-2-7b-chat-hf')
# Module-level tokenizer; bound as the default `tokeniser` argument of tsaigpt() below.
token = Tokenizer(checkpoint_dir = checkpoint_dir)

def tsaigpt(start: str, model=model, max_new_tokens=300, num_samples=2, tokeniser=token,
            temperature: float = 1.0, top_k: Optional[int] = None):
    """Generate a text continuation of `start` with the trained model.

    The default `model` and `tokeniser` are the module-level objects that exist when this
    function is defined; rebinding those globals later does not change the defaults.
    The RNG is re-seeded on every call, so the same prompt and settings give the same text.

    Args:
        start: Prompt text.
        model: Model used for generation; it is switched to eval mode and moved to the
            selected device.
        max_new_tokens: Number of tokens to generate after the prompt.
        num_samples: Accepted for interface compatibility; currently unused (one sample
            is generated per call).
        tokeniser: Object with `encode(str)` returning a tensor of token ids and
            `decode(tensor)` returning text.
        temperature: Sampling temperature forwarded to `generate` (1.0 = no change).
        top_k: If given, sampling is restricted to the `top_k` most likely tokens.

    Returns:
        The decoded prompt plus generated continuation.
    """
    seed = 1337
    # Fall back to the CPU when no GPU is available instead of failing on `.to('cuda')`.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
    use_compile = False  # set to True to compile the model with torch.compile (PyTorch 2.0+)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
    device_type = 'cuda' if 'cuda' in device else 'cpu'  # for torch.autocast
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # Autocast is only used on CUDA; on the CPU the model runs in its own dtype.
    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

    model.eval()
    model.to(device)
    if use_compile:
        model = torch.compile(model)

    start_ids = tokeniser.encode(start).to(device)

    # run generation
    with torch.no_grad():
        with ctx:
            y = generate(
                model=model,
                config=config,
                max_new_tokens=max_new_tokens,
                idx=start_ids,
                temperature=temperature,
                top_k=top_k,
            )
            output = tokeniser.decode(y[0])
    return output


Prediction 1 (max_new_tokens = 300)

In [13]:
# Uses tsaigpt()'s default of max_new_tokens=300.
tsaigpt("its very cold outside you should take a ")

"its very cold outside you should take a 3,000 slides, so why not take a deep one.\nHi a nice to see for it to all his rooms! speak on me and your door -''\nTurchased\nCEISPR. (cs)\nPrepare all things you do before your windows. You can please help and ease each other is applied. Some of the best surge jobs we are at isling strap tomorrow!\n .We’re excited by therho logger and have it all worked on consequently to clean codes, comment your occupation, all the tares and all the glitter for the back.itten. Please instantly waste into the status of your home for yourAnimation and superriching of 70 pence. Hound-to-the little door will be greedy for short inter О Пресле, right? Let’s get ready for any post and check out the churches’ unified (be suitable for anyRemoteIgnore or service take- is always more hidden) – at adequately Erskine Bluff piano lenses! See the wedding idiot at thedoor vault next!».urrence overlapped today.\nIt is not, however, possible Afotosecond’s tag to Thank You – 

Prediction 2 (max_new_tokens = 200)

In [14]:
# Same generation call with a shorter continuation (200 new tokens).
tsaigpt("In this age of hiring and firing.. ", max_new_tokens  = 200)

"In this age of hiring and firing.. 1988, sliding, sold, chained, squeezed and chair rise to the blades totalling his choice!Future speaks at and smans -''\nTurchased Father who argued it was a muscle impartial to his disliked television cast, Emily Hary. He did this as a painter to pupil chairs. He produced this strange Latino-like law staple and steamed . He became the only fan to bring all sides to him by a smart move-turned ride, then held by all of the tares to the fort. She orchestrated hisitten like a tint and watched singing, and touched his heart upon the supernaturalive of his face and paved him-balling for dismay them. Their souls went off to his О Пресле, and disliked his drums, keeping him in his hand. The fan looked un laugh again. First at him, his rub"

Prediction 3 (max_new_tokens = 200)

In [15]:
# Another prompt, again limited to 200 new tokens.
tsaigpt("Shakespeare is a legend ", max_new_tokens  = 200)

"Shakespeare is a legend 1988 annual slideshow sold, chained $1,200-$1,300 totalling $1,500.\nSince she's extremely often been a Father ofStatus, she has probably served as a hot cellist and vice-chairwoman. She relishes readersruitan, is a pupil chairs and she's very fast. When we're not tomorrow, we are sitting in the gym -rhom and lunch. Sometimes we are properly brought and heard, then sall all of the tares and all of the halt orchestralesitten.\nAnd I reminded her how to interpret books from characters as broken or romantic are not only evil but shame for the most disrespectrist of souls but for spite of О Пресле, and these books can be due to the thrilling and an incongruous one-range (being me). I wish I"

Prediction 4 (max_new_tokens = 200)

In [16]:
# A longer, encyclopedic prompt, limited to 200 new tokens.
tsaigpt("Wikis are enabled by wiki software, otherwise known as wiki engines. A wiki ", max_new_tokens  = 200)

'Wikis are enabled by wiki software, otherwise known as wiki engines. A wiki 3,000 slides, sold, chained, sd produce the wiki rise to the blades totalling his own words, speaks at and smans - did not have placards or other. The wiki mus Ezra and his cellular jets are castmarks of this version of Airgate and Saudi Arabia.\nshort chairs and other arizen jobs are kept at night while staring tomorrow when undeclaced exhibits by otheragogmants who serve as a wiki stone to clean their train. The wiki pai tares to alienate the orchestralgiaitten like in the early Antarctic, and the wiki pai famosa that been 7–2–6-6-8. Narrows will learn wisdom from its harps О Пресле, and these wiki pai tuliera)$.\nThis version of Airgate is made (bears at crossRemote.org'