In [2]:
# Import necessary libraries
from datasets import load_dataset

# Load the WikiText dataset from the Hugging Face Hub.
# The second argument selects the dataset config ("wikitext-2-raw-v1"); per the
# original note, calling load_dataset() without it produced an error asking for
# a config name.
wikitext_dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Print the DatasetDict repr (splits, features, row counts) and the split names.
print(f"Dataset structure: {wikitext_dataset}")
print(f"Available splits: {wikitext_dataset.keys()}")

# Print row 0 of the train split as a sample.
# NOTE(review): in the recorded run this row is {'text': ''}, i.e. an empty
# string, so it is not a representative example of the corpus text.
print("\nSample from train split:")
print(wikitext_dataset["train"][0])


Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 161691.83 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 456093.79 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 345770.29 examples/s]

Dataset structure: DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
Available splits: dict_keys(['test', 'train', 'validation'])

Sample from train split:
{'text': ''}





In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the Zephyr model and tokenizer.
# `model_name` is the Hub checkpoint id; `device` is "cuda" when torch reports
# a usable CUDA device and "cpu" otherwise. Both are reused further down.
model_name = "HuggingFaceH4/zephyr-7b-beta"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer.
# If the checkpoint defines no pad token, reuse the EOS token so that calls
# made with padding enabled have a token to pad with.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model, requesting bfloat16 weights and passing the device chosen above
# as `device_map`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device
)

# Report where the model was loaded and its size: the element counts of all
# parameters are summed and shown in billions with two decimals.
print(f"Loaded {model_name} model on {device}")
print(f"Model has {sum(p.numel() for p in model.parameters())/1e9:.2f} billion parameters")


Loading checkpoint shards: 100%|██████████| 8/8 [00:03<00:00,  2.56it/s]


Loaded HuggingFaceH4/zephyr-7b-beta model on cuda
Model has 7.24 billion parameters


In [10]:
import os 
import json

class JsonlDataset():
  """Index-addressable collection of JSONL text records, tokenized on access.

  Each line of the backing file is expected to be a JSON object with a
  "text" field. Records are kept in memory as parsed dicts; tokenization
  happens lazily in ``__getitem__``.

  Construction does NOT read the file: ``data`` starts empty and is only
  populated by an explicit call to ``_load_dataset()``.

  Args:
    tokenizer: Callable tokenizer exposing ``pad_token`` / ``eos_token``. If
      ``pad_token`` is None it is set to ``eos_token`` (this mutates the
      tokenizer object passed in).
    tokenizer_max_length: Forwarded as ``max_length`` on every tokenizer call.
    batch_size: Stored on the instance; not used by any method of this class.
    min_len: Records are kept only when ``len(record["text"]) > min_len``
      (length in characters, strict comparison).
    dataset_name: File name of the JSONL file inside ``dataset_folder``.
    dataset_folder: Directory that contains the JSONL file.
    device: Passed to ``.to(...)`` on every value the tokenizer returns.
  """

  def __init__(self, tokenizer, tokenizer_max_length, batch_size, min_len, dataset_name, dataset_folder, device):
    self.tokenizer = tokenizer
    # Padding needs a pad token; fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token

    self.tokenizer_max_length = tokenizer_max_length
    self.batch_size = batch_size
    self.min_len = min_len
    self.dataset_name = dataset_name
    self.dataset_folder = dataset_folder
    self.data = []  # filled by _load_dataset()
    self.device = device

  def __getitem__(self, idx):
    """Tokenize record ``idx`` and return its tensors on ``self.device``.

    Returns:
      Dict with exactly the keys "input_ids" and "attention_mask"; any other
      entries produced by the tokenizer are dropped.

    Raises:
      IndexError: If ``idx`` is out of range for the loaded records.
    """
    record = self.data[idx]
    encoded = self.tokenizer(
      record["text"],
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=self.tokenizer_max_length,
    )
    on_device = {key: value.to(self.device) for key, value in encoded.items()}
    return {"input_ids": on_device["input_ids"], "attention_mask": on_device["attention_mask"]}

  def _load_dataset(self):
    """Read the JSONL file and replace ``self.data`` with the kept records.

    Blank or whitespace-only lines are skipped instead of being handed to the
    JSON parser, and the file is always decoded as UTF-8 rather than with the
    platform default encoding.

    Raises:
      FileNotFoundError: If the dataset path does not exist.
      json.JSONDecodeError: If a non-blank line is not valid JSON.
      KeyError: If a record has no "text" field.
    """
    dataset_path = os.path.join(self.dataset_folder, self.dataset_name)
    if not os.path.exists(dataset_path):
      raise FileNotFoundError(f"Dataset file not found at {dataset_path}")

    records = []
    with open(dataset_path, "r", encoding="utf-8") as f:
      for line in f:
        # Tolerate empty separator lines and trailing blank lines.
        if not line.strip():
          continue
        record = json.loads(line)
        if len(record["text"]) > self.min_len:
          records.append(record)

    # Assign only after the whole file parsed, so a failure leaves the
    # previously loaded data untouched.
    self.data = records

  def __len__(self):
    """Number of records currently loaded (0 before ``_load_dataset()``)."""
    return len(self.data)


In [None]:
# Scratch cell: a bare reference to the class, so running it only echoes the
# class object. It does not create an instance or load any data.
JsonlDataset

In [13]:
# Use row 5 of the train split as a single test string.
test_input = wikitext_dataset["train"][5]["text"]
# Tokenize that one string to PyTorch tensors, truncating at 1024 tokens, and
# keep only the "input_ids" entry (the attention mask is discarded here).
# The result is not moved to `device` in this cell.
# NOTE(review): with a single input string, padding=True presumably adds no
# padding; confirm before relying on it.
input_ids = tokenizer(test_input, return_tensors="pt", padding=True, truncation=True, max_length=1024)["input_ids"]


 It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . 

