# Retail Saarthi SLM 

- This notebook walks through building a custom Small Language Model (SLM) for our final-year project.

## Step 1 : Load the Dataset 

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

# 1. Source CSV files; each one is expected to expose a 'text' column
file_paths = [
    "SLM Training Dataset/Identity Dataset.csv",
    "SLM Training Dataset/Retail Term web dataset.csv",
    "SLM Training Dataset/Govt Act Data.csv",
    "SLM Training Dataset/Retail Comperhensive dataset.csv",
    "SLM Training Dataset/Audio Dataset.csv"
]

all_texts = []

# 2. Iterate through files and aggregate the 'text' column
print("Loading local datasets...")
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Best-effort loading: report the unreadable file and keep going with the rest
        print(f"Error loading {file_path}: {e}")
        continue

    if 'text' not in df.columns:
        print(f"Warning: No 'text' column found in {file_path}")
        continue

    # Drop missing rows, then force every value to str so that a stray numeric
    # cell cannot reach the tokenizer (which only accepts strings)
    cleaned_texts = df['text'].dropna().astype(str).tolist()
    all_texts.extend(cleaned_texts)
    print(f"Loaded {len(cleaned_texts)} examples from {file_path}")

print(f"Total examples loaded: {len(all_texts)}")

# Fail here with a clear message instead of letting the split below fail on an
# empty dataset, which would hide the real cause (wrong paths / missing column)
if not all_texts:
    raise RuntimeError(
        "No training text was loaded. Check that the CSV paths above exist "
        "relative to the notebook and that each file has a 'text' column."
    )

# 3. Create a Hugging Face Dataset
full_dataset = Dataset.from_dict({"text": all_texts})

# 4. Split into Train (80%) and Validation (20%) sets
# A fixed seed keeps the split reproducible across runs
split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

# 5. Expose the held-out 'test' split under the name 'validation'
ds = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

print("Dataset ready for tokenization:")
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


Loading local datasets...
Loaded 1500 examples from SLM Training Dataset/Identity Dataset.csv
Loaded 66 examples from SLM Training Dataset/Retail Term web dataset.csv
Loaded 228 examples from SLM Training Dataset/Govt Act Data.csv
Loaded 500 examples from SLM Training Dataset/Retail Comperhensive dataset.csv
Loaded 93 examples from SLM Training Dataset/Audio Dataset.csv
Total examples loaded: 2387
Dataset ready for tokenization:
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1909
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 478
    })
})


## Step 2 : Tokenize the dataset 

In [5]:
import os 
import tiktoken
import numpy as np
from tqdm.auto import tqdm

# Tokenizer choice: the 'gpt2' BPE encoding, which is an industry standard and is mentioned in our reference [TinyStories Paper]

tokenizer = tiktoken.get_encoding("gpt2")

# Preprocessing step: turn one example's text into token IDs
def process(example, tokenizer=tiktoken.get_encoding("gpt2")):
    """Tokenize a single dataset example.

    Args:
        example: mapping with a "text" entry holding the string to encode.
        tokenizer: encoder exposing `encode_ordinary`; defaults to the
            'gpt2' tiktoken encoding.

    Returns:
        dict with "ids" (the encoded token IDs) and "len" (how many there are).
    """
    token_ids = tokenizer.encode_ordinary(example["text"])
    return {"ids": token_ids, "len": len(token_ids)}

# Apply the processing function to every example in both splits
print("Tokenizing the dataset...")
tokenized = ds.map(
    process,
    remove_columns=['text'],
    desc="Running tokenizer on dataset",
    num_proc=4,
)

# Concatenate each split's token ids into one flat binary file named <split>.bin
for split, dset in tokenized.items():
    # Total token count for the split, as a plain Python int so the memmap
    # shape and the consistency check below work with exact integers
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = f'{split}.bin'

    # An empty split would otherwise fail further down with an opaque
    # concatenate/mmap error; name the actual problem instead
    if arr_len == 0:
        raise ValueError(f"Split '{split}' contains no tokens; nothing to write to {filename}")

    dtype = np.uint16  # The gpt2 BPE vocab has 50257 entries, which fits in uint16 (max 65535)

    # Create a memory-mapped array on disk, sized for the whole split
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

    # To accommodate our small dataset [Temporary]: never ask for more shards than rows
    total_batches = max(1, min(1024, len(dset)))

    idx = 0  # write cursor into arr

    print(f"Writing {filename}...")
    for batch_idx in tqdm(range(total_batches), desc=f'Writing {filename}'):
        # Batch together samples for faster write
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])

        # Write into mmap
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)

    # Every preallocated slot must have been filled; a mismatch means the file
    # holds stale/zero tokens or the shards did not cover the split
    if idx != arr_len:
        raise RuntimeError(f"Wrote {idx} tokens to {filename} but expected {arr_len}")

    # Flush changes to disk
    arr.flush()
    print(f"Saved {filename} with {arr_len} tokens.")



Tokenizing the dataset...


Running tokenizer on dataset (num_proc=4): 100%|██████████| 1909/1909 [00:12<00:00, 147.55 examples/s]
Running tokenizer on dataset (num_proc=4): 100%|██████████| 478/478 [00:11<00:00, 41.23 examples/s]


Writing train.bin...


Writing train.bin: 100%|██████████| 1024/1024 [00:02<00:00, 496.54it/s]


Saved train.bin with 157916 tokens.
Writing validation.bin...


Writing validation.bin: 100%|██████████| 478/478 [00:00<00:00, 543.46it/s]

Saved validation.bin with 40050 tokens.





## Step 3 : Create Input-Output Pairs

In [6]:
import torch
import numpy as np 

# --- Batch sampling configuration ---
BATCH_SIZE = 32    # number of sequences drawn per batch
BLOCK_SIZE = 128   # number of tokens in each sequence

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Coarse device family derived from DEVICE
if DEVICE == 'cuda':
    device_type = 'cuda'
else:
    device_type = 'cpu'

print(f"Device: {DEVICE}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Block Size: {BLOCK_SIZE}")

def get_batch(split):
    """Sample a random batch of (input, target) token windows from disk.

    Args:
        split: 'train' reads 'train.bin'; any other value reads 'validation.bin'.

    Returns:
        Tuple (x, y) of int64 tensors on DEVICE, each of shape
        (BATCH_SIZE, BLOCK_SIZE). y is x shifted one token to the right, so
        y[:, t] is the token that follows x[:, t] in the file.

    Raises:
        RuntimeError: if the file holds too few tokens to cut a single
            window of BLOCK_SIZE inputs plus the shifted targets.
    """
    filename = 'train.bin' if split == 'train' else 'validation.bin'
    # The memmap is opened fresh on every call, so no mapping is kept alive
    # between batches
    data = np.memmap(filename, dtype=np.uint16, mode='r')

    # Highest valid start (exclusive): the target window needs one extra token
    max_start = len(data) - BLOCK_SIZE
    if max_start <= 0:
        raise RuntimeError(
            f"{filename} has {len(data)} tokens; need more than BLOCK_SIZE={BLOCK_SIZE} to sample a batch"
        )

    ix = torch.randint(max_start, (BATCH_SIZE,)).tolist()

    # Copy each window out of the read-only uint16 memmap as int64: embedding
    # lookups and cross-entropy targets require integer (long) indices, and
    # torch.from_numpy warns on non-writable arrays
    x = torch.stack([torch.from_numpy(data[i:i + BLOCK_SIZE].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + BLOCK_SIZE].astype(np.int64)) for i in ix])

    if device_type == 'cuda':
        # Pinned host memory allows the host-to-device copy to be asynchronous
        x, y = x.pin_memory().to(DEVICE, non_blocking=True), y.pin_memory().to(DEVICE, non_blocking=True)
    else:
        x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y

Device: cuda
Batch Size: 32
Block Size: 128
