In [18]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoTokenizer

In [22]:
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')

def tokenize(doc):
    tokens = tokenizer.encode(doc, add_special_tokens=False)
    tokens_np = np.array(tokens)
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16


def read_data(docs):
    lines = []
    i=0
    while i<len(docs)-1:
      mesr1 = docs[i].replace('\n', '')
      mesr2 = docs[i+1].replace('\n', '')

      if len(mesr1)==0:
        i+=1
        continue

      if mesr1.isspace()  or mesr1==" " or len(mesr1)<4 or mesr2.isspace()  or mesr2==" " or len(mesr2)<4:
        print("there is a problem, please check it.")
        i+=1
        continue
      line = ' [BOM] '+ mesr1 +' [BOM] '+ mesr2 +'[EOS]'+'\n'
      lines.append(line)

      i+=2
    return lines



In [23]:
import re
data = []
def clear_text(address):
  fp = open(address)
  list_sent = fp.readlines()
  for i in list_sent[1:]:
    i = i.replace('\u200c', ' ')
    i = i.replace('    ', ' ')
    i = i.replace('   ', ' ')
    i = i.replace('  ', ' ')
    data.append(i)
  return data


data1 = clear_text('/content/eynolghozat_cleaned.txt')

data = data1
data[0:3]

['ز رویت می کند روشن خیالت چشم موسی را\n',
 'سحرگه عزم بستان کن صبوحی در گلستان کن\n',
 'به بلبل می برد از گل صبا صد گونه بشری را\n']

In [25]:
import numpy as np

clean_data = data
print("cleaned lines", len(clean_data))

docs_beit = read_data(clean_data)
print("number of beits :", len(docs_beit))
docs_beit[0:3]

cleaned lines 190423
there is a problem, please check it.
there is a problem, please check it.
there is a problem, please check it.
there is a problem, please check it.
there is a problem, please check it.
number of beits : 95209


[' [BOM] ز رویت می کند روشن خیالت چشم موسی را [BOM] سحرگه عزم بستان کن صبوحی در گلستان کن[EOS]\n',
 ' [BOM] به بلبل می برد از گل صبا صد گونه بشری را [BOM] کسی با شوق روحانی نخواهد ذوق جسمانی[EOS]\n',
 ' [BOM] برای گلبن وصلش رها کن من و سلوی را [BOM] گر از پرده برون آیی و ما را روی بنمایی[EOS]\n']

In [26]:
# split validation and train
len_train = int(len(docs_beit)*0.8)
data_train = docs_beit[0:len_train]
data_val = docs_beit[len_train:]
print(f"number of train is {len(x_train)} ----- number of validation is {len(x_val)}")

number of train is 76168 ----- number of validation is 19042


In [36]:

class DataLoaderLite:
    def __init__(self, B, T, data, process_rank,tokenizer):
        self.B = B
        self.T = T
        self.process_rank = process_rank
        self.data = data  # full dataset of beit
        self.tokenizer = tokenizer  # A function to tokenize sentences
        self.reset()

    def reset(self):
        self.current_position = self.B * self.T * self.process_rank
        self.indices = list(range(len(self.data)))  # Indices of sentences
        print(len(self.data))

    def next_batch(self):
        B, T = self.B, self.T
        x_batch, y_batch = [], []
        tokens_batch = []

        while len(tokens_batch) < B * T + 1:  # Ensure we have enough tokens for B*T+1
            if self.current_position >= len(self.indices):
                self.reset()  # Shuffle and start over if we've processed all sentences

            sentence_idx = self.indices[self.current_position]
            sentence = self.data[sentence_idx]
            tokens = self.tokenizer(sentence)  # Tokenize the current sentence
            tokens_batch.extend(tokens)

            self.current_position += 1

        tokens_batch = tokens_batch[:B * T + 1]

        # Create x_batch and y_batch
        x_batch = []
        y_batch = []

        for i in range(B):
            start_idx = i * T
            x_batch.append(tokens_batch[start_idx:start_idx + T])
            y_batch.append(tokens_batch[start_idx + 1:start_idx + T + 1])

        x_tensor = torch.tensor(x_batch, dtype=torch.long)
        y_tensor = torch.tensor(y_batch, dtype=torch.long)
        return x_tensor, y_tensor


In [45]:
dataloader_train = DataLoaderLite(B=3, T=15, process_rank=0, data= data_train, tokenizer=tokenize)
dataloader_val = DataLoaderLite(B=2, T=20, process_rank=0, data= data_val, tokenizer=tokenize)

x_batch, y_batch = dataloader_train.next_batch()
print(x_batch)
print(y_batch)


76167
19042
tensor([[    7,   105,  4704,    52,   124,   673,  8680,   246,  2147,    53,
             7, 19956,  3612,  4140,   335],
        [20198,    46,  2393,   335,     9,     7,    48,  3245,    52,   689,
            50,   235,  3546,   606,   393],
        [ 4474,    53,     7,   269,    57,  2192,  1948,   906,  3783,  5476,
             9,     7,    62, 12260, 18407]])
tensor([[  105,  4704,    52,   124,   673,  8680,   246,  2147,    53,     7,
         19956,  3612,  4140,   335, 20198],
        [   46,  2393,   335,     9,     7,    48,  3245,    52,   689,    50,
           235,  3546,   606,   393,  4474],
        [   53,     7,   269,    57,  2192,  1948,   906,  3783,  5476,     9,
             7,    62, 12260, 18407,  1702]])


In [49]:
print(tokenizer.decode(x_batch[0]))
print(tokenizer.decode(y_batch[0]))

[BOM] ز رویت می کند روشن خیالت چشم موسی را[BOM] سحرگه عزم بستان کن
ز رویت می کند روشن خیالت چشم موسی را[BOM] سحرگه عزم بستان کن صبوحی
