Data import

In [53]:
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

In [60]:
class TransformerDataSet:
    """Next-token training data built from a Hugging Face text dataset.

    The ``text`` column of the ``train`` split is tokenised with a Keras
    ``Tokenizer``, the resulting ids are concatenated into one stream, the
    stream is cut into non-overlapping windows of ``sequence_length`` ids,
    and every window is turned into an (input, target) pair shifted by one
    position.
    """

    def __init__(self, hugging_face_name: str, sequence_length: int) -> None:
        """Load, tokenise and window the dataset.

        Args:
            hugging_face_name: Dataset identifier passed to ``load_dataset``.
            sequence_length: Number of token ids per window, before the
                one-position input/target shift.
        """
        self.sequence_length = sequence_length
        # NOTE: "laoded" is a typo; the name is kept because it is a public
        # attribute that other code may already read.
        self.laoded_text = self.load_text(hugging_face_name)
        self.tokenizer = self.generate_token()
        # Concatenate the ids of every text row into one stream. Taking only
        # element [0] would silently drop all rows after the first and fail
        # on an empty dataset.
        self.word_sequences = [
            token_id
            for row in self.tokenizer.texts_to_sequences(self.laoded_text)
            for token_id in row
        ]
        # Same formula as TransformerDataSetText: one slot per known word plus
        # one, so that the largest id is still a valid index.
        self.vocab_size = len(self.tokenizer.word_index) + 1
        self.number_of_sequences, self.data_split = self.create_splits(self.sequence_length)
        self.inputs, self.outputs = self.add_target_variables()

    def load_text(self, hugging_face_name: str) -> list[str]:
        """Return the ``text`` column of the dataset's ``train`` split."""
        dataset = load_dataset(hugging_face_name)
        return dataset["train"]["text"]

    def generate_token(self) -> "Tokenizer":
        """Return a Keras ``Tokenizer`` fitted on the loaded text."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.laoded_text)
        return tokenizer

    def create_splits(self, sequence_length: int) -> tuple[int, list[list[int]]]:
        """Cut the token stream into non-overlapping fixed-length windows.

        Args:
            sequence_length: Number of token ids per window.

        Returns:
            ``(number_of_sequences, windows)``. Trailing ids that do not fill
            a whole window are discarded.
        """
        # Derive the count from the same length used for slicing; mixing the
        # argument with self.sequence_length produced empty/ragged windows
        # whenever the two differed.
        number_of_sequences = len(self.word_sequences) // sequence_length
        windows = [
            self.word_sequences[start:start + sequence_length]
            for start in range(0, number_of_sequences * sequence_length, sequence_length)
        ]
        return number_of_sequences, windows

    def add_target_variables(self) -> tuple[np.ndarray, np.ndarray]:
        """Build next-token pairs from ``self.data_split``.

        Returns:
            ``(inputs, outputs)`` arrays; inputs drop the last id of each
            window and outputs drop the first, so ``outputs[i][j]`` is the id
            that follows ``inputs[i][j]``.
        """
        inputs = [window[:-1] for window in self.data_split]
        targets = [window[1:] for window in self.data_split]
        return np.array(inputs), np.array(targets)


In [61]:
def pad_list(input_list: list[int], max_size: int) -> list[int]:
    """Return ``input_list`` truncated or right-padded with 0 to ``max_size`` items.

    Args:
        input_list: Sequence of token ids.
        max_size: Exact length of the returned list.

    Returns:
        A new list of length ``max_size``: the first ``max_size`` items when
        the input is long enough, otherwise the input followed by zeros.
    """
    if len(input_list) >= max_size:
        # Truncate to the requested size (was hard-coded to 80).
        return input_list[:max_size]
    return input_list + [0] * (max_size - len(input_list))

In [68]:
class TransformerDataSetText:
    """Next-token training data built from a plain-text file, one sample per line.

    Each line is tokenised with a Keras ``Tokenizer``, truncated or
    zero-padded to ``sequence_length`` ids via ``pad_list``, and turned into
    an (input, target) pair shifted by one position.
    """

    def __init__(self, file_path: str, sequence_length: int) -> None:
        """Load, tokenise and pad the lines of ``file_path``.

        Args:
            file_path: Path of the text file to read.
            sequence_length: Number of token ids per sample, before the
                one-position input/target shift.
        """
        self.sequence_length = sequence_length
        # NOTE: "laoded" is a typo; the name is kept because it is a public
        # attribute that other code may already read.
        self.laoded_text = self.load_text(file_path)
        self.tokenizer = self.generate_token()
        self.word_sequences = self.tokenizer.texts_to_sequences(self.laoded_text)
        self.vocab_size = len(self.tokenizer.word_index) + 1
        self.number_of_sequences, self.data_split = self.create_splits(self.sequence_length)
        self.inputs, self.outputs = self.add_target_variables()

    def load_text(self, file_path: str) -> list[str]:
        """Read the file and return its lines with double quotes removed.

        The file is opened with the platform default encoding. Lines keep
        their trailing newline.

        Returns:
            Only the first half of the lines (see the TODO below).
        """
        # Context manager guarantees the handle is closed (it used to leak).
        with open(file_path, 'r') as text_file:
            raw_lines = text_file.readlines()
        unquoted_lines = [line.replace("\"", "") for line in raw_lines]
        # TODO: temporary reduction to the first half of the corpus; this
        # will need to be removed.
        return unquoted_lines[: len(unquoted_lines) // 2]

    def generate_token(self) -> "Tokenizer":
        """Return a Keras ``Tokenizer`` fitted on the loaded lines."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.laoded_text)
        return tokenizer

    def create_splits(self, sequence_length: int) -> tuple[int, list[list[int]]]:
        """Bring every tokenised line to exactly ``sequence_length`` ids.

        Args:
            sequence_length: Target length handed to ``pad_list``.

        Returns:
            ``(number_of_sequences, padded_sequences)`` where the count is the
            number of loaded lines.
        """
        number_of_sequences = len(self.laoded_text)
        # Honour the requested length instead of a hard-coded 80.
        padded_sequences = [pad_list(ids, sequence_length) for ids in self.word_sequences]
        return number_of_sequences, padded_sequences

    def add_target_variables(self) -> tuple[np.ndarray, np.ndarray]:
        """Build next-token pairs from ``self.data_split``.

        Returns:
            ``(inputs, outputs)`` arrays; inputs drop the last id of each
            sample and outputs drop the first, so ``outputs[i][j]`` is the id
            that follows ``inputs[i][j]``.
        """
        inputs = [sample[:-1] for sample in self.data_split]
        targets = [sample[1:] for sample in self.data_split]
        return np.array(inputs), np.array(targets)

In [69]:
# Alternative data source: the Hugging Face "tiny_shakespeare" dataset, cut into 80-token windows.
#data_object = TransformerDataSet("tiny_shakespeare", 80)


# Build the dataset from a local text file (one sample per line), using a sequence length of 80.
# NOTE(review): absolute, machine-specific Windows path -- adjust before running elsewhere.
file_path_for_text = r"C:\Users\willi\Desktop\Data Science Portfolio\shakespearegen\data\raw\archive\alllines.txt"
data_object = TransformerDataSetText(file_path_for_text, 80)

In [70]:
# Vocabulary size of the file-based dataset: len(tokenizer.word_index) + 1.
print(data_object.vocab_size)

18887


In [59]:
#data_object.inputs.shape

(55698, 79)