In [1]:
import re
import pandas as pd
from datasets import load_dataset
import numpy as np
import tensorflow as tf
from tokenize import tokenize

# Load the "hate_speech_pl" corpus and keep the text column of its train split
# as a one-column DataFrame.
dataset = load_dataset("hate_speech_pl")
df = pd.DataFrame(dataset["train"])
hate_s = df.loc[:, ["text"]]

def remove_angle_brackets(text):
    """Return `text` with every `<...>` span removed.

    The match is non-greedy, so each opening bracket is paired with the nearest
    closing bracket; spans containing a newline are left untouched.
    """
    return re.sub("<.*?>", "", text)

# Strip <...> spans from every post before tokenizing.
text = hate_s['text'].apply(remove_angle_brackets)

# Character-level tokenizer: with char_level=True every character is its own token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(text)
max_id = len(tokenizer.word_index)  # number of distinct characters seen

# Concatenate all posts into one long string and encode it. Tokenizer ids start
# at 1, so subtract 1 to obtain zero-based ids suitable for one-hot encoding.
text_list = "".join(text.values.flatten())
[encoded] = np.array(tokenizer.texts_to_sequences([text_list])) - 1

# Train on the first 90% of the encoded stream. `encoded` is a NumPy array, so
# plain Python ints are all that is needed to slice it.
dataset_size = len(encoded)
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

# Sliding windows of n_steps input characters plus one extra character, so that
# the target sequence is the input shifted by one position.
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)
# window() yields nested datasets; batching each one by its full length
# flattens it into a single tensor of window_length ids.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)

# Split every window into (inputs, targets): all but the last character vs.
# all but the first.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]), num_parallel_calls=tf.data.AUTOTUNE)
# One-hot encode the inputs only; targets stay as integer ids.
dataset = dataset.map(lambda x_batch, y_batch: (tf.one_hot(x_batch, depth=max_id), y_batch),
                      num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset hate_speech_pl (C:/Users/trabk/.cache/huggingface/datasets/hate_speech_pl/default/1.1.0/40101693880e807040a6b999faa7441cc231e95c81eefa5922ff4a76c8aa48fd)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 68.59it/s]


In [13]:
from pathlib import Path

from tensorflow.keras.models import load_model

MODEL_PATH = 'my_model.h5'

# Resume from the saved checkpoint when one exists; otherwise build the network
# from scratch. Layers must NOT be appended to a loaded model: doing so stacks a
# new GRU on top of the previous softmax output, and because the result is saved
# back to the same file the network would grow on every run of this cell.
if Path(MODEL_PATH).exists():
    model = load_model(MODEL_PATH)
else:
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                            dropout=0.2, recurrent_dropout=0.2),
        tf.keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        # Per-time-step distribution over the max_id characters.
        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id, activation='softmax')),
    ])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Targets are integer character ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_crossentropy'])

history = model.fit(dataset, epochs=50, steps_per_epoch=328)

model.save(MODEL_PATH)


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_10 (GRU)                (None, None, 128)         88704     
                                                                 
 gru_11 (GRU)                (None, None, 128)         99072     
                                                                 
 time_distributed_6 (TimeDis  (None, None, 101)        13029     
 tributed)                                                       
                                                                 
 gru_16 (GRU)                (None, None, 128)         88704     
                                                                 
 time_distributed_9 (TimeDis  (None, None, 101)        13029     
 tributed)                                                       
                                                                 
 gru_1 (GRU)                 (None, None, 128)        

KeyboardInterrupt: 

In [36]:
#1.62

In [7]:
# Text generator
def preprocess(text):
    """Encode a list of strings as a one-hot tensor of zero-based character ids."""
    token_ids = tokenizer.texts_to_sequences(text)
    zero_based_ids = np.array(token_ids) - 1
    return tf.one_hot(zero_based_ids, depth=max_id)

def next_char(text, temperature=1):
    """Sample the next character for `text` from the model's predicted distribution.

    Args:
        text: Seed string. Strings shorter than ``window_length - 1`` characters
            are left-padded with spaces up to that length.
        temperature: Positive divisor applied to the log-probabilities. It
            controls how willing the model is to pick less probable characters:
            values below 1 favour the likeliest characters, larger values flatten
            the distribution.

    Returns:
        The sampled character as a string.

    Raises:
        ValueError: If `temperature` is not strictly positive (dividing the
            log-probabilities by it would yield inf/NaN logits).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")
    context_length = window_length - 1
    if len(text) < context_length:
        text = text.rjust(context_length)
    X_new = preprocess([text])
    # Keep only the last time step; the `-1:` slice preserves the leading axis so
    # the result is already shaped (1, max_id) as tf.random.categorical expects.
    # verbose=0 keeps per-character progress bars out of the cell output.
    y_proba = model.predict(X_new, verbose=0)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 converts the zero-based class index back to the tokenizer's 1-based id.
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

def complete_text(text, n_chars=500, temperature=1):
    """Extend `text` by `n_chars` sampled characters and return the result.

    Each new character is drawn with `next_char` from the text generated so far,
    using the given `temperature`.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated
# Generate a sample seeded with 'a' (default n_chars=500) at temperature 0.6.
print(complete_text('a', temperature=0.6))



a  k  centit  dhode nawet w ogolicznych raz staranie karerowanych oraz okresowanych presze by na polaków problemowi i pod racji bardzo bo nie wiem  żydów w to na kąkana na  tle dlatego za rodzinów którzy przez  nie polakach samoobronił przez  ukraińców  żydów mają  niemcy był  niemcy na była zapomnenie wyrwucy to nie powiedza i społecznie w każde na terenach powiedziali na służenia  żydów  niemcy  do  rosjan mieł po polaków w okresie takie czemalnych do oddziała  nie znajdowania okresować w tej p
