<a href="https://colab.research.google.com/github/PersephoneKarnstein/terf-gen/blob/master/Terfy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install keras_nlp
!pip install nltk



In [4]:
#https://stackabuse.com/gpt-style-text-generation-in-python-with-tensorflowkeras/

import os, glob, keras_nlp, nltk.data, random,warnings
import tensorflow as tf
from tensorflow import keras
from keras.models import model_from_json
import numpy as np

nltk.download('punkt', quiet=True)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings("ignore", category=UserWarning)

In [6]:

def get_corpus_data():
	path = "/content/sample_data"
	files = glob.glob(path + '/*.txt')
	data = ""
	# files = [files[1]] #delete this line, this is just for testing
	for f in files:
		data += open(f).read()
	return data

texts = get_corpus_data()


In [7]:

# console.print("[pink1]Decimating...")
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text_list = tokenizer.tokenize(texts)
text_list = list(filter(None, text_list))

random.shuffle(text_list)

length = len(text_list)
text_train = text_list[:int(0.7*length)]
text_test = text_list[int(0.7*length):int(0.85*length)]
text_valid = text_list[int(0.85*length):]


In [8]:
from tensorflow.keras.layers import TextVectorization

def custom_standardization(input_string):
    sentence = tf.strings.lower(input_string)
    sentence = tf.strings.regex_replace(sentence, "\n", " ")
    return sentence


In [9]:
maxlen = len(max(text_list))

vectorize_layer = TextVectorization(
    standardize = custom_standardization,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)

vectorize_layer.adapt(text_list)
vocab = vectorize_layer.get_vocabulary()
vocab_size = len(vocab)

index_lookup = dict(zip(range(len(vocab)), vocab))
# index_lookup[5]

batch_size = 64


In [10]:

train_dataset = tf.data.Dataset.from_tensor_slices(text_train)
train_dataset = train_dataset.shuffle(buffer_size=256)
train_dataset = train_dataset.batch(batch_size)

test_dataset = tf.data.Dataset.from_tensor_slices(text_test)
test_dataset = test_dataset.shuffle(buffer_size=256)
test_dataset = test_dataset.batch(batch_size)

valid_dataset = tf.data.Dataset.from_tensor_slices(text_valid)
valid_dataset = valid_dataset.shuffle(buffer_size=256)
valid_dataset = valid_dataset.batch(batch_size)


In [11]:
def preprocess_text(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y



In [12]:
train_dataset = train_dataset.map(preprocess_text)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

test_dataset = test_dataset.map(preprocess_text)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

valid_dataset = valid_dataset.map(preprocess_text)
valid_dataset = valid_dataset.prefetch(tf.data.AUTOTUNE)


In [13]:
embed_dim = 128
num_heads = 4

def create_model():
    inputs = keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    x = keras_nlp.layers.TokenAndPositionEmbedding(vocab_size, maxlen, embed_dim)(inputs)
    for i in range(4):
        x = keras_nlp.layers.TransformerDecoder(intermediate_dim=embed_dim*2, num_heads=num_heads, dropout=0.5)(x)
    do = keras.layers.Dropout(0.4)(x)
    outputs = keras.layers.Dense(vocab_size, activation='softmax')(do)
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=[keras_nlp.metrics.Perplexity(), 'accuracy']
    )
    return model


In [14]:

model = create_model()
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 56)]              0         
                                                                 
 token_and_position_embeddin  (None, 56, 128)          8482816   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, 56, 128)          132480    
 ormerDecoder)                                                   
                                                                 
 transformer_decoder_1 (Tran  (None, 56, 128)          132480    
 sformerDecoder)                                                 
                                                                 
 transformer_decoder_2 (Tran  (None, 56, 128)          132480

In [15]:
class TextSampler(keras.callbacks.Callback):
    def __init__(self, start_prompt, max_tokens):
        self.start_prompt = start_prompt
        self.max_tokens = max_tokens
    def sample_token(self, logits):
        logits, indices = tf.math.top_k(logits, k=5, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)
    def on_epoch_end(self, epoch, logs=None):
        decoded_sample = self.start_prompt
        for i in range(self.max_tokens-1):
            tokenized_prompt = vectorize_layer([decoded_sample])[:, :-1]
            predictions = self.model.predict([tokenized_prompt], verbose=0)
            sample_index = len(decoded_sample.strip().split())-1
            sampled_token = self.sample_token(predictions[0][sample_index])
            sampled_token = index_lookup[sampled_token]
            decoded_sample += " " + sampled_token
        print(f"\nSample text:\n{decoded_sample}...\n")


In [16]:
# First 5 words of a random sentence to be used as a seed
random_sentence = ' '.join(random.choice(text_valid).replace('\n', ' ').split(' ')[:4])
sampler = TextSampler(random_sentence, 30)
reducelr = keras.callbacks.ReduceLROnPlateau(patience=10, monitor='val_loss')



In [20]:

def sample_token(logits):
        logits, indices = tf.math.top_k(logits, k=5, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

def generate_text(prompt, response_length=20):
    decoded_sample = prompt
    for i in range(response_length-1):
        tokenized_prompt = vectorize_layer([decoded_sample])[:, :-1]
        predictions = model.predict([tokenized_prompt], verbose=0)
        sample_index = len(decoded_sample.strip().split())-1
        sampled_token = sample_token(predictions[0][sample_index])
        sampled_token = index_lookup[sampled_token]
        decoded_sample += " " + sampled_token
    return decoded_sample

def save_model(model):
	# serialize model to JSON
	model_json = model.to_json()
	with open("/content/models/model.json", "w") as json_file:
		json_file.write(model_json)
	# serialize weights to HDF5
	model.save_weights("/content/models/model.h5")
	# print("Saved model to disk")

def load_model():
    path = os.getcwd()
    # with redirect_stdout(open(os.devnull, 'w')):
    json_file = open("/content/models/model.json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("/content/models/model.h5")
    loaded_model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=[keras_nlp.metrics.Perplexity(), 'accuracy'])
    print("Loaded model from disk")
    return loaded_model


In [18]:

model = create_model()
history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    epochs=50,
                    callbacks=[sampler, reducelr])


Epoch 1/50
Sample text:
It practically writes itself.  to a same woman   in the lot  to a same same same the lot and be be woman to same be have a woman and...

Epoch 2/50
Sample text:
It practically writes itself.   to be not been a own man in the woman of the lot  of their woman is a woman and the same man is a man...

Epoch 3/50
Sample text:
It practically writes itself. to have a same or an sex reassignment as if it is that the same is the woman and i had a few and i think that i had...

Epoch 4/50
Sample text:
It practically writes itself. and i think that i was not just just like you are a few or in the same or not a lot and a woman who were so that...

Epoch 5/50
Sample text:
It practically writes itself. and i have a few months on a lot to the same people are a woman.  for the most of my body is the same people who had...

Epoch 6/50
Sample text:
It practically writes itself.  ) , in this has always a new wave feminism was in the transsexual empire.   and a new zealand in a man

In [21]:
save_model(model)

In [74]:
generate_text('the person I was when')


'the person I was when the same way that i am at a girl who went to a girl at the time. : 4).'