In [1]:
# Append location to path to allow custom modules to be used.
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

In [2]:
import itertools
import random

# NOTE: the project star-imports stay ahead of the third-party aliases on purpose, so that
# `tf`, `keras`, `layer`, `np`, `pd` bound below always win over anything the star-imports export.
import cgael
from cgael.models.SimpleColor import *
from cgael.models.extras.LanguageDiscriminator import *
from cgael.metrics import brevity

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layer

import pygad
import pygad.kerasga

import numpy as np
import pandas as pd

from PIL import Image




In [3]:
# Ask TensorFlow's native logging to drop INFO-level messages.
# NOTE(review): this cell runs after `import tensorflow` in the previous cell; the variable is
# normally set before TensorFlow is imported — confirm it still has an effect here.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

In [4]:
# Root folder for saved results. The default is the original author's local directory;
# set the CGAEL_RESULTS_DIR environment variable to run the notebook on another machine
# without editing this cell.
root_folder = os.environ.get(
    "CGAEL_RESULTS_DIR",
    os.path.join("C:", os.sep, "Users", "nicho", "PyProjects", "CGAEL_Results"),
)
weights_file = os.path.join(root_folder, "discriminator", "eng_L10_1.npy")
# Build the architecture only (no optimizer/loss) and restore previously trained weights.
discrim = LanguageDiscriminatorModel(word_length=10, compile=False)
# SECURITY: allow_pickle=True unpickles arbitrary Python objects from the file.
# Only load weight files that you created yourself or otherwise trust.
discrim.model.set_weights(np.load(weights_file, allow_pickle=True))




In [5]:
ts = cgael.LanguageTokenSet("CHAT", '-')

In [6]:
# Encode three messages as (4, 10) grids. Per the printed tensor below, each row is one
# word, 0 is padding, and the letters map to C=1, H=2, A=3, T=4.
msg1 = ts.encode("CATCH THAT CAT", (4, 10))
# NOTE(review): "CATH" is not among the real words used later in this notebook —
# confirm whether it is a deliberate non-word or a typo for "CATCH".
msg2 = ts.encode("THAT CATH HAT", (4, 10))
# Empty message: encodes to all padding (see the third block of the output).
msg3 = ts.encode("", (4, 10))
batch = tf.convert_to_tensor([msg1, msg2, msg3])
# summarize=-1 prints every element instead of eliding the middle of each axis.
tf.print(batch, summarize=-1)


[[[1 3 4 1 2 0 0 0 0 0]
  [4 2 3 4 0 0 0 0 0 0]
  [1 3 4 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0]]

 [[4 2 3 4 0 0 0 0 0 0]
  [1 3 4 2 0 0 0 0 0 0]
  [2 3 4 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0]]

 [[0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0]
  [0 0 0 0 0 0 0 0 0 0]]]


In [11]:
# Discriminator loss for the whole batch (two real-looking messages plus one empty message).
# NOTE(review): calculate_loss comes from the imported LanguageDiscriminatorModel; the class of
# the same name defined further down in this notebook does not define it.
discrim.calculate_loss(batch)

<tf.Tensor: shape=(), dtype=float32, numpy=0.28653055>

In [None]:
# Run the underlying Keras model directly on the empty message (four all-padding rows).
discrim.model(msg3)

In [None]:
# Flatten the batch into a flat list of words: every row of `a` is one word, and the
# last axis (word length) is kept unchanged.
a = tf.reshape(batch, (-1, tf.shape(batch)[-1]))
a

In [None]:
# True for rows of `a` whose index is a multiple of the words-per-message count,
# i.e. the first word of every message.
temp = tf.equal(tf.math.mod(tf.range(tf.shape(a)[-2]), tf.shape(batch)[-2]), 0)
temp

In [None]:
# True for words whose first token is non-zero (a word starting with padding is treated as empty).
mask = tf.math.not_equal(a[:,0], 0)
mask

In [None]:
# Additionally keep the first word of every message, so a completely empty message
# still contributes one (all-padding) row instead of disappearing from the batch.
mask = tf.math.logical_or(mask, temp)
mask

In [None]:
# Keep only the rows (words) selected by the mask.
values = tf.boolean_mask(a, mask, axis=0)
values

In [None]:
# Discriminator output for each remaining word.
scores = discrim(values)
scores

In [None]:
# Mean discriminator output over all kept words; the next cell turns it into a loss as 1 - avg.
avg = tf.reduce_mean(scores)
avg

In [None]:
1 - avg

In [None]:
def style_loss(discrim, data):
    """
    Loss that is low when the discriminator rates the non-empty words of `data` highly.

    Parameters
    ---
    discrim : callable
        Called once with a 2-D tensor holding one word per row; its outputs are averaged.
    data : tensor
        Token ids whose last axis is one word. Any leading axes are flattened away.
        A word whose first token is 0 is treated as empty and ignored.

    Returns
    ---
    Scalar tensor equal to 1 - mean(discrim(words)) over the non-empty words,
    or the constant 1.0 when there are no non-empty words at all.
    """
    # Reshape so that every row is a single word; the words are averaged individually,
    # so which message they came from does not matter.
    words = tf.reshape(data, (-1, tf.shape(data)[-1]))
    # Drop every word that starts with the padding token.
    non_empty = tf.math.not_equal(words[:, 0], 0)
    words = tf.boolean_mask(words, non_empty, axis=0)
    # Nothing left to score: return the worst loss instead of taking the mean of nothing.
    if tf.equal(tf.size(words), 0):
        return tf.constant(1.)
    # Evaluate the remaining words with the discriminator and present the mean as a loss.
    scores = discrim(words)
    return 1 - tf.reduce_mean(scores)

style_loss(discrim, batch)

---

In [None]:
ts = cgael.LanguageTokenSet("CHAT", '-')

In [None]:
# Real words that can be spelled with the "CHAT" alphabet; used as the discriminator's positive examples.
english_words = {
    "A", "AT", "HA",
    "ACT", "CAT", "HAT",
    "CHAT", "TACT", "THAT",
    "CATCH", "HATCH",
}

In [None]:
class LanguageDiscriminatorGenerator(keras.utils.Sequence):
    """
    Batch generator for training a word-level language discriminator.

    Each sample is, with equal probability, a word drawn from the supplied real-word
    list (label 1) or a random letter sequence that is not in that list (label 0).
    Batches are sampled at random on every access; the requested index is not used.
    """

    def __init__(self, tokens, real_words:list, encode_length:int, batch_size:int, batch_count:int, len_min:int=1, len_max:int=None):
        """
        Parameters
        ---
        tokens : cgael.LanguageTokenSet
            The language tokens to use for encoding and for the fake words.
        real_words : list(String) OR set(String)
            Words that you would like the discriminator to mark as real.
            These words should be representative of the style of the language; consider excluding outliers.
            Any iterable is accepted; it is consumed exactly once.
        encode_length : int
            How long a word should be encoded as. Will have the shape (encode_length,).
        batch_size : int
            How big a batch is.
        batch_count : int
            How many batches per generation.
        len_min : int
            The minimum length for a fake word.
            This value defaults to 1.
        len_max : int
            The maximum length for a fake word.
            This value defaults to the value of encode_length.
            NOTE(review): values above encode_length produce words longer than the encoding;
            how tokens.encode handles that is not visible here — confirm before relying on it.

        Raises
        ---
        ValueError
            If real_words is empty, or if len_min is greater than len_max.
        """
        # Let the Keras base class initialise its own state (newer Keras versions expect this).
        super().__init__()

        self.tokens = tokens
        # Materialise once, then derive the set from the list, so that one-shot iterables
        # (e.g. generators) do not leave the set empty.
        self.word_list = list(real_words) # For random values (set is not subscriptable).
        self.word_set = set(self.word_list) # For a fast way to see if a word is real or not.
        if not self.word_list:
            # Fail here rather than with an IndexError in the middle of training.
            raise ValueError("real_words must contain at least one word.")

        self.encode_shape = (encode_length,)
        self.batch_size = batch_size
        self.batch_count = batch_count
        self.len_min = len_min
        self.len_max = encode_length if len_max is None else len_max
        if self.len_min > self.len_max:
            raise ValueError(
                f"len_min ({self.len_min}) must not be greater than len_max ({self.len_max})."
            )

    def gibberish(self, length=None):
        """
        Generates a random sequence of letters that may or may not match up to a real word.

        Parameters
        ---
        length : int
            If supplied, it will generate a "word" of exactly that length.
            Otherwise, it will generate a "word" between self.len_min and self.len_max in length (inclusive).
        """
        if length is None:
            length = random.randint(self.len_min, self.len_max)
        return ''.join(random.choices(self.tokens.alphabet_tokens, k=length))

    def nonsense(self, length=None, max_attempts=10000):
        """
        Generates a random sequence of letters that is never a real word (as provided).

        Parameters
        ---
        length : int
            If supplied, it will generate a "word" of exactly that length.
            Otherwise, it will generate a "word" between self.len_min and self.len_max in length.
        max_attempts : int
            How many random candidates to try before giving up. This guards against looping
            forever when every possible candidate is a real word.

        Raises
        ---
        RuntimeError
            If no non-real word was found within max_attempts tries.
        """
        for _ in range(max_attempts):
            candidate = self.gibberish(length=length)
            if candidate not in self.word_set:
                return candidate
        raise RuntimeError(
            f"Could not generate a fake word in {max_attempts} attempts; "
            "every candidate was a real word."
        )

    # Required.
    def __len__(self):
        """Number of batches per epoch."""
        return self.batch_count

    # Required.
    def __getitem__(self, index=0):
        """
        Builds one random batch.

        Parameters
        ---
        index : int
            Ignored; every call samples a fresh batch.

        Returns
        ---
        (x, y) where x stacks the batch_size encoded words and y stacks their labels
        (1 for a real word, 0 for a fake one).
        """
        encoded_words = []
        labels = []

        for _ in range(self.batch_size):
            if random.random() < 0.5: # 50% chance of real value:
                text, label = random.choice(self.word_list), 1
            else: # 50% chance of fake value:
                text, label = self.nonsense(), 0
            encoded_words.append(self.tokens.encode(text, shape=self.encode_shape))
            labels.append(label)

        return tf.stack(encoded_words), tf.stack(labels)

In [None]:
# NOTE(review): a class with this name is already used near the top of the notebook before this
# cell runs (presumably via the star-import). Running this cell replaces it for every later cell,
# and this version does not define the calculate_loss method that is called earlier.
class LanguageDiscriminatorModel():
    """
    Small Keras network that maps one encoded word to a single sigmoid score.

    The score is trained against labels of 1 (real word) and 0 (fake word).
    """

    def __init__(self, word_length, compile=True):
        """
        Parameters
        ---
        word_length : int
            Number of tokens in an encoded word; this is the model's input width.
        compile : bool
            When True, the model is compiled with the loss, optimizer and metrics below.
            Pass False when the model is only used for inference (e.g. with loaded weights).
        """
        self.word_length = word_length
        self.model = self._build_model()

        if not compile:
            return
        self.model.compile(
            loss = self._model_loss(),
            optimizer = self._model_optimizer(),
            metrics = self._model_metrics()
        )

    def _build_model(self):
        """Builds the functional model: (word_length,) token ids in, one sigmoid value out."""
        width = self.word_length
        inputs = layer.Input((width,))
        pipeline = [
            # Treat the word as a 1-channel sequence so it can be convolved.
            layer.Reshape((width, 1)),
            layer.Conv1D(width, 5, padding="same", activation="relu"),
            # Collapse the convolution channels to one value per position.
            layer.Dense(1, activation="relu"),
            layer.Reshape((width,)),
            layer.Dense(width, activation="relu"),
            layer.Dense(1, activation="sigmoid"),
        ]
        outputs = inputs
        for stage in pipeline:
            outputs = stage(outputs)
        return keras.Model(inputs, outputs)

    def _model_loss(self):
        """Binary cross-entropy on probabilities (the model already ends in a sigmoid)."""
        return keras.losses.BinaryCrossentropy(from_logits=False)

    def _model_optimizer(self):
        """Adam with a learning rate of 0.001."""
        return keras.optimizers.Adam(0.001)

    def _model_metrics(self):
        """Metrics reported during training."""
        return ["accuracy"]

    def train(self, training_generator:LanguageDiscriminatorGenerator, epochs:int):
        """Fits the model on batches from training_generator for the given number of epochs."""
        self.model.fit(training_generator, epochs=epochs)

    def __call__(self, data):
        """Runs the underlying Keras model on data and returns its output."""
        return self.model(data)

In [None]:
gen = LanguageDiscriminatorGenerator(tokens=ts, real_words=english_words, encode_length=5, batch_size=16, batch_count=100)

In [None]:
discrim = LanguageDiscriminatorModel(5)

In [None]:
discrim.train(gen, 100)

In [None]:
discrim.model.summary()

In [None]:
# Where the demo discriminator weights are written by the next cell.
# NOTE(review): this repeats the same user-specific folder as `root_folder` at the top of the
# notebook; `filepath` is also reused by a later np.save for a different model.
filepath = os.path.join("C:",os.sep,"Users","nicho","PyProjects","CGAEL_Results","demo.npy")

In [None]:
# Persist the weights as a single object array (one entry per weight tensor).
# Reading this file back requires np.load(..., allow_pickle=True), as done at the top of the notebook.
np.save(filepath, np.array(discrim.model.get_weights(), dtype="object"))

In [None]:
discrim.model(ts.encode("TA", (1,5)))

In [None]:
# Print the discriminator's score for every real word.
for x in english_words:
    # Encode as a batch of one word of length 5 to match the model's input width.
    y = discrim.model(ts.encode(x, (1,5)))
    print(x, y.numpy())

In [None]:
# Print the discriminator's score for 20 random words that are not in the real-word set.
for _ in range(20):
    x = gen.nonsense()
    y = discrim.model(ts.encode(x, (1,5)))
    print(x, y.numpy())

In [None]:
# Sample one batch through the normal indexing protocol instead of calling the dunder directly.
# The index value is irrelevant: the generator builds a fresh random batch on every access.
gen[0]

---

In [None]:
ts = cgael.LanguageTokenSet("CHAT", '-')

In [None]:
msg1 = ts.encode("CATCH THAT CAT", shape=(4,5))
msg1

In [None]:
msg2 = ts.encode("", shape=(4,5))

In [None]:
msg3 = ts.encode("CHAT AT HATCH HAT", shape=(4,5))
msg3

In [None]:
msg4 = ts.encode("ATATA TATAT ATATA TATAT", shape=(4,5))
msg4

In [None]:
batch = tf.convert_to_tensor([msg1, msg2, msg3, msg4])
batch

In [None]:
from cgael.metrics import brevity

In [None]:
brevity.simple_brevity(msg1, power=1.2)

In [None]:
msg1

In [None]:
def power_brevity(data, word_length_power=2, word_count_power=2):
    """
    Brevity score of an encoded message: lower for fewer and shorter words.

    Parameters
    ---
    data : tensor or array
        Token ids; the last axis is one word and the second-to-last axis is the words of a message.
        Zero is treated as padding. The last two dimensions must be statically known.
    word_length_power : number
        Exponent applied to each word's fill ratio (tokens used / word length).
    word_count_power : number
        Exponent applied to the message's summed word scores divided by the number of word slots.

    Returns
    ---
    Scalar tensor. A message with no tokens at all scores 1 (the worst value) rather than 0.
    NOTE(review): for batched input the per-message scores are multiplied together by the
    final reduce_prod — confirm that a product (rather than a mean) is intended.
    """
    # The computation runs eagerly on purpose: wrapping it in a tf.function created inside
    # this function would build and trace a brand new graph on every call.

    # Binary mask of data: 1 where a token is present, 0 where padded.
    mask = tf.sign(data)

    # STEP 1: WORD LENGTH POWER
    # Get the lengths of each word.
    x = tf.math.reduce_sum(mask, axis=-1)
    # Divide by maximum length of words, placing the value on the range [0, 1].
    x = tf.math.divide(x, data.shape[-1])
    # Apply word_length_power.
    x = tf.math.pow(x, word_length_power)

    # STEP 2: WORD COUNT POWER
    # Get the sum of each word score.
    x = tf.math.reduce_sum(x, axis=-1)
    # Divide by maximum number of words, placing the value on the range [0, 1].
    x = tf.math.divide(x, data.shape[-2])
    # Apply word_count_power.
    scores = tf.math.pow(x, word_count_power)

    # Empty messages get the worst score. ones_like keeps the replacement value in the
    # same dtype as the scores, whatever dtype the input had.
    sums = tf.math.reduce_sum(mask, axis=[-2, -1])
    results = tf.where(
        condition = tf.math.equal(sums, 0),
        x = tf.ones_like(scores),
        y = scores
    )
    return tf.reduce_prod(results)
    
power_brevity(msg4, word_count_power=3)

---

In [None]:
# Number of non-padding tokens in every word (sign() turns each token id into 0 or 1).
lengths = tf.math.reduce_sum(tf.math.sign(batch), axis=-1)
lengths

In [None]:
# Number of non-padding tokens per message: sum over tokens, then over words.
scores = tf.math.reduce_sum(tf.math.reduce_sum(tf.math.sign(batch), axis=-1), axis=-1)
scores

In [None]:
# Number of non-padding tokens in the whole batch.
totals = tf.math.reduce_sum(tf.math.sign(batch))
totals

In [None]:
# Total number of token slots in the batch (product of all dimensions).
# The outer reduce_sum is applied to an already-scalar product, so it does not change the value.
area = tf.math.reduce_sum(tf.math.reduce_prod(batch.shape))
area

In [None]:
# Rebind `batch` to a single message for the experiments below.
# NOTE(review): `totals` and `area` computed in earlier cells are not recomputed here,
# so they still describe whatever `batch` held when those cells ran.
batch = msg1

In [None]:
# Step-by-step prototype of the simple_brevity function defined in a later cell.
# b: total number of token slots in `batch`, as int64.
b = tf.math.reduce_prod(tf.constant(batch.shape, dtype=tf.int64))
# e: number of non-padding tokens per message.
e = tf.math.reduce_sum(tf.math.sign(batch), axis=[-2, -1])
# f: True where a message has no tokens at all.
f = tf.math.equal(e, 0)
# g: number of token slots in one message (words * word length).
g = batch.shape[-2]*batch.shape[-1]
# h: worst score g for empty messages, otherwise token count minus one.
h = tf.where(f, g, tf.math.subtract(e,1))
i = tf.math.reduce_sum(h)
# NOTE(review): b is int64 while i inherits its dtype from `batch`; simple_brevity avoids a
# possible mismatch by computing the area in the sums' dtype — confirm this cell runs as-is.
j = tf.math.divide(i, b)
print(j)

In [None]:
def simple_brevity(data):
    """
    Brevity loss: total token usage of the batch relative to its total capacity.

    Parameters
    ---
    data : tensor
        Token ids; the last two axes are (words, tokens per word). Zero is padding.

    Returns
    ---
    Scalar tensor: the sum of the per-entry scores divided by the number of elements in data.
    Golf rules apply; lower is better.
    """
    # Non-padded token count for each entry of the batch.
    token_counts = tf.math.reduce_sum(tf.math.sign(data), axis=[-2, -1])
    count_dtype = token_counts.dtype

    # Per-entry score:
    # - an entry with no tokens gets the full area of one entry (worst possible score)
    # - any other entry gets its token count minus one (for calibration purposes)
    entry_area = tf.constant(data.shape[-2] * data.shape[-1], dtype=count_dtype)
    is_empty = tf.math.equal(token_counts, 0)
    per_entry = tf.where(is_empty, entry_area, tf.math.subtract(token_counts, 1))

    # Final loss: summed scores over the maximum achievable (every slot of every entry).
    numerator = tf.math.reduce_sum(per_entry)
    denominator = tf.math.reduce_prod(tf.shape(data, out_type=count_dtype))
    return tf.math.divide(numerator, denominator)
    
simple_brevity(batch)

In [None]:
# Debug trace of the brevity computation, printing every intermediate value.
# NOTE(review): `totals` and `area` come from earlier cells and are not recomputed here; if
# `batch` was rebound in between (see `batch = msg1`), `b` no longer matches `batch` and the
# ratio `j` mixes values from two different batches — confirm.
a = totals.numpy()
print(a)
b = area.numpy()
print(b)
c = batch.shape
print(c)
#d = batch.shape[-3]
#print(d)
# Axes to reduce over: tokens within a word and words within a message.
d = [-1, -2]
print(d)
# Non-padding tokens per message.
e = tf.math.reduce_sum(tf.math.sign(batch), axis=d)
print(e)
# True where a message is empty.
f = tf.math.equal(e, 0)
print(f)
# Token slots in one message.
g = batch.shape[-2]*batch.shape[-1]
print(g)
# Worst score for empty messages, token count minus one otherwise.
h = tf.where(f, g, tf.math.subtract(e,1))
print(h)
i = tf.math.reduce_sum(h)
print(i)
j = tf.math.divide(i, b)
print(j)
#k = tf.math.reduce_prod(tf.where())
# Convert the negative axis indices in d to their non-negative equivalents for this rank.
k = tf.math.mod(d, len(batch.shape))
print(k)

In [None]:
temp = keras.losses.MeanAbsoluteError()

In [None]:
temp(msg1, msg2)

---

In [None]:
# Binary mask: 1 where a token is present, 0 where padded.
# NOTE(review): `msg` is not defined anywhere in the visible notebook (only msg1..msg4 are);
# this cell fails on a fresh kernel unless `msg` is defined first — confirm which message was meant.
mask = tf.math.sign(msg)
mask

In [None]:
tf.math.reduce_sum(mask, axis=-1)

---

In [None]:
# Set up the colour-naming experiment: a "CHAT" alphabet, a generator over eight
# swatches, and a model that emits a single word of up to three tokens.
ts = cgael.LanguageTokenSet("CHAT", '-')
gen = SimpleColorGenerator([Swatch.WHITE, Swatch.BLACK, Swatch.RED, Swatch.GREEN, Swatch.BLUE, Swatch.YELLOW, Swatch.CYAN, Swatch.MAGENTA], blur=0, batch_lock=True)
# NOTE: `gen` previously referred to the discriminator's word generator; it is rebound here.
model = SimpleColorModel(token_set=ts, word_count=1, word_length=3)

In [None]:
# Train the model and keep the returned object, whose plot_fitness method is used in the
# evaluation section.
# NOTE(review): if mutation_percent_genes is forwarded unchanged to pygad, it is interpreted
# as a percentage, so .2 would mean 0.2% of genes rather than 20% — confirm the intended value.
ga_inst = model.train(
    generator = gen,
    generations = 100,
    num_solutions = 50,
    num_parents_mating = 5,
    mutation_percent_genes = .2,
)

In [None]:
# Persist the trained colour model's weights as an object array.
# NOTE(review): the only assignment to `filepath` in this notebook points at "demo.npy", which
# an earlier cell used for the discriminator weights; this save would overwrite that file — confirm.
np.save(filepath, np.array(model.model.get_weights(), dtype="object"))

# Evaluation

In [None]:
# Plot fitness over the generations and save the figure next to the experiment's results.
# NOTE(review): `experiment_id` and `experiment_folder` are not defined anywhere in the visible
# notebook; this cell fails on a fresh kernel unless they are set first.
plot = ga_inst.plot_fitness(title=experiment_id, color="#0C69D3")
plot.savefig(os.path.join(root_folder, experiment_folder, f"{experiment_id}.png"))

In [None]:
# Column names of the result table.
KEY_SWATCH = "swatch"
KEY_TEXT = "text"
KEY_IN = "input"
KEY_OUT = "output"
KEY_ROUND = "output_rounded"

# Run one sample of every swatch through the full model; it returns the emitted
# language tokens and the reconstructed output for each sample.
swatches = [Swatch.BLACK, Swatch.RED, Swatch.GREEN, Swatch.YELLOW, Swatch.BLUE, Swatch.MAGENTA, Swatch.CYAN, Swatch.WHITE]
samples = np.array([[sample_swatch(x)] for x in swatches])
lang, out = model.model(samples)
# Decode the emitted tokens back into readable text.
text = [ts.decode(x) for x in lang]

# Collect one row per swatch: swatch, emitted text, input, raw output and rounded output.
d = {x:[] for x in [KEY_SWATCH, KEY_TEXT, KEY_IN, KEY_OUT, KEY_ROUND]}
for s, t, i, o in zip(swatches, text, samples, out):
    d[KEY_SWATCH].append(s)
    d[KEY_TEXT].append(t)
    d[KEY_IN].append(i)
    o = o.numpy()
    d[KEY_OUT].append(o)
    # Rounded copy of the output for easier comparison by eye.
    o = np.round(o)
    d[KEY_ROUND].append(o)
df = pd.DataFrame(data=d)
display(df)

In [None]:
KEY_TEXT = "text"
KEY_OUT = "output"
KEY_ROUND = "output_rounded"

def generate_words(tokens, n):
    """
    Lists every word of length 0 to n that can be spelled with the given tokens.

    Parameters
    ---
    tokens : iterable(String)
        The letters to combine. Any iterable of strings works (list, tuple, or a plain string).
    n : int
        The maximum word length. With n = 0 only the empty word is returned.

    Returns
    ---
    list(String)
        The empty word first, then all words of length 1, then length 2, and so on.
        Within one length, words follow the order of `tokens`, with the last letter varying fastest.

    Raises
    ---
    ValueError
        If n is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative.")
    # Materialise once so one-shot iterables can be reused for every length.
    alphabet = list(tokens)
    return [
        ''.join(letters)
        for length in range(n + 1)
        for letters in itertools.product(alphabet, repeat=length)
    ]

# Every word of up to three letters from the token set's alphabet, including the empty word.
text = generate_words(ts.alphabet_tokens, 3)
# Encode each word as a one-word message of length 3 and run the listener on all of them.
data = np.array([ts.encode(x, shape=(1,3)) for x in text])
out = model.listener(data)

# Collect one row per word: the text, the listener's raw output and its rounded output.
d = {x:[] for x in [KEY_TEXT, KEY_OUT, KEY_ROUND]}
for t, o in zip(text, out):
    d[KEY_TEXT].append(t)
    o = o.numpy()
    d[KEY_OUT].append(o)
    # Rounded copy of the output for easier comparison by eye.
    o = np.round(o)
    d[KEY_ROUND].append(o)
df = pd.DataFrame(data=d)
display(df)