In [103]:
!pip install tensorflow
!pip install keras



In [104]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler

dim_subsecventa = 34
nr_encoders = 2
dim_embedding = 128
dim_feed_forward = 256

def Positional_Encoding(dim_subsecventa, dim_embedding):
    factori_scalare = np.array([1 / (10000 ** (2 * (pozitie_embedding // 2) / dim_embedding)) for pozitie_embedding in range(dim_embedding)])  # (1, dim_embedding)
    pozitii_initiale = np.array([[p] for p in range(dim_subsecventa)])  # (dim_subsecventa, 1)

    valori = pozitii_initiale * factori_scalare  # (dim_subsecventa, dim_embedding)
    # token_ids au initial pozitiile 0, 1, 2, etc,
    # pozitiile vor deveni arrays de dimensiune dim_embedding,
    # fiecare element din embedding fiind pozitia initiala a esantionului * factor de scalare
    rezultat = np.zeros((dim_subsecventa, dim_embedding))
    rezultat[:, 0::2] = np.sin(valori[:, 0::2])
    rezultat[:, 1::2] = np.cos(valori[:, 1::2])
    return rezultat


In [105]:
def Self_Attention(layer_precedent, dim_embedding):
    num_heads = 4
    dim_head = dim_embedding // num_heads

    # Q = ce informatie cauta un esantion de la altele, K = ce informatie detine fiecare, V = informatie deitnuta in detaliu
    Q = layers.Dense(dim_embedding)(layer_precedent) # (batch, dim_subsecventa, dim_embedding)
    print(Q.shape, "q")
    # transofrmam datele din stratul precedent pentru a aprofunda informatia deja existenta
    K = layers.Dense(dim_embedding)(layer_precedent)
    V = layers.Dense(dim_embedding)(layer_precedent)

    scoruri_atentie = layers.Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True))([Q, K]) # (batch, dim_subsecventa, dim_subsecventa)
    print(scoruri_atentie.shape, "atentie")
    # deci fiecare esantion din secventa are un scor de atentie fata de restul
    # prin Q * K.T fiecare esantion vede daca are ce obtine de la restul
    ponderi_atentie = layers.Softmax(axis=-1)(scoruri_atentie)

    # fiecare esantion primeste de la fiecare ce a cautat
    rezultat = layers.Lambda(lambda x: tf.matmul(x[0], x[1]))([ponderi_atentie, V]) # (batch, dim_subsecventa, dim_embedding)
    print(rezultat.shape, "rezultat")
    return rezultat

In [106]:
def Encoder(layer_precedent, dim_embedding, dim_feed_forward):
    self_attention = Self_Attention(layer_precedent, dim_embedding)
    self_attention += layer_precedent
    self_attention = layers.LayerNormalization()(self_attention)

    feed_forward = layers.Dense(dim_feed_forward, activation='relu')(self_attention)
    # primul strat din ff mareste dimensiunea pentru a aprofunda informatia din self_attention, iar al doilea aduce dimensiunea la loc pentru a se potrivi cu dim_encoder
    feed_forward = layers.Dense(dim_embedding)(feed_forward)

    encoder = feed_forward + self_attention
    encoder = layers.LayerNormalization()(encoder)
    return encoder

In [107]:
def Transformer(dim_subsecventa, dim_embedding, dim_vocab, nr_encoders, dim_feed_forward):
    tensor_intrare = Input(shape=(dim_subsecventa,))
    layer_embedding = layers.Embedding(dim_vocab, dim_embedding)(tensor_intrare)
    pos_encoding = Positional_Encoding(dim_subsecventa, dim_embedding)

    tensor_pos_encoding = tf.convert_to_tensor(pos_encoding, dtype=tf.float32)
    # adaugam inca o dimensiune la tensor pt a sti din ce batch face parte
    tensor_pos_encoding = tf.expand_dims(tensor_pos_encoding, axis=0)
    layer_pos_encoding = layer_embedding + tensor_pos_encoding

    layers_encoder = layer_pos_encoding
    for _ in range(nr_encoders):
        layers_encoder = Encoder(layers_encoder, dim_embedding, dim_feed_forward)
    # layers encoder are acum dim (batch, dim_subsecventa, dim_embedding)

    # obtinem informatie despre fiecare subsecventa
    layer_medie_pe_subsecvente = layers.GlobalAveragePooling1D()(layers_encoder) # (batch, dim_embedding)
    # un strat Dense care produce prob
    # activare softmax pt probabilitati
    layer_final = layers.Dense(dim_vocab, activation='softmax')(layer_medie_pe_subsecvente) # (batch, dim_vocab)

    return Model(tensor_intrare, layer_final)

In [108]:
csv_file = pd.read_csv('/content/sample_data/train.csv')

NUM_JOKES = 20000

csv_file['joke'] = csv_file['question'] + ' <sep> ' + csv_file['response']
jokes = csv_file['joke'].astype(str).tolist()
jokes = jokes[:NUM_JOKES]
print(jokes)




In [109]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(jokes)

secvente = tokenizer.texts_to_sequences(jokes)
secvente = pad_sequences(secvente, maxlen=dim_subsecventa, padding='post', truncating='post')
print(secvente)
# pt fiecare secveta va trb sa ghiceasca urmatorul cuvant

[[   5   70    7 ...    0    0    0]
 [ 307 7367   54 ...    0    0    0]
 [2901 2752 2446 ...    0    0    0]
 ...
 [  17   83    4 ...    0    0    0]
 [  34  217   25 ...    0    0    0]
 [  17   19    9 ...    0    0    0]]


In [110]:
X = []
y = []

for sec in secvente:
    for i in range(1, len(sec)):
        X.append(sec[:i])
        y.append(sec[i])

X = pad_sequences(X, maxlen=dim_subsecventa, padding='pre')
y = np.array(y)
print(X)

[[ 0  0  0 ...  0  0  5]
 [ 0  0  0 ...  0  5 70]
 [ 0  0  0 ...  5 70  7]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0 17 ...  0  0  0]
 [ 0 17 19 ...  0  0  0]]


In [None]:
dim_vocab = len(tokenizer.word_index) + 1
model = Transformer(dim_subsecventa, dim_embedding, dim_vocab, nr_encoders, dim_feed_forward)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, batch_size=64, epochs=10, validation_split=0.1)

(None, 34, 128) q
(None, 34, 4, 32) qs
(None, 34, 4, 4) atentie
(None, 34, 4, 4) ponderi
(None, 34, 4, 32) rezultat
(None, 34, 128) concat
(None, 34, 128) final
(None, 34, 128) q
(None, 34, 4, 32) qs
(None, 34, 4, 4) atentie
(None, 34, 4, 4) ponderi
(None, 34, 4, 32) rezultat
(None, 34, 128) concat
(None, 34, 128) final
Epoch 1/10
[1m9282/9282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 8ms/step - accuracy: 0.4345 - loss: 4.3252 - val_accuracy: 0.4645 - val_loss: 3.9564
Epoch 2/10
[1m9282/9282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 8ms/step - accuracy: 0.4595 - loss: 3.8740 - val_accuracy: 0.4718 - val_loss: 3.8195
Epoch 3/10
[1m9282/9282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 8ms/step - accuracy: 0.4730 - loss: 3.6561 - val_accuracy: 0.4940 - val_loss: 3.6224
Epoch 4/10
[1m5912/9282[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m25s[0m 7ms/step - accuracy: 0.4951 - loss: 3.4450

In [None]:

def genereaza_gluma(model, tokenizer, dim_subsecventa, inceput, dim_maxima_gluma, cuvinte_enervante, temperature):
    secventa = tokenizer.texts_to_sequences([inceput])
    secventa = pad_sequences(secventa, maxlen=dim_subsecventa, padding='pre')

    generated_text = inceput
    cuvinte_generate = set(inceput.lower().split())

    for _ in range(dim_maxima_gluma):
        predictie = model.predict(secventa, verbose=0)[0]

        predictie = np.asarray(predictie).astype('float64')
        predictie = np.log(predictie + 1e-8) / temperature
        exp_preds = np.exp(predictie)
        predictie = exp_preds / np.sum(exp_preds)

        index_ales = np.random.choice(len(predictie), p=predictie)
        token_ales = tokenizer.index_word.get(index_ales, '')


        retry_count = 0
        while (token_ales in cuvinte_enervante or token_ales in cuvinte_generate or token_ales == '') and retry_count < 10:
            index_ales = np.random.choice(len(predictie), p=predictie)
            token_ales = tokenizer.index_word.get(index_ales, '')
            retry_count += 1

        if token_ales == '':
            break

        if token_ales == 'sep':
          token_ales = '......'

        generated_text += ' ' + token_ales
        cuvinte_generate.add(token_ales)
        secventa = tf.concat([secventa[:, 1:], tf.constant([[index_ales]])], axis=1)

    return generated_text


In [None]:
cuvinte_de_evitat = {'joke', 'idea', 'of', 'chicken', 'little'}
inceput = 'I think'
temperature = 0.5
gluma_generata = genereaza_gluma(model, tokenizer, dim_subsecventa, inceput, 80, cuvinte_de_evitat, temperature)
print(gluma_generata)

