In [None]:
!pip install tensorflow
!pip install keras



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler

# Fixed length (in tokens) of every model input; used both for padding and for the Input shape.
dim_subsecventa = 40
# Number of Encoder blocks stacked inside the Transformer.
nr_encoders = 2
# Size of each token embedding (and of every encoder output vector).
dim_embedding = 128
# Width of the hidden Dense layer inside each encoder's feed-forward sub-layer.
dim_feed_forward = 256

def Positional_Encoding(dim_subsecventa, dim_embedding):
    """Build the sinusoidal positional-encoding matrix.

    Args:
        dim_subsecventa: number of positions (rows) to encode.
        dim_embedding: number of embedding columns per position.

    Returns:
        A float NumPy array of shape (dim_subsecventa, dim_embedding) where even
        columns hold sin(position * frequency) and odd columns hold
        cos(position * frequency). Columns 2k and 2k+1 share the same frequency.
    """
    # One scaling factor per embedding column: 1 / 10000^(2 * (column // 2) / dim_embedding).
    frecvente = []
    for coloana in range(dim_embedding):
        exponent = 2 * (coloana // 2) / dim_embedding
        frecvente.append(1 / (10000 ** exponent))
    frecvente = np.array(frecvente)  # (dim_embedding,)

    # Positions 0, 1, 2, ... as a column vector so that broadcasting yields a full grid.
    pozitii = np.array([[idx] for idx in range(dim_subsecventa)])  # (dim_subsecventa, 1)

    # Every position becomes a vector: its index multiplied by each column's scaling factor.
    unghiuri = pozitii * frecvente  # (dim_subsecventa, dim_embedding)

    # Sine on even columns, cosine on odd columns.
    coloana_para = np.arange(dim_embedding) % 2 == 0
    return np.where(coloana_para, np.sin(unghiuri), np.cos(unghiuri))


In [None]:
def Self_Attention(layer_precedent, dim_embedding):
    """Single-head scaled dot-product self-attention.

    Args:
        layer_precedent: Keras tensor produced by the previous layer; the shape
            comments below assume (batch, dim_subsecventa, dim_embedding).
        dim_embedding: width of the query/key/value projections.

    Returns:
        Keras tensor of shape (batch, dim_subsecventa, dim_embedding) where each
        position is a weighted mix of the value vectors of all positions.
    """
    # Dot products grow with the projection width; dividing by sqrt(d) keeps the
    # softmax out of its saturated region (standard scaled dot-product attention).
    factor_scalare = float(dim_embedding) ** -0.5

    # Q = what a sample is looking for in the others, K = what each sample offers,
    # V = the detailed information each sample carries.
    # All three are learned linear projections of the previous layer.
    Q = layers.Dense(dim_embedding)(layer_precedent)  # (batch, dim_subsecventa, dim_embedding)
    K = layers.Dense(dim_embedding)(layer_precedent)
    V = layers.Dense(dim_embedding)(layer_precedent)

    # Q @ K^T: every position gets a score against every other position.
    scoruri_atentie = layers.Lambda(
        lambda x: tf.matmul(x[0], x[1], transpose_b=True) * factor_scalare
    )([Q, K])  # (batch, dim_subsecventa, dim_subsecventa)
    ponderi_atentie = layers.Softmax(axis=-1)(scoruri_atentie)

    # Each position collects the values of the positions it attended to.
    rezultat = layers.Lambda(lambda x: tf.matmul(x[0], x[1]))([ponderi_atentie, V])
    return rezultat  # (batch, dim_subsecventa, dim_embedding)

In [None]:
def Encoder(layer_precedent, dim_embedding, dim_feed_forward):
    """One Transformer encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by a residual connection and layer normalization.

    Args:
        layer_precedent: Keras tensor fed into the block.
        dim_embedding: width of the block's input and output vectors.
        dim_feed_forward: width of the hidden feed-forward layer.

    Returns:
        Keras tensor produced by the second layer normalization.
    """
    # Sub-layer 1: self-attention + residual connection, then normalization.
    atentie = Self_Attention(layer_precedent, dim_embedding)
    iesire_atentie = layers.LayerNormalization()(atentie + layer_precedent)

    # Sub-layer 2: the first Dense widens the representation, the second one
    # projects it back to dim_embedding so the residual sum is shape-compatible.
    strat_ascuns = layers.Dense(dim_feed_forward, activation='relu')(iesire_atentie)
    proiectie = layers.Dense(dim_embedding)(strat_ascuns)

    return layers.LayerNormalization()(proiectie + iesire_atentie)

In [None]:
def Transformer(dim_subsecventa, dim_embedding, dim_vocab, nr_encoders, dim_feed_forward):
    """Assemble the encoder-only next-token classifier as a Keras Model.

    Args:
        dim_subsecventa: length of every input token-id sequence.
        dim_embedding: width of the token embeddings / encoder outputs.
        dim_vocab: number of distinct token ids (size of the softmax output).
        nr_encoders: how many Encoder blocks to stack.
        dim_feed_forward: hidden width of each encoder's feed-forward sub-layer.

    Returns:
        A Keras Model mapping (batch, dim_subsecventa) token ids to a
        (batch, dim_vocab) probability distribution.
    """
    tensor_intrare = Input(shape=(dim_subsecventa,))
    embedding_tokeni = layers.Embedding(dim_vocab, dim_embedding)(tensor_intrare)

    # Constant positional encoding with a leading axis of size 1 so that it
    # broadcasts over the batch dimension when added to the embeddings.
    codare_pozitii = tf.expand_dims(
        tf.convert_to_tensor(
            Positional_Encoding(dim_subsecventa, dim_embedding), dtype=tf.float32
        ),
        axis=0,
    )
    iesire = embedding_tokeni + codare_pozitii

    # Stack the encoder blocks; the shape stays (batch, dim_subsecventa, dim_embedding).
    for _ in range(nr_encoders):
        iesire = Encoder(iesire, dim_embedding, dim_feed_forward)

    # Summarize every sequence by averaging over its positions.
    medie_pe_pozitii = layers.GlobalAveragePooling1D()(iesire)  # (batch, dim_embedding)
    # Softmax head: probability of each vocabulary entry.
    probabilitati = layers.Dense(dim_vocab, activation='softmax')(medie_pe_pozitii)  # (batch, dim_vocab)

    return Model(tensor_intrare, probabilitati)

In [None]:
# Optional cap on how many jokes to use (currently not applied to `jokes`).
NUM_JOKES = 50000

# Load the question/response pairs and join each pair into one joke string.
# Rows with a missing question or response are dropped first: otherwise the
# concatenation yields NaN, which astype(str) would turn into the literal joke "nan".
csv_file = (
    pd.read_csv('/content/sample_data/train.csv')
    .dropna(subset=['question', 'response'])
    .assign(joke=lambda df: df['question'] + ' <sep> ' + df['response'])
)
jokes = csv_file['joke'].astype(str).tolist()


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer fitted on all jokes; words unseen at fit time map to "<OOV>".
# NOTE(review): presumably the default Tokenizer filters strip '<' and '>', so the
# separator is stored as 'sep' (the generation code looks for 'sep') - confirm.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(jokes)

# Every joke becomes a row of exactly dim_subsecventa token ids:
# shorter jokes are zero-padded at the end, longer ones are cut at the end.
secvente = pad_sequences(
    tokenizer.texts_to_sequences(jokes),
    maxlen=dim_subsecventa,
    padding='post',
    truncating='post',
)
print(secvente)
# For every sequence the model will have to guess the next word.

[[    5    69     7 ...     0     0     0]
 [  303  6619    53 ...     0     0     0]
 [ 3358  2770  2366 ...     0     0     0]
 ...
 [    5 10089     3 ...   314   385     0]
 [    7   124   369 ...     0     0     0]
 [    4  2874    74 ...     0     0     0]]


In [None]:
# Build (prefix -> next token) training pairs from every padded joke.
X, y = [], []

for sec in secvente:
    for lungime_prefix in range(1, len(sec)):
        tinta = sec[lungime_prefix]
        X.append(sec[:lungime_prefix])
        y.append(tinta)
        # The first padding id (0) is kept as an "end of joke" target,
        # after which the rest of the row is only padding.
        if tinta == 0:
            break

# Left-pad the prefixes so that the most recent tokens sit at the end of each row.
X = pad_sequences(X, maxlen=dim_subsecventa, padding='pre')
y = np.array(y)
print(X)

[[   0    0    0 ...    0    0    5]
 [   0    0    0 ...    0    5   69]
 [   0    0    0 ...    5   69    7]
 ...
 [   0    0    0 ...  184    4 2874]
 [   0    0    0 ...    4 2874   18]
 [   0    0    0 ... 2874   18  352]]


In [None]:
# Token ids start at 1 (0 is the padding id), hence one extra output class.
dim_vocab = 1 + len(tokenizer.word_index)

model = Transformer(
    dim_subsecventa=dim_subsecventa,
    dim_embedding=dim_embedding,
    dim_vocab=dim_vocab,
    nr_encoders=nr_encoders,
    dim_feed_forward=dim_feed_forward,
)
# Targets are integer token ids, so the sparse variant of cross-entropy is used.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# 10% of the pairs are held out for validation.
model.fit(X, y, batch_size=64, epochs=10, validation_split=0.1)

(None, 44, 128) q
(None, 44, 44) atentie
(None, 44, 128) rezultat
(None, 44, 128) q
(None, 44, 44) atentie
(None, 44, 128) rezultat
Epoch 1/10
[1m15666/15666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 9ms/step - accuracy: 0.0802 - loss: 6.5169 - val_accuracy: 0.1523 - val_loss: 5.7685
Epoch 2/10
[1m15666/15666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 9ms/step - accuracy: 0.1641 - loss: 5.5864 - val_accuracy: 0.1835 - val_loss: 5.5730
Epoch 3/10
[1m15666/15666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 9ms/step - accuracy: 0.1894 - loss: 5.3346 - val_accuracy: 0.1934 - val_loss: 5.4787
Epoch 4/10
[1m15666/15666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 9ms/step - accuracy: 0.1992 - loss: 5.1969 - val_accuracy: 0.2003 - val_loss: 5.4360
Epoch 5/10
[1m15666/15666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 9ms/step - accuracy: 0.2069 - loss: 5.0831 - val_accuracy: 0.2015 - val_loss: 5.3963
Epoch 6/10
[1m15666/15666[

<keras.src.callbacks.history.History at 0x7bc137370690>

In [None]:
def genereaza_gluma(model, tokenizer, dim_subsecventa, inceput, dim_maxima_gluma, cuvinte_de_evitat, temperature):
    """Generate a joke word by word, starting from the prompt `inceput`.

    At each step the model's next-token distribution is sharpened or flattened
    by `temperature` and a token is sampled from it. A sampled token is rejected
    (and re-sampled, up to 300 times) when it is unknown/padding, listed in
    `cuvinte_de_evitat`, or already present twice in the joke. If no acceptable
    token is found, generation stops.

    Args:
        model: object whose `predict(batch, verbose=0)` returns one probability
            vector over the vocabulary per input row.
        tokenizer: fitted tokenizer exposing `texts_to_sequences` and `index_word`.
        dim_subsecventa: length of the model's input window, in tokens.
        inceput: prompt text the joke starts with.
        dim_maxima_gluma: maximum number of tokens to generate.
        cuvinte_de_evitat: collection of words that must never be emitted.
        temperature: sampling temperature, must be > 0 (lower = greedier).

    Returns:
        The prompt followed by the generated words; the 'sep' token is rendered as '...'.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    max_reincercari = 300

    ids_inceput = tokenizer.texts_to_sequences([inceput])
    secventa = pad_sequences(ids_inceput, maxlen=dim_subsecventa, padding='pre')

    generated_text = inceput
    # Track the prompt words exactly as the tokenizer normalized them, so the
    # repetition limit compares like with like (raw `split()` would keep the
    # prompt's original casing and never match vocabulary words).
    cuvinte_generate = [tokenizer.index_word.get(idx, '') for idx in ids_inceput[0]]
    # Counting one 'sep' up front lets the separator be generated only once.
    cuvinte_generate.append('sep')

    def token_respins(token):
        # '' means padding / unknown index; a word may appear at most twice.
        return (
            token == ''
            or token in cuvinte_de_evitat
            or cuvinte_generate.count(token) >= 2
        )

    for _ in range(dim_maxima_gluma):
        predictie = np.asarray(model.predict(secventa, verbose=0)[0], dtype='float64')

        # Temperature re-scaling in log space; subtracting the maximum before
        # exp() avoids a 0/0 when a small temperature underflows every term.
        log_preds = np.log(predictie + 1e-8) / temperature
        exp_preds = np.exp(log_preds - np.max(log_preds))
        probabilitati = exp_preds / np.sum(exp_preds)

        # One initial draw plus up to `max_reincercari` re-draws.
        for _incercare in range(max_reincercari + 1):
            index_ales = np.random.choice(len(probabilitati), p=probabilitati)
            token_ales = tokenizer.index_word.get(index_ales, '')
            if not token_respins(token_ales):
                break
        else:
            # Every draw was rejected: stop instead of emitting a rejected token.
            break

        cuvinte_generate.append(token_ales)
        generated_text += ' ' + ('...' if token_ales == 'sep' else token_ales)

        # Slide the window: drop the oldest token, append the new one.
        secventa = np.concatenate([secventa[:, 1:], [[index_ales]]], axis=1)

    return generated_text


In [None]:
# Words the generator is asked to avoid.
cuvinte_de_evitat = {'joke', 'dad', 'idea', 'chicken', 'little'}
# Prompt the joke starts from.
inceput = 'My cats fight'
# Low temperature: sampling stays close to the most likely words.
temperature = 0.15

gluma_generata = genereaza_gluma(
    model=model,
    tokenizer=tokenizer,
    dim_subsecventa=dim_subsecventa,
    inceput=inceput,
    dim_maxima_gluma=30,
    cuvinte_de_evitat=cuvinte_de_evitat,
    temperature=temperature,
)
print(gluma_generata)



My cats fight and i was at a restaurant ... i think it's a lot of the time


Generated jokes (the prompt is shown in bold):

**My cat** is a sandwich ...... because they have been a big plus

**Knock kncok** who's there ...... it was a minute and the other day

**Knock kncok** who's there ...... a stick

**My friend** was talking about a sign language ...... the other day

**I failed the exam** ...... it was a lot of them

**My cats fight** ..... because they just a beef


In [None]:
vocab = tokenizer.word_index
# Printing the whole word -> index mapping floods the notebook output;
# show its size and a small sample instead.
print(len(vocab), "words in vocabulary")
print(list(vocab.items())[:20])

