In [44]:
!pip install tensorflow
!pip install keras



In [45]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler

def Positional_Encoding(dim_subsecventa, dim_embedding):
    factori_scalare = np.array([1 / (10000 ** (2 * (pozitie_embedding // 2) / dim_embedding)) for pozitie_embedding in range(dim_embedding)])  # (1, dim_embedding)
    pozitii_initiale = np.array([[p] for p in range(dim_subsecventa)])  # (dim_subsecventa, 1)

    valori = pozitii_initiale * factori_scalare  # (dim_subsecventa, dim_embedding)
    # esantioanele au initial pozitiile 0, 1, 2, etc,
    # pozitiile vor deveni arrays de dimensiune dim_embedding,
    # fiecare element din embedding fiind pozitia initiala a esantionului * factor de scalare
    rezultat = np.zeros((dim_subsecventa, dim_embedding))
    rezultat[:, 0::2] = np.sin(valori[:, 0::2])
    rezultat[:, 1::2] = np.cos(valori[:, 1::2])
    return rezultat


In [46]:
def Self_Attention(layer_precedent, dim_embedding):
    # Q = ce informatie cauta un esantion de la altele, K = ce informatie detine fiecare, V = informatie deitnuta in detaliu
    Q = layers.Dense(dim_embedding)(layer_precedent)
    #print(Q.shape)
    # transofrmam datele din stratul precedent pentru dea aprofunda informatia deja existenta
    K = layers.Dense(dim_embedding)(layer_precedent)
    V = layers.Dense(dim_embedding)(layer_precedent)

    scoruri_atentie = layers.Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True))([Q, K])
    # scoruri_atentie e de dimensiune (lungime_secventa, lungime_secventa)
    # deci fiecare esantion din secventa are un scor de atentie fata de restul
    # prin Q * K.T fiecare esantion vede daca are ce obtine de la restul
    ponderi_atentie = layers.Softmax(axis=-1)(scoruri_atentie)
    # fiecare esantion primeste de la fiecare ce a cautat
    rezultat = layers.Lambda(lambda x: tf.matmul(x[0], x[1]))([ponderi_atentie, V])
    return rezultat

In [47]:
def Encoder(layer_precedent, dim_embedding, dim_feed_forward):
    self_attention = Self_Attention(layer_precedent, dim_embedding)
    self_attention += layer_precedent
    self_attention = layers.LayerNormalization()(self_attention)

    feed_forward = layers.Dense(dim_feed_forward, activation='relu')(self_attention)
    # primul strat din ff mareste dimensiunea pentru a aprofunda informatia din self_attention, iar al doilea aduce dimensiunea la loc pentru a se potrivi cu dim_encoder
    feed_forward = layers.Dense(dim_embedding)(feed_forward)

    encoder = feed_forward + self_attention
    encoder = layers.LayerNormalization()(encoder)
    return encoder

In [48]:
def Transformer(dim_subsecventa, dim_embedding, dim_vocab, nr_encoders, dim_feed_forward):
    tensor_intrare = Input(shape=(dim_subsecventa,))
    layer_embedding = layers.Embedding(dim_vocab, dim_embedding)(tensor_intrare)
    pos_encoding = Positional_Encoding(dim_subsecventa, dim_embedding)

    tensor_pos_encoding = tf.convert_to_tensor(pos_encoding, dtype=tf.float32)
    # adaugam inca o dimensiune la tensor pt a sti din ce batch face parte
    tensor_pos_encoding = tf.expand_dims(tensor_pos_encoding, axis=0)
    layer_pos_encoding = layer_embedding + tensor_pos_encoding

    layers_encoder = layer_pos_encoding
    for _ in range(nr_encoders):
        layers_encoder = Encoder(layers_encoder, dim_embedding, dim_feed_forward)
    # layers encoder are acum dim (batch, dim_subsecventa, dim_embedding)

    # obtinem informatie despre fiecare subsecventa
    layer_medie_pe_subsecvente = layers.GlobalAveragePooling1D()(layers_encoder)
    # un strat Dense care produce prob
    # activare softmax pt probabilitati
    layer_final = layers.Dense(dim_vocab, activation='softmax')(layer_medie_pe_subsecvente)

    return Model(tensor_intrare, layer_final)

In [49]:
csv_file = pd.read_csv('/content/sample_data/shortjokes.csv')

NUM_JOKES = 50000

jokes = csv_file['Body'].head(NUM_JOKES).astype(str).to_numpy()
train_jokes, val_jokes = train_test_split(jokes, test_size=0.2, random_state=42)
print(jokes)

dim_subsecventa = 30
nr_encoders = 2
dim_embedding = 64
dim_feed_forward = 256

['[me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking"'
 'Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.'
 "I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper."
 ...
 'Why is faith greater than science? Science made buildings and planes but faith brought them together.'
 'There is a new Barbie doll on the market -  Junkie Barbie ...complete with needle tracks'
 'My Friend Told Me His Girlfriend Talks a lot in Her Sleep.. ..Apparently "I Know" wasn\'t the right answer.']


In [50]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(jokes)

secvente = tokenizer.texts_to_sequences(jokes)
secvente = pad_sequences(secvente, maxlen=dim_subsecventa, padding='post', truncating='post')


In [51]:
X = []
y = []

for sec in secvente:
    for i in range(1, len(sec)):
        X.append(sec[:i])
        y.append(sec[i])

X = pad_sequences(X, maxlen=dim_subsecventa, padding='pre')
y = np.array(y)

In [52]:
dim_vocab = tokenizer.num_words or len(tokenizer.word_index) + 1
model = Transformer(dim_subsecventa, dim_embedding, dim_vocab, nr_encoders, dim_feed_forward)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, batch_size=64, epochs=10, validation_split=0.1)

Epoch 1/10
[1m20391/20391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 6ms/step - accuracy: 0.4481 - loss: 4.2749 - val_accuracy: 0.4758 - val_loss: 3.8243
Epoch 2/10
[1m20391/20391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 6ms/step - accuracy: 0.4775 - loss: 3.7506 - val_accuracy: 0.4938 - val_loss: 3.6337
Epoch 3/10
[1m20391/20391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 6ms/step - accuracy: 0.4961 - loss: 3.5474 - val_accuracy: 0.4969 - val_loss: 3.5962
Epoch 4/10
[1m20391/20391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 6ms/step - accuracy: 0.5045 - loss: 3.4379 - val_accuracy: 0.5068 - val_loss: 3.5027
Epoch 5/10
[1m20391/20391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 6ms/step - accuracy: 0.5065 - loss: 3.3871 - val_accuracy: 0.5093 - val_loss: 3.4894
Epoch 6/10
[1m20391/20391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 6ms/step - accuracy: 0.5104 - loss: 3.3261 - val_accuracy: 0.5112 - val_loss:

<keras.src.callbacks.history.History at 0x7892109479d0>

In [105]:
def genereaza_gluma(model, tokenizer, dim_subsecventa, prompt, dim_maxima_gluma, cuvinte_enervante=None):
    if cuvinte_enervante is None:
        cuvinte_enervante = set()

    secventa = tokenizer.texts_to_sequences([prompt])
    secventa = pad_sequences(secventa, maxlen=dim_subsecventa, padding='pre')

    generated_text = prompt
    cuvinte_generate = set(prompt.split())

    for _ in range(dim_maxima_gluma):
        predictie = model.predict(secventa, verbose=0)
        sorted_indices = np.argsort(-predictie[0])

        next_token = ''
        for idx in sorted_indices:
            candidate = tokenizer.index_word.get(idx, '')
            if candidate not in cuvinte_enervante and candidate not in cuvinte_generate and candidate != '':
                next_token = candidate
                next_token_index = idx
                break

        if next_token == '':
            break

        generated_text += ' ' + next_token
        cuvinte_generate.add(next_token)
        secventa = tf.concat([secventa[:, 1:], tf.constant([[next_token_index]])], axis=1)

    return generated_text


In [113]:
cuvinte_de_evitat = {'joke', 'idea', 'of'}
inceput = 'A'
gluma_generata = genereaza_gluma(model, tokenizer, dim_subsecventa, inceput, 10, cuvinte_de_evitat)
print(gluma_generata)



A man walks into a bar and asks his wife says
