# A very simple example of text prediction
This example may help you understand how large language models are created.

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Input
from keras.utils import to_categorical

The neural network will be very simple. We will try to teach it the following text.

We will ask the neural network to complete the rest of the sentence when we enter the beginning of the sentence.

In [None]:
# Training corpus: the single sentence the network will be taught.
text = (
    "Umela inteligence je nejlepsi "
    "a zariva budoucnost lidstva matrix"
)

# Data preparation
Split the sentence into letters and create a set of letters used in the sentence.

A real LLM does not split the text into letters but into tokens. A token can be a whole word or a part of a word.

In [None]:
# Vocabulary: every distinct character of the text. Sorting gives each
# character a stable position, which the index dictionaries are built from.
chars = sorted(set(text))
print(chars)

Neural networks work with numbers, not with characters. Therefore, we need dictionaries that map each character to a number and each number back to its character.

In [None]:
# Two lookup tables over the same numbering (position in `chars`):
# character -> index for encoding, index -> character for decoding.
char_to_index = dict(zip(chars, range(len(chars))))
index_to_char = dict(enumerate(chars))

In [None]:
# Inspect the character -> index mapping (the notebook echoes a cell's last expression).
char_to_index

In [None]:
# Inspect the index -> character mapping (the notebook echoes a cell's last expression).
index_to_char

We'll turn the sentence into training sequences. The sentence is cut into overlapping sequences of 3 letters, and each sequence is labeled with the letter that follows it.

In [None]:
seq_length = 3

# Encode the whole text once, then slide a window of seq_length indices over
# it. Every window is one input sequence; the index right after the window is
# its label (the character the network should predict).
encoded_text = [char_to_index[ch] for ch in text]
window_starts = range(len(encoded_text) - seq_length)

sequences = [encoded_text[start:start + seq_length] for start in window_starts]
labels = [encoded_text[start + seq_length] for start in window_starts]

Convert the sequences and labels to NumPy arrays.

In [None]:
# X holds one row of character indices per input sequence,
# y holds the index of the character that follows each sequence.
X, y = np.array(sequences), np.array(labels)

Sample of the first sequence.

The input letters (numbers) Ume are followed by the letter l.

In [None]:
# Show the first training pair. The characters are taken from `text` rather
# than hard-coded, so the printout stays correct if the text or seq_length
# is changed.
# Input: the first seq_length characters and their index encoding.
print(text[:seq_length])
print(X[0])

# Label: the character that follows them and its index.
print(text[seq_length])
print(y[0])

Convert the data to categorical (one-hot) form: each character index becomes a vector with 1 at the position of that character and 0 everywhere else.

In [None]:
# One class per distinct character; both inputs and labels are one-hot
# encoded over the same set of classes.
vocab_size = len(chars)
X_train = to_categorical(X, num_classes=vocab_size)
Y_train = to_categorical(y, num_classes=vocab_size)

In [None]:
# Inspect the categorical encoding of the first input sequence.
X_train[0]

In [None]:
# Inspect the categorical encoding of the first label.
Y_train[0]

# Neural network

The neural network will select which category the 3 input letters belong to. 

The category name will be the expected next letter after the 3 input letters.

We will choose the layer SimpleRNN.

In [None]:
# Input: seq_length time steps, each a vector with one entry per character.
# A SimpleRNN layer with 50 units reads the sequence; the softmax Dense layer
# outputs one probability per character for the next letter.
model = Sequential([
    Input(shape=(seq_length, len(chars))),
    SimpleRNN(50, activation='relu'),
    Dense(len(chars), activation='softmax'),
])

Training the model

In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the encoded sequences for 100 epochs.
model.fit(x=X_train, y=Y_train, epochs=100)

# Starting the model
Pass the first 6 characters to the model and let it generate 50 characters.
* Select the last 3 characters from the string
* Convert the three characters to categorical input data
* Perform a single character prediction
* The prediction returns the probabilities of the different characters
* Select the character with the highest probability
* Add the character at the end of the text and repeat

In [None]:
start_seq = "Umela "
generated_text = start_seq

for _ in range(50):
    # Only the trailing seq_length characters are fed to the model.
    window = generated_text[-seq_length:]
    window_indices = np.array([[char_to_index[ch] for ch in window]])

    # Same categorical encoding as the training data (batch of one sequence).
    x_input = to_categorical(window_indices, len(chars))

    # One probability per character for the next letter.
    prediction = model.predict(x_input)
    print(prediction)

    # Greedy choice: take the class with the highest probability
    # and translate it back to its character.
    next_index = prediction.argmax()
    next_char = index_to_char[next_index]

    # Extend the text so the next iteration sees the new character.
    generated_text += next_char

In [None]:
# Show the seed followed by everything the model generated.
print("Generated text:", generated_text)