In [2]:
import string
import os
import numpy as np
import yaml
import re

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

SEQUENCE_LENGTH = 2
LENGTH = SEQUENCE_LENGTH + 1

In [3]:
num_threads = 5
os.environ["OMP_NUM_THREADS"] = "5"
os.environ["TF_NUM_INTRAOP_THREADS"] = "5"
os.environ["TF_NUM_INTEROP_THREADS"] = "5"
tf.config.threading.set_inter_op_parallelism_threads(num_threads)
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
tf.config.set_soft_device_placement(True)

In [4]:
with open('/home/tina/Downloads/ai.yml', 'r') as file:
    data = yaml.safe_load(file)
    
# print(data['conversations'])  

all_texts = []
for inner_list in data['conversations']:
    for text in inner_list:
        all_texts.append(text)
        
# print(all_texts)        

In [5]:
cleaned_texts = []

pattern = r'[^a-zA-Z\s]'
for text in all_texts:
    cleaned_text = re.sub(pattern, '', text)
    cleaned_text = cleaned_text.lower()
    cleaned_texts.append(cleaned_text)
# print(cleaned_texts)

In [6]:
def clean_doc(doc):
    tokens = doc.split()
    tokens = [' ' if w in string.punctuation else w for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [7]:
all_tokens = []

for text in all_texts:
    tokens = clean_doc(text)
    all_tokens.extend(tokens)
print(len(all_tokens))    

1185


In [7]:
# plato = open("/home/tina/Downloads/plato.txt", "r").read()
# text = plato
# tokens = clean_doc(text)

# print (len(tokens))

In [8]:
number_of_unique_tokens = len(set(tokens))

print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % number_of_unique_tokens)
print('These are the first 200 tokens: %s' % tokens[:200])

Total Tokens: 5
Unique Tokens: 5
These are the first 200 tokens: ['i', 'am', 'just', 'an', 'artificial']


In [9]:
SEQUENCE_LENGTH = 2
LENGTH = SEQUENCE_LENGTH + 1
sequences = list()
for i in range(LENGTH, len(tokens)):
    seq = tokens[i-LENGTH:i]
    line = ' '.join(seq)
    sequences.append(line)

print ('Total Sequences: %d' % len(sequences))
print ('This is the first sequence: {0}'.format(sequences[0]))

Total Sequences: 2
This is the first sequence: i am just


In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)
vocab_size = number_of_unique_tokens + 1

In [11]:
sequences0 = np.array(sequences)
X, y = sequences0[:,:-1], sequences0[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [12]:
dimensions_to_represent_word = 100
 
model = Sequential()
model.add(Embedding(vocab_size, SEQUENCE_LENGTH, input_length=SEQUENCE_LENGTH))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

2023-09-18 10:25:49.022474: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-18 10:25:49.023842: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 2)              12        
                                                                 
 lstm (LSTM)                 (None, 2, 100)            41200     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 6)                 606       
                                                                 
Total params: 132318 (516.87 KB)
Trainable params: 132318 (516.87 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [13]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=100)
model.save("./data/chat-model-prediction.mdl")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

INFO:tensorflow:Assets written to: ./data/chat-model-prediction.mdl/assets


In [14]:
print (X.shape)
prediction = model.predict(X[0].reshape(1,SEQUENCE_LENGTH))
print (prediction.shape)
print (prediction)

(2, 2)
(1, 6)
[[3.1910190e-06 3.8800863e-05 6.5214884e-01 1.4950907e-05 3.4775102e-01
  4.3320964e-05]]


In [21]:
loaded_model = load_model("./data/chat-model-prediction.mdl")
test_texts = [
    'what is',
    'are you ',
    'im not that '
]

test_sequences = [clean_doc(text) for text in test_texts]
print(test_sequences)
test_sequences = tokenizer.texts_to_sequences(test_sequences)
print(test_sequences)

for i, test_sequence in enumerate(test_sequences):
    if len(test_sequence) < SEQUENCE_LENGTH:
        continue
    input_sequence = test_sequence[-SEQUENCE_LENGTH:]
    input_sequence = np.array(input_sequence).reshape(1, -1)  
    predicted_probs = loaded_model.predict(input_sequence)
    
    predicted_word_index = np.argmax(predicted_probs)
    
    predicted_word = tokenizer.index_word[predicted_word_index]
    print(predicted_word)
    print(f"Input: {' '.join(test_sequence)} | Predicted: {predicted_word}")

[['what', 'is'], ['are', 'you'], ['im', 'not', 'that']]
[[], [], []]
