In [8]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras import models, layers

In [2]:
# Single source of truth for the CPU thread budget; every setting below is
# derived from it so the values cannot drift apart.
num_threads = 5

# NOTE(review): these variables are set after `import tensorflow` has already
# run in the imports cell, so libraries that read them at import time may not
# see them -- confirm; the explicit tf.config calls below do not depend on them.
os.environ.update({
    "OMP_NUM_THREADS": str(num_threads),
    "TF_NUM_INTRAOP_THREADS": str(num_threads),
    "TF_NUM_INTEROP_THREADS": str(num_threads),
})

# Explicit TensorFlow thread-pool sizes (between independent ops / within one op).
tf.config.threading.set_inter_op_parallelism_threads(num_threads)
tf.config.threading.set_intra_op_parallelism_threads(num_threads)
# Allow ops to be placed on another device when the requested one is unavailable.
tf.config.set_soft_device_placement(True)

In [3]:
# Number of word vectors kept per text: longer texts are truncated and shorter
# ones are zero-padded up to this length by text_to_word_vectors.
MAX_WORDS_IN_SENTENCE = 700
# Length of a single word vector; must match the embedding file that is loaded
# (glove.6B.100d.txt, i.e. 100 values per word).
WORD2VEC_DIMENSIONS = 100

In [4]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# The file location can be overridden with the GLOVE_PATH environment variable;
# it defaults to the original local path.
glove_path = os.environ.get('GLOVE_PATH', '/home/tina/Downloads/glove.6B.100d.txt')

embeddings_index = {}
# Explicit UTF-8: without it open() uses the platform default encoding, which
# can fail on the non-ASCII tokens in the vocabulary.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        # Each line is "<word> <v1> <v2> ...": first token is the word, the rest its vector.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [5]:
def text_to_word_vectors(text, vector_space_dimension, embeddings=None):
    """Convert text into a fixed-length list of word-embedding vectors.

    The text is lower-cased and split on whitespace; tokens missing from the
    embedding lookup are dropped. The result always has exactly
    ``vector_space_dimension`` entries: extra tokens are truncated and missing
    positions are filled with zero vectors, so every text maps to the same
    shape and the results can be stacked into one array.

    Args:
        text: Input string.
        vector_space_dimension: Number of word vectors to return (the
            sequence length, not the size of a single embedding).
        embeddings: Optional mapping of word -> vector. Defaults to the
            notebook-level ``embeddings_index``.

    Returns:
        A list of ``vector_space_dimension`` vectors. An empty list is
        returned only when no token matched and the lookup itself is empty,
        because the vector size is then unknown.
    """
    if embeddings is None:
        embeddings = embeddings_index

    sentence_vectors = []
    for word in text.lower().split():
        # Stop as soon as the requested length is reached.
        if len(sentence_vectors) >= vector_space_dimension:
            break
        if word in embeddings:
            sentence_vectors.append(embeddings[word])

    pad_len = vector_space_dimension - len(sentence_vectors)
    if pad_len > 0:
        # Use a known vector as the template for the zero padding. When no
        # token matched, borrow one from the lookup so the text still yields
        # a full-size all-zero matrix instead of an empty list.
        if sentence_vectors:
            template = sentence_vectors[0]
        else:
            template = next(iter(embeddings.values()), None)
        if template is not None:
            sentence_vectors = sentence_vectors + [np.zeros_like(template)] * pad_len
    return sentence_vectors

def clean_dataframe(df):
    """Return a copy of ``df`` whose 'text' column keeps only letters and whitespace.

    Every value in 'text' is converted to ``str`` first; all characters other
    than ASCII letters and whitespace are then removed. The input frame is
    left unmodified.
    """
    non_letters = re.compile(r'[^a-zA-Z\s]')

    def strip_non_letters(value):
        return non_letters.sub('', value)

    result = df.copy()
    as_text = result['text'].astype(str)
    result['text'] = as_text.apply(strip_non_letters)
    return result

In [6]:
#

In [9]:
# Read the recipe dataset and strip non-letter characters from its text column.
dataset = clean_dataframe(pd.read_csv('./data/recipe-dataset.csv'))

# Model inputs (text as str) and the label column used as targets.
texts, labels = dataset['text'].astype(str), dataset['label']

In [10]:
# One list of MAX_WORDS_IN_SENTENCE word vectors per text, stacked into an array.
data = np.array(
    [text_to_word_vectors(text, MAX_WORDS_IN_SENTENCE) for text in texts]
)
# Labels as float32 targets for the sigmoid / binary cross-entropy model.
targets = np.array(labels).astype("float32")

In [11]:
# Positional split: the first 1500 rows are held out, the remainder is used for training.
# NOTE(review): rows are not shuffled before splitting -- confirm the CSV is not ordered by label.
test_x, test_y = data[:1500], targets[:1500]
train_x, train_y = data[1500:], targets[1500:]

# Flatten each (words, embedding) matrix into one feature vector for the Dense input layer.
test_x = test_x.reshape(len(test_x), MAX_WORDS_IN_SENTENCE * WORD2VEC_DIMENSIONS)
train_x = train_x.reshape(len(train_x), MAX_WORDS_IN_SENTENCE * WORD2VEC_DIMENSIONS)

In [12]:
# Sanity check: training rows and flattened feature width, plus matching label count.
train_x.shape, train_y.shape

((1828, 70000), (1828,))

In [13]:
# Sanity check: held-out rows and flattened feature width, plus matching label count.
test_x.shape, test_y.shape

((1500, 70000), (1500,))

In [14]:
# Fully connected binary classifier over the flattened word-vector input:
# three 50-unit ReLU layers (dropout after the first two) and a sigmoid output.
model = models.Sequential([
    layers.Dense(50, activation="relu", input_shape=(MAX_WORDS_IN_SENTENCE * WORD2VEC_DIMENSIONS, )),
    layers.Dropout(0.3),
    layers.Dense(50, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(50, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                3500050   
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 3505201 (13.37 MB)
Trainable params: 35052

2023-09-15 08:47:40.744395: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-15 08:47:40.745350: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [15]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
# The held-out split is also passed as validation data, so it is scored after every epoch.
results = model.fit(
    x=train_x,
    y=train_y,
    batch_size=32,
    epochs=5,
    validation_data=(test_x, test_y),
)

Epoch 1/5


2023-09-15 08:47:43.274350: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 511840000 exceeds 10% of free system memory.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Round-trip the trained model through disk and score the reloaded copy,
# to confirm the saved artefact behaves like the in-memory model.
model_path = './data/keras-w2v-recipe-100d.mdl'
model.save(model_path)
loaded_model = models.load_model(model_path)

# evaluate() returns the loss followed by the compiled metrics; index 1 is
# presumably the accuracy, given metrics=["accuracy"] at compile time.
scores = loaded_model.evaluate(test_x, test_y, verbose=0)
print("Loaded Model Accuracy: %.2f%%" % (scores[1] * 100))

INFO:tensorflow:Assets written to: ./data/keras-w2v-recipe-100d.mdl/assets


INFO:tensorflow:Assets written to: ./data/keras-w2v-recipe-100d.mdl/assets


Loaded Model Accuracy: 99.87%


In [20]:
# Score a single free-text sample with the reloaded model.
text = '''
To encourage thoughtful and respectful conversations
'''
v = np.array(text_to_word_vectors(text, MAX_WORDS_IN_SENTENCE))
# Build a batch of one flattened feature vector, shape (1, words * embedding),
# which is the layout the model was declared and trained with. The previous
# (1, words * embedding, 1) tensor had an extra trailing axis that did not
# match the model's declared input shape.
v = v.reshape(1, MAX_WORDS_IN_SENTENCE * WORD2VEC_DIMENSIONS)
t = tf.convert_to_tensor(v)

loaded_model.predict(t)



array([[7.116231e-05]], dtype=float32)