In [1]:
from math import floor
#import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import time
import pandas as pd
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
from tensorflow.keras.layers import Embedding, Conv1D, ReLU, MaxPool1D, Dense, Dropout
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
tf.__version__

'2.8.0'

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
df = pd.read_csv("../data/yelp.csv")
train_df, test_val_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_val_df,test_size=0.5, random_state=42)

In [3]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [4]:
hparams = {
    "batch_size": 128,
    "conv_channels": 24,
    "conv_kernel_size": 4,
    "conv_stride": 2,
    "conv_padding": 'valid',
    "conv_dilation": 1,
    "embedding_dim": 100,
    "dropout_rate": 0.25,
    "output_size": 1,
    "learning_rate": 0.005,
    "max_num_words": 10000,
    "max_sequence_length": 250}

In [5]:
tokenizer = Tokenizer(num_words=hparams["max_num_words"])
tokenizer.fit_on_texts(train_df["text"])

def prep_text(texts, tokenizer, max_sequence_length):
    # Turns text into into padded sequences.
    for text in texts:
        text_sequences = tokenizer.texts_to_sequences([text])
        yield sequence.pad_sequences(
            text_sequences, maxlen=max_sequence_length,
            padding='post', truncating='post').reshape(-1)

#text_train = lambda: prep_text(train_df["text"], tokenizer, hparams["max_sequence_length"])
#text_test = lambda: prep_text(test_df["text"], tokenizer, hparams["max_sequence_length"])
#text_val = lambda: prep_text(val_df["text"], tokenizer, hparams["max_sequence_length"])

In [6]:
def dataset_callable(df, tokenizer, max_sequence_length, batch_size):
    text_prepped = lambda: prep_text(df["text"], tokenizer, max_sequence_length)
    dataset = lambda: tf.data.Dataset.from_generator(
        lambda: zip(text_prepped(),(i for i in df["stars"])),
        output_signature=(
        tf.TensorSpec(shape=(max_sequence_length),dtype=tf.int32),
        tf.TensorSpec(shape=(),dtype=tf.int32))).batch(batch_size)
    return dataset

train_dataset = dataset_callable(
    train_df, tokenizer, hparams["max_sequence_length"], hparams["batch_size"])

test_dataset = dataset_callable(
    test_df, tokenizer, hparams["max_sequence_length"], hparams["batch_size"])

val_dataset = dataset_callable(
    val_df, tokenizer, hparams["max_sequence_length"], hparams["batch_size"])


# train_dataset = lambda: tf.data.Dataset.from_generator(
#     lambda: zip(text_train,(i for i in train_df["stars"])),
#     output_signature=(
#         tf.TensorSpec(shape=(hparams["max_sequence_length"]),dtype=tf.int32),
#         tf.TensorSpec(shape=(),dtype=tf.int32))).batch(hparams["batch_size"])

# test_dataset = lambda: tf.data.Dataset.from_generator(
#     lambda: zip(text_test,(i for i in test_df["stars"])),
#     output_signature=(
#         tf.TensorSpec(shape=(hparams["max_sequence_length"]),dtype=tf.int32),
#         tf.TensorSpec(shape=(),dtype=tf.int32))).batch(hparams["batch_size"])

# val_dataset = lambda: tf.data.Dataset.from_generator(
#     lambda: zip(text_val,(i for i in val_df["stars"])),
#     output_signature=(
#         tf.TensorSpec(shape=(hparams["max_sequence_length"]),dtype=tf.int32),
#         tf.TensorSpec(shape=(),dtype=tf.int32))).batch(hparams["batch_size"])

In [7]:
class SimpleTextCNN(Model):
    def __init__(self,sequence_length=64,embedding_input_dim=1024,
        embedding_dim=64, conv_channels=24,conv_kernel_size=4,conv_stride=2,
        conv_padding='valid',conv_dilation=1, dropout_rate=0.25,
        output_size=1):
        super(SimpleTextCNN, self).__init__()
        self.embedding = Embedding(
            input_dim=embedding_input_dim,
            output_dim=embedding_dim,
            input_length=sequence_length)
        self.conv1d = Conv1D(
            filters=conv_channels,
            kernel_size=conv_kernel_size,
            strides=conv_stride,
            padding=conv_padding,
            data_format='channels_last',
            dilation_rate=conv_dilation)
        if conv_padding == 'valid':
            self.conv_output_layer_size = floor(
                (sequence_length - conv_dilation*(conv_kernel_size - 1) - 1) /
                conv_stride + 1)
        elif conv_padding == 'same':
            self.conv_output_layer_size = floor(
                (sequence_length - (conv_dilation-1)*(conv_kernel_size - 1) - 1) /
                conv_stride + 1)
        else:
            raise(ValueError(f"conv_padding must be one of 'valid' or 'same'" +
            f"but received {conv_padding}"))
        self.relu = ReLU()
        self.maxp1d = MaxPool1D(pool_size=self.conv_output_layer_size)
        self.dropout = Dropout(rate=dropout_rate)
        self.dense = Dense(units=output_size)

    def call(self,x):
        x = self.embedding(x)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.maxp1d(x)
        x = self.dropout(x)
        x = tf.squeeze(x)
        x = self.dense(x)
        return x

In [16]:
tf.random.set_seed(42)
model = SimpleTextCNN(
  sequence_length=hparams["max_sequence_length"],
  embedding_input_dim=hparams["max_num_words"],
  embedding_dim=hparams["embedding_dim"],
  conv_channels=hparams["conv_channels"],
  conv_kernel_size=hparams["conv_kernel_size"],
  conv_stride=hparams["conv_stride"],
  conv_padding=hparams["conv_padding"],
  conv_dilation=hparams["conv_dilation"],
  dropout_rate=hparams["dropout_rate"],
  output_size=hparams["output_size"])

loss_object = tf.keras.losses.MeanSquaredError()

optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_rmse = tf.keras.metrics.RootMeanSquaredError(name='train_rmse')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_rmse = tf.keras.metrics.RootMeanSquaredError(name='val_rmse')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_rmse = tf.keras.metrics.RootMeanSquaredError(name='test_rmse')

@tf.function
def train_step(model, features, targets, loss_object, optimizer,
    train_loss, train_rmse):
    with tf.GradientTape() as tape:
    # training=True is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
        predictions = model(features, training=True)
        loss = loss_object(targets, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_rmse(targets, predictions)

@tf.function
def test_step(model, features, targets, test_loss, test_rmse):
  # training=False is only needed if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
    predictions = model(features, training=False)
    t_loss = loss_object(targets, predictions)

    test_loss(t_loss)
    test_rmse(targets, predictions)

In [17]:
def train(model, train_loss, train_dataset, train_rmse,
    val_loss, val_dataset, val_rmse, epochs=5):
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train RMSE':^9} " +
          f"| {'Val Loss':^10} | {'Val RMSE':^9} | {'Elapsed':^9}")
    print("-"*60)
    best_val_loss = None
    for epoch in range(epochs):
        t0_epoch = time.time()
        train_loss.reset_state()
        train_rmse.reset_state()
        val_loss.reset_state()
        val_rmse.reset_state()

        for train_features, train_targets in train_dataset():
            train_step(model, train_features, train_targets,
            loss_object, optimizer, train_loss, train_rmse)

        for val_features, val_targets in val_dataset():
            test_step(model, val_features, val_targets, val_loss, val_rmse)

        time_elapsed = time.time() - t0_epoch
        print(f"{epoch+1:^7} | {train_loss.result():^12.6f} | {train_rmse.result():^9.2f}" +
              f" | {val_loss.result():^10.6f}" +
              f" | {val_rmse.result():^9.2f} | {time_elapsed:^9.2f}")

        if best_val_loss is not None and val_loss.result() > best_val_loss:
            print("Stopping early: Val loss increased")
            break
        else:
            best_val_loss = val_loss.result()
    print(f"Training completed! Final validation RMSE: {val_rmse.result():.2f}.")



In [18]:
train(model, train_loss, train_dataset, train_rmse, val_loss, val_dataset, val_rmse, epochs=25)

Start training...

 Epoch  |  Train Loss  | Train RMSE |  Val Loss  | Val RMSE  |  Elapsed 
------------------------------------------------------------
   1    |   7.919500   |   2.82    |  1.586187  |   1.26    |   3.32   
   2    |   2.078290   |   1.44    |  1.455616  |   1.21    |   2.31   
   3    |   1.873044   |   1.37    |  1.344856  |   1.16    |   2.34   
   4    |   1.733359   |   1.32    |  1.246642  |   1.12    |   2.31   
   5    |   1.535503   |   1.24    |  1.094809  |   1.04    |   2.32   
   6    |   1.353235   |   1.16    |  0.987829  |   0.99    |   2.32   
   7    |   1.197986   |   1.10    |  0.955442  |   0.98    |   2.36   
   8    |   1.117679   |   1.06    |  0.937782  |   0.97    |   2.34   
   9    |   1.042802   |   1.02    |  0.936548  |   0.97    |   2.37   
  10    |   0.962515   |   0.98    |  0.931646  |   0.96    |   2.30   
  11    |   0.918046   |   0.96    |  0.924252  |   0.96    |   2.47   
  12    |   0.877753   |   0.94    |  0.914719  |   0.9

In [12]:
os.environ["PATH"]

'C:\\Users\\grego\\Anaconda3;C:\\Users\\grego\\Anaconda3\\Library\\mingw-w64\\bin;C:\\Users\\grego\\Anaconda3\\Library\\usr\\bin;C:\\Users\\grego\\Anaconda3\\Library\\bin;C:\\Users\\grego\\Anaconda3\\Scripts;C:\\Users\\grego\\Anaconda3\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6\\bin;C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6\\libnvvp;C:\\Windows\\system32;C:\\Windows;C:\\Program Files\\Git\\cmd;C:\\Windows\\System32\\Wbem;C:\\Windows\\System32\\WindowsPowerShell\\v1.0;C:\\Windows\\System32\\OpenSSH;C:\\Program Files (x86)\\NVIDIA Corporation\\PhysX\\Common;C:\\Program Files\\Java\\jdk-14.0.1\\bin;C:\\Users\\grego\\AppData\\Local\\GitHubDesktop\\app-2.5.2\\resources\\app\\git\\cmd;C:\\Users\\grego\\AppData\\Local\\Programs\\MiKTeX 2.9\\miktex\\bin\\x64;C:\\Program Files\\Snowflake SnowSQL;C:\\Users\\grego\\Documents\\GitHub;C:\\Program Files\\maven\\apache-maven-3.6.3\\bin;C:\\Users\\grego\\Anaconda3;C:\\Users\\grego\\Anaconda3\\Library\\mingw-w6