In [1]:
#import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import time
import pandas as pd
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from nlpy.tf_models.preprocess import prep_text, dataset_callable
from nlpy.tf_models.simple_text_cnn import SimpleTextCNN, train_step, test_step
tf.__version__

  return f(*args, **kwds)
  return f(*args, **kwds)


'2.8.0'

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
df = pd.read_csv("../data/yelp.csv")
train_df, test_val_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_val_df,test_size=0.5, random_state=42)

In [4]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [5]:
hparams = {
    "batch_size": 128,
    "conv_channels": 24,
    "conv_kernel_size": 4,
    "conv_stride": 2,
    "conv_padding": 'valid',
    "conv_dilation": 1,
    "embedding_dim": 100,
    "dropout_rate": 0.25,
    "output_size": 1,
    "learning_rate": 0.005,
    "max_num_words": 10000,
    "max_sequence_length": 250}

In [6]:
tokenizer = Tokenizer(num_words=hparams["max_num_words"])
tokenizer.fit_on_texts(train_df["text"])

In [8]:
train_dataset = dataset_callable(
    train_df["text"], train_df["stars"], tokenizer,
    hparams["max_sequence_length"], hparams["batch_size"])

test_dataset = dataset_callable(
    test_df["text"], test_df["stars"], tokenizer,
    hparams["max_sequence_length"], hparams["batch_size"])

val_dataset = dataset_callable(
    val_df["text"], val_df["stars"], tokenizer,
    hparams["max_sequence_length"], hparams["batch_size"])

In [9]:
tf.random.set_seed(42)
model = SimpleTextCNN(
  sequence_length=hparams["max_sequence_length"],
  embedding_input_dim=hparams["max_num_words"],
  embedding_dim=hparams["embedding_dim"],
  conv_channels=hparams["conv_channels"],
  conv_kernel_size=hparams["conv_kernel_size"],
  conv_stride=hparams["conv_stride"],
  conv_padding=hparams["conv_padding"],
  conv_dilation=hparams["conv_dilation"],
  dropout_rate=hparams["dropout_rate"],
  output_size=hparams["output_size"])

loss_object = tf.keras.losses.MeanSquaredError()

optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_rmse = tf.keras.metrics.RootMeanSquaredError(name='train_rmse')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_rmse = tf.keras.metrics.RootMeanSquaredError(name='val_rmse')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_rmse = tf.keras.metrics.RootMeanSquaredError(name='test_rmse')

In [12]:
def train(model, train_loss, train_dataset, train_rmse,
    val_loss, val_dataset, val_rmse, epochs=5):
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train RMSE':^9} " +
          f"| {'Val Loss':^10} | {'Val RMSE':^9} | {'Elapsed':^9}")
    print("-"*60)
    best_val_loss = None
    for epoch in range(epochs):
        t0_epoch = time.time()
        train_loss.reset_state()
        train_rmse.reset_state()
        val_loss.reset_state()
        val_rmse.reset_state()

        for train_features, train_targets in train_dataset():
            train_step(model, train_features, train_targets,
            loss_object, optimizer, train_loss, train_rmse)

        for val_features, val_targets in val_dataset():
            test_step(model, val_features, val_targets,
            loss_object, val_loss, val_rmse)

        time_elapsed = time.time() - t0_epoch
        print(f"{epoch+1:^7} | {train_loss.result():^12.6f} | {train_rmse.result():^9.2f}" +
              f" | {val_loss.result():^10.6f}" +
              f" | {val_rmse.result():^9.2f} | {time_elapsed:^9.2f}")

        if best_val_loss is not None and val_loss.result() > best_val_loss:
            print("Stopping early: Val loss increased")
            break
        else:
            best_val_loss = val_loss.result()
    print(f"Training completed! Final validation RMSE: {val_rmse.result():.2f}.")



In [13]:
train(model, train_loss, train_dataset, train_rmse, val_loss, val_dataset, val_rmse, epochs=25)

Start training...

 Epoch  |  Train Loss  | Train RMSE |  Val Loss  | Val RMSE  |  Elapsed 
------------------------------------------------------------
   1    |   2.078290   |   1.44    |  1.455616  |   1.21    |   2.89   
   2    |   1.873044   |   1.37    |  1.344856  |   1.16    |   2.41   
   3    |   1.733359   |   1.32    |  1.246642  |   1.12    |   2.38   
   4    |   1.535503   |   1.24    |  1.094809  |   1.04    |   2.34   
   5    |   1.353235   |   1.16    |  0.987829  |   0.99    |   2.36   
   6    |   1.197986   |   1.10    |  0.955442  |   0.98    |   2.34   
   7    |   1.117679   |   1.06    |  0.937782  |   0.97    |   2.35   
   8    |   1.042802   |   1.02    |  0.936548  |   0.97    |   2.39   
   9    |   0.962515   |   0.98    |  0.931646  |   0.96    |   2.41   
  10    |   0.918046   |   0.96    |  0.924252  |   0.96    |   2.42   
  11    |   0.877753   |   0.94    |  0.914719  |   0.95    |   2.44   
  12    |   0.851820   |   0.92    |  0.930068  |   0.9