In [4]:
from google.cloud import bigquery
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np


In [2]:

# Initialize a BigQuery client
client = bigquery.Client()

# Define your query
query = """
WITH sparse_feature AS (
  SELECT
    review_number,
    review,
    STRING_AGG(CAST(word_index AS STRING), ', ') AS feature,
    label,
    split
  FROM (
    SELECT
      DISTINCT review_number,
      review,
      word,
      label,
      split
    FROM
      sparse_features_demo.processed_reviews,
      UNNEST(words) AS word
    WHERE
      word IN (SELECT word FROM sparse_features_demo.vocabulary)
  ) AS word_list
  LEFT JOIN
    sparse_features_demo.vocabulary AS topk_words
    ON word_list.word = topk_words.word
  GROUP BY
    review_number,
    review,
    label,
    split
)
SELECT review, feature, label FROM sparse_feature
"""

df = client.query(query).to_dataframe()


def convert_to_labels(text):
    return 0 if 'Negative' in text else 1


df['label'] = df.label.apply(convert_to_labels)
# Convert the 'feature' column from a comma-separated string to a list of integers
df['feature'] = df['feature'].apply(lambda x: list(map(int, x.split(', '))))

# Ensure all sequences are of max_length by padding
max_length = 100  # Define your max sequence length
df['feature'] = df['feature'].apply(lambda x: x + [0] * (max_length - len(x)) if len(x) < max_length else x[:max_length])

X = df['feature'].tolist()
y = df['label'].tolist()

In [5]:

X_train, X_test, y_train, y_test = train_test_split(X,y,  test_size=0.2, random_state=42)

X_test, X_val, y_test, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
X_val = np.array(X_val)
y_val = np.array(y_val)


In [7]:
# Hyperparameters
vocab_size = 30000
max_length = 100
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(max_length,)),
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])





In [8]:

#Create trainnig dataset
batch_size= 100
train_ds = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))
train_ds = train_ds.shuffle(buffer_size=len(X_train)).batch(batch_size)
print('Input is : ', train_ds)

# Create validation dataset
val_ds = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_val), tf.convert_to_tensor(y_val)))
val_ds = val_ds.batch(batch_size)

model.compile(optimizer='adam', 
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

history = model.fit(train_ds, validation_data=val_ds, epochs = 1)


Input is :  <_BatchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 172ms/step - accuracy: 0.6713 - loss: 0.5534 - val_accuracy: 0.9151 - val_loss: 0.2280


In [9]:

test_ds = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_test), tf.convert_to_tensor(y_test)))
test_ds = test_ds.batch(batch_size)
test_loss, test_acc = model.evaluate(test_ds)
print('\nTest accuracy: {}'.format(test_acc))


[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 52ms/step - accuracy: 0.9130 - loss: 0.2318

Test accuracy: 0.9164528846740723


In [24]:
#model.save('model_4.h5')
tf.saved_model.save(model, 'model_4/')

INFO:tensorflow:Assets written to: model_4/assets


INFO:tensorflow:Assets written to: model_4/assets


In [21]:
from google.cloud import storage
import os

def upload_to_gcs(bucket_name, source_folder, destination_blob_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    
    for root, _, files in os.walk(source_folder):
        for file in files:
            if '.h5' in file:
                local_path = os.path.join(root, file)
                blob_path = os.path.relpath(local_path, source_folder)
                blob = bucket.blob(os.path.join(destination_blob_name, blob_path))
                blob.upload_from_filename(local_path)
                print(f"Uploaded {local_path} to {blob_path}")

bucket_name = 'reddit_raw_data_0184598608709384596'
source_folder = '.'
destination_blob_name = 'model_v4/'
print('start upload')

upload_to_gcs(bucket_name, source_folder, destination_blob_name)

start upload
Uploaded .\model_4.h5 to model_4.h5
Uploaded .\.env\Lib\site-packages\h5py\tests\data_files\vlen_string_dset.h5 to .env\Lib\site-packages\h5py\tests\data_files\vlen_string_dset.h5
Uploaded .\.env\Lib\site-packages\h5py\tests\data_files\vlen_string_dset_utc.h5 to .env\Lib\site-packages\h5py\tests\data_files\vlen_string_dset_utc.h5
Uploaded .\.env\Lib\site-packages\h5py\tests\data_files\vlen_string_s390x.h5 to .env\Lib\site-packages\h5py\tests\data_files\vlen_string_s390x.h5


In [43]:
tf.saved_model.save(model, './model5')
model_load_test = tf.keras.models.load_model('model_5/')
#model_load_test = tf.saved_model.load('model_4')
#model_load_test = keras.saving.load_model('model_4.h5')

INFO:tensorflow:Assets written to: ./model5\assets


INFO:tensorflow:Assets written to: ./model5\assets


ValueError: File format not supported: filepath=model_5/. Keras 3 only supports V3 `.keras` files and legacy H5 format files (`.h5` extension). Note that the legacy SavedModel format is not supported by `load_model()` in Keras 3. In order to reload a TensorFlow SavedModel as an inference-only layer in Keras 3, use `keras.layers.TFSMLayer(model_5/, call_endpoint='serving_default')` (note that your `call_endpoint` might have a different name).

In [39]:
#model_load_test.__dict__
model_load_test.predict(X_test)

AttributeError: '_UserObject' object has no attribute 'predict'