# Sentence Classifier using 1D CNN

In [None]:
import matplotlib.pyplot as plt
import os
import re
import string
import shutil
import tensorflow as tf
from tensorflow.keras import layers, losses, callbacks, Sequential

In [None]:
# Fetch the IMDB review archive and unpack it via the Keras download helper.
# NOTE(review): cache_dir is an empty string here - confirm where the Keras
# helper actually places the files in that case.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='',
    cache_subdir='',
)

# The reviews are looked up in an `aclImdb` folder next to the returned path.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [None]:
# Show the top-level entries of the dataset folder.
os.listdir(dataset_dir)

In [None]:
# Path of the `train` folder inside the dataset directory, then list its
# contents.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

In [None]:
# Delete the `unsup` subfolder of the training directory. The existence check
# makes this cell safe to re-run: once the folder is gone, an unconditional
# rmtree would raise FileNotFoundError.
unsup_dir = os.path.join(train_dir, "unsup")
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

In [None]:
batch_size = 32
seed = 42

# The training and validation subsets are carved out of the same folder with
# identical split settings; only `subset` differs between the two calls.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='training', **split_options
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='validation', **split_options
)

# The test folder is loaded whole, without any split.
test_dir = os.path.join(dataset_dir, 'test')
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir, batch_size=batch_size
)


In [None]:
# Print only the first batch of the raw test dataset.
for first_batch in raw_test_ds.take(1):
    print(first_batch)

In [None]:
def custom_standardization(input_data):
    """Lower-case the text, drop HTML line breaks and strip punctuation.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor with the text lower-cased, every ``<br>`` style tag
        replaced by a single space and all ``string.punctuation`` characters
        removed.
    """
    l_case = tf.strings.lower(input_data)
    # Match `<br>`, `<br/>` and `<br />`. The previous literal '<br/>' missed
    # the spaced form, so after the punctuation strip below a stray "br"
    # token was left behind in the text.
    stp_html = tf.strings.regex_replace(l_case, r'<br\s*/?>', ' ')
    # Punctuation is deleted (not replaced by a space), after the tags are
    # gone so that '<', '/' and '>' do not break the tag pattern above.
    return tf.strings.regex_replace(
        stp_html,
        f'[{re.escape(string.punctuation)}]',
        ''
    )

def vectorize_text(text, label):
    """Turn a ``(text, label)`` pair into ``(vectorized text, label)``.

    A trailing axis is added to ``text`` before it is passed through the
    module-level ``vectorize_layer``; ``label`` is returned unchanged.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(expanded)
    return token_ids, label

In [None]:
max_features = 10000
seq_len = 250

# Text-to-integer layer: uses the custom standardization above, caps the
# vocabulary at `max_features` tokens and emits sequences of length `seq_len`.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=seq_len,
    output_mode='int',
)

In [None]:
# Adapt the vectorization layer on the training text only; the labels are
# dropped from each (text, label) element first.
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)


In [None]:
AUTOTUNE = tf.data.AUTOTUNE


def _vectorize_and_cache(raw_ds):
    """Map ``vectorize_text`` over ``raw_ds``, then cache and prefetch it."""
    return raw_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


train_ds = _vectorize_and_cache(raw_train_ds)
val_ds = _vectorize_and_cache(raw_val_ds)
test_ds = _vectorize_and_cache(raw_test_ds)


In [None]:
emb_dims = 128


def accuracy(y_true, y_pred):
    """Binary accuracy for a model whose output is a raw logit.

    The string metric 'accuracy' compares the model output against a 0.5
    threshold, which is only right for probabilities. The final Dense(1)
    layer below has no activation, so the output is squashed with a sigmoid
    first; 0.5 on the probability corresponds to 0 on the logit.

    The function is deliberately named ``accuracy`` so the metric keeps the
    same name in the training logs.
    """
    probabilities = tf.sigmoid(y_pred)
    labels = tf.cast(y_true, probabilities.dtype)
    return tf.keras.metrics.binary_accuracy(labels, probabilities)


# Embedding -> 1D convolution (16 filters, width 5) -> average over the
# sequence -> dropout -> small dense head -> single logit.
model = Sequential([
    layers.Embedding(max_features+1, emb_dims),
    layers.Conv1D(16, 5, activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),  # no activation: the loss below expects logits
])

model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=[accuracy]
)

model.summary()

In [None]:
# Log training to TensorBoard under logs/1dcnn.
tensorboard_cb = callbacks.TensorBoard(log_dir="logs/1dcnn")

# Train for 10 epochs, validating on the held-out split after each one.
model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds,
    callbacks=[tensorboard_cb],
)

In [None]:
# Evaluate the trained model on the vectorized test dataset.
model.evaluate(test_ds)

In [None]:
# End-to-end model that takes raw strings: vectorization, then the trained
# network, then a sigmoid applied to its logit output.
export_stages = [
    vectorize_layer,
    model,
    layers.Activation('sigmoid'),
]
export_model = tf.keras.Sequential(export_stages)

# The sigmoid output is compared against the labels, hence from_logits=False.
export_model.compile(
    optimizer="adam",
    loss=losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

# Evaluate directly on the raw (un-vectorized) test dataset.
export_model.evaluate(raw_test_ds)

In [None]:
# Score a few hand-written reviews with the end-to-end model.
reviews = tf.constant([
    'The movie is very boring',
    'A Good Movie',
    'very bad worst movie',
    'Worst movie, boring',
])
print("## Inference")
res = export_model(reviews)

# One line per review: the text left-aligned in 30 columns, then the score
# with three decimals.
for raw_text, score in zip(reviews.numpy(), res.numpy()):
    print(f"{raw_text.decode():<30}:{score.squeeze():>.3f}")

In [None]:
# Save the end-to-end model to disk.
# NOTE(review): the file name spells "classificatoin"; left unchanged because
# other code may load the model by this exact name - confirm before renaming.
export_model.save('sentence_classificatoin_model.keras')