<a href="https://colab.research.google.com/github/WAQAR-AK/Daily-Coach/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import os
import re
import tarfile
import json
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np

# ----------------------------
# Config
# ----------------------------
ARCHIVE_PATH = "/content/domain_sentiment_data.tar.gz"
EXTRACT_DIR  = "/content/_extracted_reviews"
OUTPUT_CSV   = "/content/cleaned_reviews.csv"
OUTPUT_NPZ   = "/content/padded_sequences.npz"
OUTPUT_VOCAB = "/content/tokenizer_vocab.json"

RANDOM_SEED = 42
MAX_WORDS   = 20000          # top words to keep in tokenizer
OOV_TOKEN   = "<OOV>"
DO_SPELLCHECK = False        # set True to enable spelling correction
OUTLIER_MIN_TOKENS = 5       # remove reviews with fewer tokens
TRIVIAL_SET = {"ok","okay","bad","good","nice","fine","meh","cool","great","super"}


In [30]:
def get_stopwords():
    STOPWORDS_FALLBACK = {
        "a","an","the","and","or","but","if","while","of","at","by","for","with","about","against",
        "between","into","through","during","before","after","to","from","in","out","on","off","over","under",
        "again","further","then","once","here","there","when","where","why","how","all","any","both","each",
        "few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very",
        "can","will","just","should","now","is","am","are","was","were","be","been","being","do","does",
        "did","having","have","has","had","he","she","it","they","them","their","theirs","you","your","yours",
        "i","me","my","mine","we","us","our","ours","this","that","these","those"
    }
    try:
        from nltk.corpus import stopwords
        return set(stopwords.words("english"))
    except Exception:
        return STOPWORDS_FALLBACK

def get_spellchecker():
    if not DO_SPELLCHECK:
        return None
    try:
        from spellchecker import SpellChecker
        return SpellChecker(distance=2)
    except Exception:
        return None

STOPWORDS = get_stopwords()
SPELLER = get_spellchecker()


In [31]:
SPACE_RE = re.compile(r"\s+")
NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    t = text.lower()
    t = NON_ALNUM_RE.sub(" ", t)
    t = SPACE_RE.sub(" ", t).strip()
    tokens = [w for w in t.split() if w not in STOPWORDS]
    if SPELLER is not None and tokens:
        corrected = []
        for w in tokens:
            if len(w) <= 2 or w.isdigit():
                corrected.append(w)
            else:
                corrected.append(SPELLER.correction(w) or w)
        tokens = corrected
    return " ".join(tokens)


In [32]:
def extract_archive(archive_path: str, extract_dir: str):
    Path(extract_dir).mkdir(parents=True, exist_ok=True)
    with tarfile.open(archive_path, "r:gz") as tar:
        safe_members = []
        for m in tar.getmembers():
            if not os.path.isabs(m.name) and ".." not in Path(m.name).parts:
                safe_members.append(m)
        tar.extractall(path=extract_dir, members=safe_members)

def read_text_file(fp: str) -> str:
    try:
        with open(fp, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception:
        with open(fp, "r", encoding="latin-1", errors="ignore") as f:
            return f.read()

def infer_label_from_path(p: Path):
    pstr = str(p).lower()
    if "positive" in pstr or "pos" in pstr:
        return "positive"
    if "negative" in pstr or "neg" in pstr:
        return "negative"
    return None

def infer_domain_from_path(p: Path):
    parts = [x.lower() for x in p.parts]
    candidates = {"books","dvd","electronics","kitchen","housewares"}
    for comp in parts[::-1]:
        if comp in candidates:
            return comp
    return p.parent.name.lower()

def load_dataset(extracted_root: str) -> pd.DataFrame:
    rows = []
    for root, _, files in os.walk(extracted_root):
        for fname in files:
            fp = Path(root) / fname
            low = fname.lower()
            if low.endswith(".csv"):
                try:
                    df = pd.read_csv(fp)
                    text_col = [c for c in df.columns if "text" in c.lower()][0]
                    label_col = [c for c in df.columns if "label" in c.lower()][0]
                    tmp = pd.DataFrame({
                        "text": df[text_col].astype(str),
                        "label": df[label_col],
                        "domain": infer_domain_from_path(fp)
                    })
                    rows.append(tmp)
                except Exception:
                    continue
            elif low.endswith((".txt",".review",".data")):
                label = infer_label_from_path(fp)
                text = read_text_file(fp)
                rows.append(pd.DataFrame({
                    "text":[text],
                    "label":[label],
                    "domain":[infer_domain_from_path(fp)]
                }))
    return pd.concat(rows, ignore_index=True)

# Extract and load
extract_archive(ARCHIVE_PATH, EXTRACT_DIR)
df = load_dataset(EXTRACT_DIR)
print("Raw dataset size:", len(df))


  tar.extractall(path=extract_dir, members=safe_members)


Raw dataset size: 11


In [33]:
df = df[df["text"].notna()].copy()
df = df[df["label"].isin(["positive","negative"])].copy()

df["text_clean"] = df["text"].apply(clean_text)
df["label_enc"] = df["label"].map({"positive":1,"negative":0}).astype("int8")

print("After cleaning:", len(df))
df.head()


After cleaning: 8


Unnamed: 0,text,label,domain,text_clean,label_enc
0,"<review>\n<unique_id>\nB000E33VZE:skip_color,_...",positive,dvd,review unique id b000e33vze skip color go dire...,1
2,<review>\n<unique_id>\nB00064LJVE:one_of_the_w...,negative,dvd,review unique id b00064ljve one worst movies e...,0
3,<review>\n<unique_id>\n0785758968:one_of_the_b...,positive,books,review unique id 0785758968 one best crichton ...,1
4,<review>\n<unique_id>\n0312355645:horrible_boo...,negative,books,review unique id 0312355645 horrible book horr...,0
5,<review>\n<unique_id>\nB00006HYUB:everyone_sho...,positive,electronics,review unique id b00006hyub everyone one d joh...,1


In [34]:
token_counts = df["text_clean"].str.split().apply(len)
trivial_mask = df["text_clean"].isin(TRIVIAL_SET)
keep_mask = (token_counts >= OUTLIER_MIN_TOKENS) & (~trivial_mask)

removed = (~keep_mask).sum()
df = df[keep_mask].reset_index(drop=True)

print(f"Removed {removed} ultra-short/trivial reviews.")
print("Remaining:", len(df))


Removed 0 ultra-short/trivial reviews.
Remaining: 8


In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as keras_pad

class SimpleTokenizer:
    def __init__(self, num_words=MAX_WORDS, oov_token=OOV_TOKEN):
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {}
        self.index_word = {}
    def fit_on_texts(self, texts):
        counts = Counter()
        for t in texts:
            for w in str(t).split():
                counts[w] += 1
        vocab = [w for w,_ in counts.most_common(self.num_words - 2)]
        self.word_index = {self.oov_token:1}
        idx = 2
        for w in vocab:
            if w not in self.word_index:
                self.word_index[w] = idx
                idx += 1
        self.index_word = {i:w for w,i in self.word_index.items()}
    def texts_to_sequences(self, texts):
        wi = self.word_index
        oov = wi.get(self.oov_token, 1)
        return [[wi.get(w, oov) for w in str(t).split()] for t in texts]

def pad_sequences(sequences, maxlen, padding="post", truncating="post", value=0):
    padded = np.full((len(sequences), maxlen), value, dtype=np.int32)
    for i, seq in enumerate(sequences):
        if not seq:
            continue
        if len(seq) <= maxlen:
            if padding == "post":
                padded[i, :len(seq)] = seq
            else:
                padded[i, -len(seq):] = seq
        else:
            if truncating == "post":
                trimmed = seq[:maxlen]
            else:
                trimmed = seq[-maxlen:]
            padded[i, :] = trimmed
    return padded

# TensorFlow Keras
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token=OOV_TOKEN, filters="")
tokenizer.fit_on_texts(df["text_clean"].tolist())
sequences = tokenizer.texts_to_sequences(df["text_clean"].tolist())
lengths = np.array([len(s) for s in sequences if s])
maxlen = int(np.percentile(lengths, 95))
X = keras_pad(sequences, maxlen=maxlen, padding="post", truncating="post")
vocab_to_save = tokenizer.word_index
tokenizer_type = "keras"

y = df["label_enc"].values.astype(np.int8)
print("Final padded shape:", X.shape)


Final padded shape: (8, 194098)


In [36]:
cols = ["domain","label","label_enc","text","text_clean"]
df[cols].to_csv(OUTPUT_CSV, index=False)

np.savez_compressed(OUTPUT_NPZ, X=X, y=y, maxlen=maxlen)

with open(OUTPUT_VOCAB, "w", encoding="utf-8") as f:
    json.dump({
        "tokenizer_type": tokenizer_type,
        "oov_token": OOV_TOKEN,
        "max_words": MAX_WORDS,
        "maxlen": int(maxlen),
        "vocab": vocab_to_save
    }, f)

print("✅ Saved:")
print("CSV:", OUTPUT_CSV)
print("NPZ:", OUTPUT_NPZ)
print("Vocab JSON:", OUTPUT_VOCAB)


✅ Saved:
CSV: /content/cleaned_reviews.csv
NPZ: /content/padded_sequences.npz
Vocab JSON: /content/tokenizer_vocab.json


In [37]:
import json, numpy as np, tensorflow as tf, random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras import layers, models, callbacks, optimizers, losses, metrics

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)


In [38]:
npz = np.load("/content/padded_sequences.npz")
X, y, maxlen = npz["X"], npz["y"], int(npz["maxlen"])
with open("/content/tokenizer_vocab.json","r",encoding="utf-8") as f:
    vocab_meta = json.load(f)

vocab_size = int(X.max()) + 1  # ensures Embedding can index all tokens present
print(X.shape, y.shape, "maxlen:", maxlen, "vocab_size:", vocab_size)


(8, 194098) (8,) maxlen: 194098 vocab_size: 20000


In [39]:
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np

# 80% train, 20% holdout (always stratified)
X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_SEED
)

# Only stratify the second split if each class has at least 2 samples in the holdout
can_stratify_second = np.bincount(y_hold).min() >= 2
X_val, X_test, y_val, y_test = train_test_split(
    X_hold, y_hold, test_size=0.50,
    stratify=y_hold if can_stratify_second else None,
    random_state=RANDOM_SEED
)

print("Second split stratified:", can_stratify_second)
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Class counts:", {"train": Counter(y_train), "val": Counter(y_val), "test": Counter(y_test)})


Second split stratified: False
Shapes: (6, 194098) (1, 194098) (1, 194098)
Class counts: {'train': Counter({np.int8(1): 3, np.int8(0): 3}), 'val': Counter({np.int8(0): 1}), 'test': Counter({np.int8(1): 1})}


In [40]:
from sklearn.utils.class_weight import compute_class_weight
cw = compute_class_weight(class_weight="balanced", classes=np.array([0,1]), y=y_train)
class_weight = {0: float(cw[0]), 1: float(cw[1])}
class_weight


{0: 1.0, 1: 1.0}

In [41]:
embedding_dim = 128
model = models.Sequential([
    layers.Input(shape=(maxlen,)),
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid")
])
model.summary()


In [42]:
model.compile(
    optimizer=optimizers.Adam(1e-3),
    loss=losses.BinaryCrossentropy(),
    metrics=[metrics.BinaryAccuracy(name="accuracy"), metrics.AUC(name="auc")]
)


In [43]:
es = callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
ckpt = callbacks.ModelCheckpoint("/content/best_model.keras", monitor="val_loss", save_best_only=True)


In [44]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[es, ckpt],
    class_weight=class_weight  # remove this arg if you skipped step 4
)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.5000 - auc: 0.5000 - loss: 0.6937 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7231
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.5000 - auc: 0.6111 - loss: 0.6915 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7776
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.6667 - auc: 0.6667 - loss: 0.6863 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.8282


In [45]:
test_metrics = model.evaluate(X_test, y_test, verbose=0)
print(dict(zip(model.metrics_names, test_metrics)))
y_prob = model.predict(X_test, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))


{'loss': 0.6635575890541077, 'compile_metrics': 1.0}




              precision    recall  f1-score   support

           1     1.0000    1.0000    1.0000         1

    accuracy                         1.0000         1
   macro avg     1.0000    1.0000    1.0000         1
weighted avg     1.0000    1.0000    1.0000         1

[[1]]




In [46]:
model.save("/content/sentiment_model_final.keras")
print("Saved:", "/content/best_model.keras", "and", "/content/sentiment_model_final.keras")


Saved: /content/best_model.keras and /content/sentiment_model_final.keras


In [47]:
import tensorflow as tf
from tensorflow.data import AUTOTUNE

# If RANDOM_SEED wasn't defined in your session:
try:
    RANDOM_SEED
except NameError:
    RANDOM_SEED = 42

BATCH_SIZE = 64

def make_ds(X, y, training=False):
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    if training:
        ds = ds.shuffle(buffer_size=min(len(X), 10_000), seed=RANDOM_SEED, reshuffle_each_iteration=True)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_ds = make_ds(X_train, y_train, training=True)
val_ds   = make_ds(X_val,   y_val,   training=False)
test_ds  = make_ds(X_test,  y_test,  training=False)

for xb, yb in train_ds.take(1):
    print("One training batch:", xb.shape, yb.shape)


One training batch: (6, 194098) (6,)


In [48]:
import tensorflow as tf
from tensorflow.keras import layers

# ensure seeds exist
try:
    RANDOM_SEED
except NameError:
    RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)

# make sure these exist (from earlier steps)
try:
    vocab_size
except NameError:
    import numpy as np
    vocab_size = int(np.max(X)) + 1
try:
    maxlen
except NameError:
    maxlen = X.shape[1]

EMBED_DIM  = 128
LSTM_UNITS = 64
DROPOUT    = 0.30

class SentimentModel(tf.keras.Model):
    def __init__(self, vocab_size, embed_dim, lstm_units, dropout):
        super().__init__()
        self.embedding = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            mask_zero=True,          # let LSTM ignore padding (0s)
            name="embedding"
        )
        self.encoder = layers.Bidirectional(
            layers.LSTM(lstm_units, return_sequences=False, dropout=0.20),
            name="bi_lstm"
        )
        self.hidden   = layers.Dense(64, activation="relu", name="dense_relu")
        self.dropout  = layers.Dropout(dropout, name="dropout")
        self.out      = layers.Dense(1, activation="sigmoid", name="out")

    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.encoder(x, training=training)
        x = self.hidden(x)
        x = self.dropout(x, training=training)
        return self.out(x)

# Instantiate the network
model = SentimentModel(vocab_size=vocab_size, embed_dim=EMBED_DIM, lstm_units=LSTM_UNITS, dropout=DROPOUT)
model.build(input_shape=(None, maxlen))
model.summary()




In [49]:
import tensorflow as tf
from tensorflow.keras import layers

# (Re)build lightweight, cached datasets — faster per epoch
try:
    RANDOM_SEED
except NameError:
    RANDOM_SEED = 42

BATCH_SIZE = 128

def make_ds_fast(X, y, training=False):
    ds = tf.data.Dataset.from_tensor_slices((X, y))
    if training:
        ds = ds.shuffle(buffer_size=min(len(X), 10_000), seed=RANDOM_SEED, reshuffle_each_iteration=True)
    return ds.batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)

train_ds = make_ds_fast(X_train, y_train, training=True)
val_ds   = make_ds_fast(X_val,   y_val)
test_ds  = make_ds_fast(X_test,  y_test)

# Small, fast model (bag-of-embeddings)
embedding_dim = 64
model_fast = tf.keras.Sequential([
    layers.Input(shape=(maxlen,)),
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(1, activation="sigmoid")
])

model_fast.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy"), tf.keras.metrics.AUC(name="auc")]
)

es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=1, restore_best_weights=True)

history = model_fast.fit(train_ds, validation_data=val_ds, epochs=3, callbacks=[es], verbose=1)

# Evaluate quickly
test_metrics = model_fast.evaluate(test_ds, verbose=0)
print("Test metrics:", dict(zip(model_fast.metrics_names, test_metrics)))

# Use this as the current model for the next steps
model = model_fast


Epoch 1/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5000 - auc: 0.6667 - loss: 0.6925



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.5000 - auc: 0.6667 - loss: 0.6925 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7247
Epoch 2/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 931ms/step - accuracy: 0.3333 - auc: 0.3333 - loss: 0.7067 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7036
Epoch 3/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 771ms/step - accuracy: 0.6667 - auc: 0.5556 - loss: 0.6910 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.6818
Test metrics: {'loss': 0.7027067542076111, 'compile_metrics': 0.0}


In [57]:
# === Gradio inference cell (uses in-memory model & the SAME cleaner as training) ===
import sys, subprocess, json, tensorflow as tf, numpy as np
try:
    import gradio as gr
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "gradio"])
    import gradio as gr

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as keras_pad

# --- Load tokenizer metadata saved during preprocessing ---
with open("/content/tokenizer_vocab.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

vocab     = meta["vocab"]                 # word -> index mapping
oov_token = meta.get("oov_token", "<OOV>")
max_words = int(meta.get("max_words", 20000))
maxlen    = int(meta.get("maxlen", 100))

# --- Reconstruct tokenizer exactly like training ---
tok = Tokenizer(num_words=max_words, oov_token=oov_token, filters="")
tok.word_index = vocab
tok.index_word = {i: w for w, i in vocab.items()}  # reverse mapping
tok.num_words  = max_words                          # IMPORTANT: cap vocab like training

# --- Use the model currently in memory (freshly trained) ---
infer_model = model
print("Using in-memory model for inference.")

# NOTE: this cell expects that `clean_text` (your training cleaner) is already defined.
# If you get NameError: clean_text, re-run the cell where you defined it.

def predict_sentiment(user_text, threshold=0.5):
    cleaned = clean_text(user_text)  # reuse the EXACT same cleaner from training
    seq = tok.texts_to_sequences([cleaned])
    X_in = keras_pad(seq, maxlen=maxlen, padding="post", truncating="post")
    prob = float(infer_model.predict(X_in, verbose=0).ravel()[0])
    label = "positive" if prob >= threshold else "negative"
    return label, prob, cleaned

demo = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=3, placeholder="Type a product review..."),
    outputs=[
        gr.Textbox(label="Predicted label"),
        gr.Number(label="Score (probability of positive)"),
        gr.Textbox(label="Cleaned input used for model")
    ],
    title="Sentiment Demo",
    description="Same cleaner & tokenizer as training. Negations preserved; predictions should vary with input."
)

demo.launch(share=True)


Using in-memory model for inference.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://932a650dd9a6c7c501.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [51]:
import tensorflow as tf

model.compile(
    optimizer=tf.keras.optimizers.Adam(5e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy"), tf.keras.metrics.AUC(name="auc")]
)
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
history = model.fit(train_ds, validation_data=val_ds, epochs=6, callbacks=[es], verbose=1)
print("Re-trained. Now evaluating:")
print(dict(zip(model.metrics_names, model.evaluate(test_ds, verbose=0))))


Epoch 1/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.6667 - auc: 0.8889 - loss: 0.6785 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7015
Epoch 2/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 807ms/step - accuracy: 0.5000 - auc: 0.3333 - loss: 0.7042 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7065
Epoch 3/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 758ms/step - accuracy: 0.6667 - auc: 1.0000 - loss: 0.6700 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7056
Re-trained. Now evaluating:
{'loss': 0.6828196048736572, 'compile_metrics': 1.0}


In [52]:
import tensorflow as tf
from tensorflow.keras import layers

cnn = tf.keras.Sequential([
    layers.Input(shape=(maxlen,)),
    layers.Embedding(vocab_size, 128),
    layers.Conv1D(128, 5, padding="same", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid"),
])

cnn.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
            loss="binary_crossentropy",
            metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy"), tf.keras.metrics.AUC(name="auc")])
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
hist = cnn.fit(train_ds, validation_data=val_ds, epochs=6, callbacks=[es], verbose=1)

print("CNN test metrics:", dict(zip(cnn.metrics_names, cnn.evaluate(test_ds, verbose=0))))
model = cnn  # use CNN for subsequent inference if it wins


Epoch 1/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22s/step - accuracy: 0.3333 - auc: 0.2222 - loss: 0.7147 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7386
Epoch 2/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step - accuracy: 0.6667 - auc: 0.9444 - loss: 0.6322 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7281
Epoch 3/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16s/step - accuracy: 0.8333 - auc: 1.0000 - loss: 0.6257 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.7141
Epoch 4/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21s/step - accuracy: 1.0000 - auc: 1.0000 - loss: 0.5615 - val_accuracy: 0.0000e+00 - val_auc: 0.0000e+00 - val_loss: 0.6937
Epoch 5/6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20s/step - accuracy: 1.0000 - auc: 1.0000 - loss: 0.5643 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.6719
Epoch 6/6
[1m