In [3]:
import pandas as pd
import numpy as np
import re

import tensorflow as tf
import autokeras as ak
from sklearn.metrics import accuracy_score, f1_score, classification_report


# --- read TSV
train_df = pd.read_csv("new_data_train_Yelp_Fake_Review.csv", sep="\t", engine="python")
test_df  = pd.read_csv("new_data_test_Yelp_Fake_Review.csv",  sep="\t", engine="python")

train_df = train_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)
test_df  = test_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)

# --- (light) text cleaning
_url_re = re.compile(r"http\S+|www\.\S+")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    s = str(s)
    s = s.replace("\t", " ").replace("\n", " ").replace("\r", " ")
    s = _url_re.sub(" URL ", s)
    s = _space_re.sub(" ", s).strip()
    return s

x_train = train_df["reviewContent"].astype(str).map(clean_text).to_numpy()
y_train = train_df["flagged"].astype(int).to_numpy()

x_test  = test_df["reviewContent"].astype(str).map(clean_text).to_numpy()
y_test  = test_df["flagged"].astype(int).to_numpy()

print("TF GPUs:", tf.config.list_physical_devices("GPU"))
print("Train labels:", np.unique(y_train, return_counts=True))


# --- AutoKeras Text AutoML
clf = ak.TextClassifier(
    max_trials=10,        # increase (e.g., 30-50) for better results
    overwrite=True
)

clf.fit(
    x_train, y_train,
    validation_split=0.2,
    epochs=5              # increase (e.g., 10-20) for better results
)

# --- Evaluate
proba = clf.predict(x_test).reshape(-1)
y_pred = (proba >= 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

# --- Save
clf.export_model().save("autokeras_yelp_text_model", save_format="tf")
print("Saved: autokeras_yelp_text_model")


TF GPUs: []
Train labels: (array([0, 1]), array([4993, 4933]))


2025-12-18 13:04:31.919945: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-12-18 13:04:31.921520: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


TypeError: Expect the data to TextInput to be strings, but got object.

In [None]:
# =========================================
# 0) Imports
# =========================================
import pandas as pd
import numpy as np
import re

import tensorflow as tf
import autokeras as ak

from sklearn.metrics import accuracy_score, f1_score, classification_report


# =========================================
# 1) Read TRAIN / TEST (TSV)
# =========================================
train_df = pd.read_csv("new_data_train_Yelp_Fake_Review.csv", sep="\t", engine="python")
test_df  = pd.read_csv("new_data_test_Yelp_Fake_Review.csv",  sep="\t", engine="python")

train_df = train_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)
test_df  = test_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)

train_df["reviewContent"] = train_df["reviewContent"].astype(str)
test_df["reviewContent"]  = test_df["reviewContent"].astype(str)

train_df["flagged"] = train_df["flagged"].astype(int)
test_df["flagged"]  = test_df["flagged"].astype(int)

print("Train labels:", np.unique(train_df["flagged"], return_counts=True))
print("Test labels :", np.unique(test_df["flagged"],  return_counts=True))


# =========================================
# 2) Text cleaning
# =========================================
_url_re = re.compile(r"http\S+|www\.\S+")
_html_re = re.compile(r"<.*?>")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    s = str(s)
    s = s.replace("\t", " ").replace("\n", " ").replace("\r", " ")
    s = _html_re.sub(" ", s)
    s = _url_re.sub(" URL ", s)
    s = _space_re.sub(" ", s).strip()
    return s

# IMPORTANT: AutoKeras TextClassifier wants 1D array/list of strings
x_train = train_df["reviewContent"].map(clean_text).astype(str).to_numpy(dtype=str)   # shape (N,)
y_train = train_df["flagged"].to_numpy()

x_test  = test_df["reviewContent"].map(clean_text).astype(str).to_numpy(dtype=str)    # shape (M,)
y_test  = test_df["flagged"].to_numpy()

print("x_train shape:", x_train.shape, "dtype:", x_train.dtype)
print("GPU devices:", tf.config.list_physical_devices("GPU"))


# =========================================
# 3) AutoKeras Text AutoML
# =========================================
clf = ak.TextClassifier(
    max_trials=30,     # increase for better results (30-50)
    overwrite=True
)

clf.fit(
    x_train,
    y_train,
    validation_split=0.3,
    epochs=5
)


# =========================================
# 4) Evaluation
# =========================================
# For binary classification, predict() often returns probabilities (N,1)
proba = clf.predict(x_test)
proba = np.array(proba).reshape(-1)

y_pred = (proba >= 0.5).astype(int)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=4))


# =========================================
# 5) Save trained model
# =========================================
model = clf.export_model()
model.save("autokeras_yelp_text_model.keras")
print(" Saved: autokeras_yelp_text_model.keras")



Trial 28 Complete [00h 00m 10s]
val_loss: 0.6728435754776001

Best val_loss So Far: 0.67254239320755
Total elapsed time: 00h 07m 29s

Search: Running Trial #29

Value             |Best Value So Far |Hyperparameter
500               |500               |text_block_1/max_tokens
32                |32                |text_block_1/embedding_1/embedding_dim
0                 |0                 |text_block_1/embedding_1/dropout
3                 |3                 |text_block_1/conv_block_1/kernel_size
False             |False             |text_block_1/conv_block_1/separable
False             |False             |text_block_1/conv_block_1/max_pooling
1                 |1                 |text_block_1/conv_block_1/num_blocks
2                 |2                 |text_block_1/conv_block_1/num_layers
512               |64                |text_block_1/conv_block_1/filters_0_0
512               |512               |text_block_1/conv_block_1/filters_0_1
0.25              |0.25              |text_block

In [6]:
model.save("autokeras_yelp_text_model.keras")
print(" Saved: autokeras_yelp_text_model.keras")


✅ Saved: autokeras_yelp_text_model.keras


In [9]:
# =========================================
# 0) Imports
# =========================================
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import autokeras as ak

from sklearn.metrics import accuracy_score, f1_score, classification_report


# =========================================
# 1) Read TRAIN / TEST (TSV)
# =========================================
train_df = pd.read_csv(
    "new_data_train_Yelp_Fake_Review.csv",
    sep="\t",
    engine="python"
)

test_df = pd.read_csv(
    "new_data_test_Yelp_Fake_Review.csv",
    sep="\t",
    engine="python"
)

train_df = train_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)
test_df  = test_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)

train_df["reviewContent"] = train_df["reviewContent"].astype(str)
test_df["reviewContent"]  = test_df["reviewContent"].astype(str)

train_df["flagged"] = train_df["flagged"].astype(int)
test_df["flagged"]  = test_df["flagged"].astype(int)

print("Train labels:", np.unique(train_df["flagged"], return_counts=True))
print("Test labels :", np.unique(test_df["flagged"], return_counts=True))
print("GPU devices:", tf.config.list_physical_devices("GPU"))


# =========================================
# 2) Text cleaning
# =========================================
_url_re = re.compile(r"http\S+|www\.\S+")
_html_re = re.compile(r"<.*?>")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    s = str(s)
    s = s.replace("\t", " ").replace("\n", " ").replace("\r", " ")
    s = _html_re.sub(" ", s)
    s = _url_re.sub(" URL ", s)
    s = _space_re.sub(" ", s).strip()
    return s

train_df["review_clean"] = train_df["reviewContent"].map(clean_text)
test_df["review_clean"]  = test_df["reviewContent"].map(clean_text)


# =========================================
# 3) Structured feature engineering
# =========================================
def add_text_features(df: pd.DataFrame, col: str) -> pd.DataFrame:
    text = df[col].fillna("").astype(str)

    df["char_len"] = text.str.len()
    df["word_count"] = text.str.split().map(len)
    df["exclaim_count"] = text.str.count("!")
    df["question_count"] = text.str.count(r"\?")
    df["upper_count"] = text.str.count(r"[A-Z]")
    df["digit_count"] = text.str.count(r"\d")

    df["upper_ratio"] = df["upper_count"] / (df["char_len"] + 1)
    df["digit_ratio"] = df["digit_count"] / (df["char_len"] + 1)

    df["multi_exclaim"] = text.str.contains(r"!!+").astype(int)
    df["multi_question"] = text.str.contains(r"\?\?+").astype(int)

    return df

train_df = add_text_features(train_df, "review_clean")
test_df  = add_text_features(test_df,  "review_clean")

feature_cols = [
    "char_len", "word_count",
    "exclaim_count", "question_count",
    "upper_ratio", "digit_ratio",
    "multi_exclaim", "multi_question"
]

for c in feature_cols:
    train_df[c] = pd.to_numeric(train_df[c], errors="coerce").fillna(0)
    test_df[c]  = pd.to_numeric(test_df[c], errors="coerce").fillna(0)


# =========================================
# 4) Prepare inputs (CRITICAL SECTION)
# =========================================
# Text MUST be (N, 1)
x_train_text = (
    train_df["review_clean"]
    .astype(str)
    .to_numpy(dtype=str)
    .reshape(-1, 1)
)

x_test_text = (
    test_df["review_clean"]
    .astype(str)
    .to_numpy(dtype=str)
    .reshape(-1, 1)
)

# Structured MUST be NumPy arrays
x_train_struct = train_df[feature_cols].astype(np.float32).to_numpy()
x_test_struct  = test_df[feature_cols].astype(np.float32).to_numpy()

y_train = train_df["flagged"].to_numpy()
y_test  = test_df["flagged"].to_numpy()

print("Text input shape:", x_train_text.shape)
print("Struct input shape:", x_train_struct.shape)


# =========================================
# 5) Build MULTIMODAL AutoKeras model
# =========================================
text_in = ak.TextInput()
struct_in = ak.StructuredDataInput()

text_feat = ak.TextBlock()(text_in)
struct_feat = ak.StructuredDataBlock()(struct_in)

merged = ak.Merge()([text_feat, struct_feat])
out = ak.ClassificationHead(num_classes=2)(merged)

automodel = ak.AutoModel(
    inputs=[text_in, struct_in],
    outputs=out,
    max_trials=10,        # increase to 30–50 later
    overwrite=True
)


# =========================================
# 6) Train (LIST INPUT — IMPORTANT)
# =========================================
automodel.fit(
    x=[x_train_text, x_train_struct],
    y=y_train,
    validation_split=0.2,
    epochs=5              # increase to 10–20 later
)


# =========================================
# 7) Predict + Evaluate
# =========================================
pred = automodel.predict([x_test_text, x_test_struct])
pred = np.array(pred)

if pred.ndim == 2 and pred.shape[1] == 2:
    y_pred = np.argmax(pred, axis=1)
else:
    y_pred = (pred.reshape(-1) >= 0.5).astype(int)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=4))


# =========================================
# 8) Save best model (Keras 3 compatible)
# =========================================
best_model = automodel.export_model()
best_model.save("autokeras_yelp_multimodal_model.keras")

print("\n✅ Saved: autokeras_yelp_multimodal_model.keras")


Train labels: (array([0, 1]), array([4993, 4933]))
Test labels : (array([0, 1]), array([1212, 1271]))
GPU devices: []
Text input shape: (9926, 1)
Struct input shape: (9926, 8)


AttributeError: 'numpy.ndarray' object has no attribute 'split'

In [10]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import autokeras as ak

from sklearn.metrics import accuracy_score, f1_score, classification_report

# -----------------------------
# 1) Read TRAIN / TEST (TSV)
# -----------------------------
train_df = pd.read_csv("new_data_train_Yelp_Fake_Review.csv", sep="\t", engine="python")
test_df  = pd.read_csv("new_data_test_Yelp_Fake_Review.csv",  sep="\t", engine="python")

train_df = train_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)
test_df  = test_df[["reviewContent", "flagged"]].dropna().reset_index(drop=True)

train_df["reviewContent"] = train_df["reviewContent"].astype(str)
test_df["reviewContent"]  = test_df["reviewContent"].astype(str)
train_df["flagged"] = train_df["flagged"].astype(int)
test_df["flagged"]  = test_df["flagged"].astype(int)

print("Train labels:", np.unique(train_df["flagged"], return_counts=True))
print("Test labels :", np.unique(test_df["flagged"], return_counts=True))
print("GPU devices:", tf.config.list_physical_devices("GPU"))

# -----------------------------
# 2) Text cleaning (light)
# -----------------------------
_url_re = re.compile(r"http\S+|www\.\S+")
_html_re = re.compile(r"<.*?>")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    s = str(s)
    s = s.replace("\t", " ").replace("\n", " ").replace("\r", " ")
    s = _html_re.sub(" ", s)
    s = _url_re.sub(" URL ", s)
    s = _space_re.sub(" ", s).strip()
    return s

train_df["review_clean"] = train_df["reviewContent"].map(clean_text)
test_df["review_clean"]  = test_df["reviewContent"].map(clean_text)

# -----------------------------
# 3) Structured feature engineering
# -----------------------------
def add_text_features(df: pd.DataFrame, col: str) -> pd.DataFrame:
    text = df[col].fillna("").astype(str)

    df["char_len"] = text.str.len()
    df["word_count"] = text.str.split().map(len)

    df["exclaim_count"] = text.str.count("!")
    df["question_count"] = text.str.count(r"\?")
    df["upper_count"] = text.str.count(r"[A-Z]")
    df["digit_count"] = text.str.count(r"\d")

    df["upper_ratio"] = df["upper_count"] / (df["char_len"] + 1)
    df["digit_ratio"] = df["digit_count"] / (df["char_len"] + 1)

    df["multi_exclaim"] = text.str.contains(r"!!+").astype(int)
    df["multi_question"] = text.str.contains(r"\?\?+").astype(int)

    return df

train_df = add_text_features(train_df, "review_clean")
test_df  = add_text_features(test_df,  "review_clean")

feature_cols = [
    "char_len", "word_count", "exclaim_count", "question_count",
    "upper_ratio", "digit_ratio", "multi_exclaim", "multi_question"
]

for c in feature_cols:
    train_df[c] = pd.to_numeric(train_df[c], errors="coerce").fillna(0)
    test_df[c]  = pd.to_numeric(test_df[c], errors="coerce").fillna(0)

# -----------------------------
# 4) Prepare inputs for AutoKeras (FIXED)
#   - Text MUST be 1D array of strings (N,)
#   - Structured MUST be numpy float array (N, 8)
# -----------------------------
x_train_text = train_df["review_clean"].astype(str).to_numpy(dtype=str)     # (N,)
x_test_text  = test_df["review_clean"].astype(str).to_numpy(dtype=str)      # (M,)

x_train_struct = train_df[feature_cols].astype(np.float32).to_numpy()       # (N, 8)
x_test_struct  = test_df[feature_cols].astype(np.float32).to_numpy()        # (M, 8)

y_train = train_df["flagged"].to_numpy()
y_test  = test_df["flagged"].to_numpy()

print("Text shape:", x_train_text.shape, "dtype:", x_train_text.dtype)
print("Struct shape:", x_train_struct.shape, "dtype:", x_train_struct.dtype)

# -----------------------------
# 5) Build Multi-Input AutoKeras model (FIXED)
#   - Use scalar text input: shape=()
#   - Use named inputs and feed a dict
# -----------------------------
text_in = ak.TextInput(name="text", shape=())
struct_in = ak.StructuredDataInput(name="struct")

text_feat = ak.TextBlock()(text_in)
struct_feat = ak.StructuredDataBlock()(struct_in)

merged = ak.Merge()([text_feat, struct_feat])
out = ak.ClassificationHead(num_classes=2)(merged)

automodel = ak.AutoModel(
    inputs=[text_in, struct_in],
    outputs=out,
    max_trials=10,      # increase later (30-50)
    overwrite=True
)

# -----------------------------
# 6) Train (DICT with named inputs)
# -----------------------------
automodel.fit(
    x={"text": x_train_text, "struct": x_train_struct},
    y=y_train,
    validation_split=0.2,
    epochs=5
)

# -----------------------------
# 7) Predict + Evaluate
# -----------------------------
pred = automodel.predict({"text": x_test_text, "struct": x_test_struct})
pred = np.array(pred)

if pred.ndim == 2 and pred.shape[1] == 2:
    y_pred = np.argmax(pred, axis=1)
else:
    y_pred = (pred.reshape(-1) >= 0.5).astype(int)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))

# -----------------------------
# 8) Save best model (Keras 3)
# -----------------------------
best_model = automodel.export_model()
best_model.save("autokeras_yelp_multimodal_model.keras")
print("\n✅ Saved: autokeras_yelp_multimodal_model.keras")


Train labels: (array([0, 1]), array([4993, 4933]))
Test labels : (array([0, 1]), array([1212, 1271]))
GPU devices: []
Text shape: (9926,) dtype: <U3676
Struct shape: (9926, 8) dtype: float32


ValueError: Expect the data to TextInput to have shape (batch_size, 1), but got input shape [9926, 8].