In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, f1_score, classification_report

import h2o
from h2o.automl import H2OAutoML


# -----------------------------
# 1) Read TRAIN / TEST (TSV)
# -----------------------------
# Load each split, keep only the review text and its label, drop incomplete
# rows, and pin the dtypes (text -> str, label -> int).
train_df = (
    pd.read_csv("new_data_train_Yelp_Fake_Review.csv", sep="\t", engine="python")
    .loc[:, ["reviewContent", "flagged"]]
    .dropna()
    .reset_index(drop=True)
    .astype({"reviewContent": str, "flagged": int})
)
test_df = (
    pd.read_csv("new_data_test_Yelp_Fake_Review.csv", sep="\t", engine="python")
    .loc[:, ["reviewContent", "flagged"]]
    .dropna()
    .reset_index(drop=True)
    .astype({"reviewContent": str, "flagged": int})
)

# Class balance of each split as (labels, counts).
for caption, frame in (("Train labels:", train_df), ("Test labels :", test_df)):
    print(caption, np.unique(frame["flagged"], return_counts=True))


# -----------------------------
# 2) Light text cleaning
# -----------------------------
# Links: anything starting with "http" or "www." up to the next whitespace.
_url_re = re.compile(r"http\S+|www\.\S+")
# Tag-shaped markup only (<br/>, </p>, <a href="...">). The looser pattern
# "<.*?>" also deleted ordinary text lying between a stray "<" and a later
# ">", e.g. "I <3 this place, rating > 4".
_html_re = re.compile(r"</?[A-Za-z][^<>]*>")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """Normalise one review for vectorisation.

    Steps, in order: coerce to ``str``, replace HTML tags with a space,
    replace links with the token ``URL``, collapse every whitespace run
    (tabs and newlines included) into one space, and trim both ends.

    Args:
        s: Raw review text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, single-line text.
    """
    s = str(s)
    s = _html_re.sub(" ", s)
    s = _url_re.sub(" URL ", s)
    # "\s+" already covers \t, \n and \r, so no separate replacement is needed.
    return _space_re.sub(" ", s).strip()

# Store the cleaned review text in a new "text" column on both splits.
for frame in (train_df, test_df):
    frame["text"] = frame["reviewContent"].map(clean_text)


# -----------------------------
# 3) Structured feature engineering
# -----------------------------
def add_text_features(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` extended with surface statistics of ``df[col]``.

    Missing values in ``col`` are treated as empty strings. The input frame
    is not modified, so calling this repeatedly on the same frame always
    gives the same result.

    Added columns:
        char_len, word_count: length in characters / whitespace-separated tokens.
        exclaim_count, question_count: number of "!" / "?" characters.
        upper_count, digit_count: number of ASCII upper-case letters / digits.
        upper_ratio, digit_ratio: the two counts divided by ``char_len + 1``.
        multi_exclaim, multi_question: 1 if the text contains "!!" / "??", else 0.

    Args:
        df: Frame holding the text column.
        col: Name of the text column to analyse.

    Returns:
        A new DataFrame with the original columns followed by the features.
    """
    text = df[col].fillna("").astype(str)

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()

    out["char_len"] = text.str.len()
    out["word_count"] = text.str.split().map(len)

    out["exclaim_count"] = text.str.count("!")
    out["question_count"] = text.str.count(r"\?")
    out["upper_count"] = text.str.count(r"[A-Z]")
    out["digit_count"] = text.str.count(r"\d")

    # The +1 keeps the ratio defined for empty strings.
    out["upper_ratio"] = out["upper_count"] / (out["char_len"] + 1)
    out["digit_ratio"] = out["digit_count"] / (out["char_len"] + 1)

    out["multi_exclaim"] = text.str.contains(r"!!+").astype(int)
    out["multi_question"] = text.str.contains(r"\?\?+").astype(int)
    return out

# Attach the handcrafted features to both splits.
train_df = add_text_features(train_df, "text")
test_df = add_text_features(test_df, "text")

# Structured columns passed on to the model.
struct_cols = [
    "char_len",
    "word_count",
    "exclaim_count",
    "question_count",
    "upper_ratio",
    "digit_ratio",
    "multi_exclaim",
    "multi_question",
]

# Force every structured column to numeric; unparseable values become 0.
for frame in (train_df, test_df):
    for name in struct_cols:
        frame[name] = pd.to_numeric(frame[name], errors="coerce").fillna(0)


# -----------------------------
# 4) Text -> TF-IDF -> SVD embeddings (multimodal fusion via features)
# -----------------------------
# TF-IDF (char ngrams often strong for spam/fake)
# Character n-gram TF-IDF: fitted on the training text, then applied to test.
tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=2,
                        max_features=200000, sublinear_tf=True)
X_tfidf_train = tfidf.fit_transform(train_df["text"])
X_tfidf_test = tfidf.transform(test_df["text"])

# Compress the sparse TF-IDF matrix into a dense numeric matrix for H2O.
svd_dim = 300  # try 100/300/500
svd = TruncatedSVD(n_components=svd_dim, random_state=42)
X_svd_train = svd.fit_transform(X_tfidf_train)
X_svd_test = svd.transform(X_tfidf_test)

svd_cols = [f"svd_{i}" for i in range(svd_dim)]
svd_train_df = pd.DataFrame(X_svd_train, columns=svd_cols)
svd_test_df = pd.DataFrame(X_svd_test, columns=svd_cols)

# Final modelling tables, column order: [structured] + [svd embeddings] + label.
train_final = pd.concat(
    [part.reset_index(drop=True)
     for part in (train_df[struct_cols], svd_train_df, train_df[["flagged"]])],
    axis=1,
)
test_final = pd.concat(
    [part.reset_index(drop=True)
     for part in (test_df[struct_cols], svd_test_df, test_df[["flagged"]])],
    axis=1,
)


# -----------------------------
# 5) H2O AutoML training
# -----------------------------
# Bring up the H2O cluster; adjust max_mem_size if needed.
h2o.init(max_mem_size="8G", nthreads=-1)

hf_train = h2o.H2OFrame(train_final)
hf_test = h2o.H2OFrame(test_final)

y_col = "flagged"

# Make the label categorical so the task is treated as classification.
for h2o_frame in (hf_train, hf_test):
    h2o_frame[y_col] = h2o_frame[y_col].asfactor()

# Every column except the label is a predictor.
x_cols = [name for name in hf_train.columns if name != y_col]

# max_models caps the search (max_runtime_secs is the alternative);
# the leaderboard is ranked by AUC.
aml = H2OAutoML(max_models=20, seed=42, sort_metric="AUC", nfolds=5)
aml.train(training_frame=hf_train, x=x_cols, y=y_col)

print(aml.leaderboard.head())


# -----------------------------
# 6) Evaluate on test set
# -----------------------------
# Score the test split with the leader and pull the predictions into pandas.
pred = aml.leader.predict(hf_test)
pred_df = pred.as_data_frame()

# Compare labels as 0/1 integers; the "predict" column holds the predicted class.
y_true = test_final["flagged"].astype(int).to_numpy()
y_pred = pred_df["predict"].astype(str).eq("1").astype(int).to_numpy()

for caption, score in (
    ("\nAccuracy:", accuracy_score(y_true, y_pred)),
    ("F1:", f1_score(y_true, y_pred)),
):
    print(caption, score)
print("\nReport:\n", classification_report(y_true, y_pred, digits=4))

# (Optional) Persist the leader model to disk.
model_path = h2o.save_model(aml.leader, path="h2o_models", force=True)
print("\n✅ Saved leader model to:", model_path)

# shutdown optional
# h2o.shutdown(prompt=False)


Train labels: (array([0, 1]), array([4993, 4933]))
Test labels : (array([0, 1]), array([1212, 1271]))
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.17-internal" 2025-10-21; OpenJDK Runtime Environment (build 17.0.17-internal+0-adhoc..src); OpenJDK 64-Bit Server VM (build 17.0.17-internal+0-adhoc..src, mixed mode, sharing)
  Starting server from /home/michael/tpot_jupyter/miniconda3/envs/h2o_env/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpoc1qaq7a
  JVM stdout: /tmp/tmpoc1qaq7a/h2o_michael_started_from_python.out
  JVM stderr: /tmp/tmpoc1qaq7a/h2o_michael_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,00 secs
H2O_cluster_timezone:,Europe/Rome
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.9
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_michael_9wlyyc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
11:49:44.544: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

█
11:49:48.744: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

██
11:49:55.223: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

██
11:50:02.564: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

█
11:50:06.155: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

██
11:50:33.181: _train param, Dropping bad and constant columns: [question_count




In [3]:
### Text-only H2O AutoML

In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, f1_score, classification_report

import h2o
from h2o.automl import H2OAutoML

# -----------------------------
# A) Read TRAIN / TEST (TSV)
# -----------------------------
# Load each split, keep only the review text and its label, drop incomplete
# rows, and pin the dtypes (text -> str, label -> int).
train_df = (
    pd.read_csv("new_data_train_Yelp_Fake_Review.csv", sep="\t", engine="python")
    .loc[:, ["reviewContent", "flagged"]]
    .dropna()
    .reset_index(drop=True)
    .astype({"reviewContent": str, "flagged": int})
)
test_df = (
    pd.read_csv("new_data_test_Yelp_Fake_Review.csv", sep="\t", engine="python")
    .loc[:, ["reviewContent", "flagged"]]
    .dropna()
    .reset_index(drop=True)
    .astype({"reviewContent": str, "flagged": int})
)

# Class balance of each split as (labels, counts).
for caption, frame in (("Train labels:", train_df), ("Test labels :", test_df)):
    print(caption, np.unique(frame["flagged"], return_counts=True))

# -----------------------------
# B) Clean text
# -----------------------------
# Links: anything starting with "http" or "www." up to the next whitespace.
_url_re = re.compile(r"http\S+|www\.\S+")
# Tag-shaped markup only (<br/>, </p>, <a href="...">). The looser pattern
# "<.*?>" also deleted ordinary text lying between a stray "<" and a later
# ">", e.g. "I <3 this place, rating > 4".
_html_re = re.compile(r"</?[A-Za-z][^<>]*>")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """Normalise one review for vectorisation.

    Steps, in order: coerce to ``str``, replace HTML tags with a space,
    replace links with the token ``URL``, collapse every whitespace run
    (tabs and newlines included) into one space, and trim both ends.

    Args:
        s: Raw review text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, single-line text.
    """
    s = str(s)
    s = _html_re.sub(" ", s)
    s = _url_re.sub(" URL ", s)
    # "\s+" already covers \t, \n and \r, so no separate replacement is needed.
    return _space_re.sub(" ", s).strip()

# Cleaned review text for each split, kept as standalone Series.
train_text, test_text = (
    frame["reviewContent"].map(clean_text) for frame in (train_df, test_df)
)

# -----------------------------
# C) TF-IDF -> SVD (text embeddings)
# -----------------------------
# Character n-gram TF-IDF: fitted on the training text, then applied to test.
tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=2,
                        max_features=200000, sublinear_tf=True)
X_tfidf_train = tfidf.fit_transform(train_text)
X_tfidf_test = tfidf.transform(test_text)

# Compress the sparse TF-IDF matrix into dense SVD components.
svd_dim = 300  # try 100, 300, 500
svd = TruncatedSVD(n_components=svd_dim, random_state=42)
X_svd_train = svd.fit_transform(X_tfidf_train)
X_svd_test = svd.transform(X_tfidf_test)

# Modelling tables: SVD components followed by the label column.
svd_cols = [f"svd_{i}" for i in range(svd_dim)]
train_final = pd.DataFrame(X_svd_train, columns=svd_cols).assign(
    flagged=train_df["flagged"].to_numpy()
)
test_final = pd.DataFrame(X_svd_test, columns=svd_cols).assign(
    flagged=test_df["flagged"].to_numpy()
)

# -----------------------------
# D) H2O AutoML
# -----------------------------
# Bring up the H2O cluster.
h2o.init(max_mem_size="8G", nthreads=-1)

hf_train = h2o.H2OFrame(train_final)
hf_test = h2o.H2OFrame(test_final)

y_col = "flagged"

# Make the label categorical so the task is treated as classification.
for h2o_frame in (hf_train, hf_test):
    h2o_frame[y_col] = h2o_frame[y_col].asfactor()

# Every column except the label is a predictor.
x_cols = [name for name in hf_train.columns if name != y_col]

# Search capped at 20 models; leaderboard ranked by AUC.
aml = H2OAutoML(max_models=20, seed=42, sort_metric="AUC", nfolds=5)
aml.train(training_frame=hf_train, x=x_cols, y=y_col)

print(aml.leaderboard.head())

# -----------------------------
# E) Evaluate
# -----------------------------
# Score the test split with the leader and pull the predictions into pandas.
pred = aml.leader.predict(hf_test)
pred = pred.as_data_frame()

# Compare labels as 0/1 integers; the "predict" column holds the predicted class.
y_true = test_final["flagged"].astype(int).to_numpy()
y_pred = pred["predict"].astype(str).eq("1").astype(int).to_numpy()

for caption, score in (
    ("\nAccuracy:", accuracy_score(y_true, y_pred)),
    ("F1:", f1_score(y_true, y_pred)),
):
    print(caption, score)
print("\nReport:\n", classification_report(y_true, y_pred, digits=4))

# Persist the leader model to disk.
model_path = h2o.save_model(aml.leader, path="h2o_models_text_only", force=True)
print("\n✅ Saved leader model to:", model_path)


Train labels: (array([0, 1]), array([4993, 4933]))
Test labels : (array([0, 1]), array([1212, 1271]))
Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,26 mins 24 secs
H2O_cluster_timezone:,Europe/Rome
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.9
H2O_cluster_version_age:,24 days
H2O_cluster_name:,H2O_from_python_michael_9wlyyc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.806 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse       mse
StackedEnsemble_AllModels_1_AutoML_2_20251219_121607     0.73236    0.605165  0.71174                 0.353016  0.457239  0.209068
StackedEnsemble_BestOfFamily_1_AutoML_2_20251219_121607  0.731645   0.605793  0.712376                0.364793  0.457572  0.209372
GLM_1_AutoML_2_20251219_121607                           0.726277   0.609756  0.708473                0.352727  0.459524  0.211162
GBM_1_AutoML_2_20251219_121607                           0.698138   0.630039  0.671533                0.389657  0.469082  0.220038
GBM_3_AutoML_2_20251219_121607                           0.6




In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, f1_score, classification_report

import h2o
from h2o.automl import H2OAutoML

# -----------------------------
# A) Read TRAIN / TEST (TSV)
# -----------------------------
# Load each split, keep only the review text and its label, drop incomplete
# rows, and pin the dtypes (text -> str, label -> int).
train_df = (
    pd.read_csv("new_data_train_Yelp_Fake_Review.csv", sep="\t", engine="python")
    .loc[:, ["reviewContent", "flagged"]]
    .dropna()
    .reset_index(drop=True)
    .astype({"reviewContent": str, "flagged": int})
)
test_df = (
    pd.read_csv("new_data_test_Yelp_Fake_Review.csv", sep="\t", engine="python")
    .loc[:, ["reviewContent", "flagged"]]
    .dropna()
    .reset_index(drop=True)
    .astype({"reviewContent": str, "flagged": int})
)

# Class balance of each split as (labels, counts).
for caption, frame in (("Train labels:", train_df), ("Test labels :", test_df)):
    print(caption, np.unique(frame["flagged"], return_counts=True))

# -----------------------------
# B) Clean text
# -----------------------------
# Links: anything starting with "http" or "www." up to the next whitespace.
_url_re = re.compile(r"http\S+|www\.\S+")
# Tag-shaped markup only (<br/>, </p>, <a href="...">). The looser pattern
# "<.*?>" also deleted ordinary text lying between a stray "<" and a later
# ">", e.g. "I <3 this place, rating > 4".
_html_re = re.compile(r"</?[A-Za-z][^<>]*>")
_space_re = re.compile(r"\s+")

def clean_text(s: str) -> str:
    """Normalise one review for vectorisation.

    Steps, in order: coerce to ``str``, replace HTML tags with a space,
    replace links with the token ``URL``, collapse every whitespace run
    (tabs and newlines included) into one space, and trim both ends.

    Args:
        s: Raw review text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, single-line text.
    """
    s = str(s)
    s = _html_re.sub(" ", s)
    s = _url_re.sub(" URL ", s)
    # "\s+" already covers \t, \n and \r, so no separate replacement is needed.
    return _space_re.sub(" ", s).strip()

# Store the cleaned review text in a new "text" column on both splits.
for frame in (train_df, test_df):
    frame["text"] = frame["reviewContent"].map(clean_text)

# -----------------------------
# C) Structured features
# -----------------------------
def add_text_features(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` extended with surface statistics of ``df[col]``.

    Missing values in ``col`` are treated as empty strings. The input frame
    is not modified, so calling this repeatedly on the same frame always
    gives the same result.

    Added columns:
        char_len, word_count: length in characters / whitespace-separated tokens.
        exclaim_count, question_count: number of "!" / "?" characters.
        upper_count, digit_count: number of ASCII upper-case letters / digits.
        upper_ratio, digit_ratio: the two counts divided by ``char_len + 1``.
        multi_exclaim, multi_question: 1 if the text contains "!!" / "??", else 0.

    Args:
        df: Frame holding the text column.
        col: Name of the text column to analyse.

    Returns:
        A new DataFrame with the original columns followed by the features.
    """
    text = df[col].fillna("").astype(str)

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()

    out["char_len"] = text.str.len()
    out["word_count"] = text.str.split().map(len)
    out["exclaim_count"] = text.str.count("!")
    out["question_count"] = text.str.count(r"\?")
    out["upper_count"] = text.str.count(r"[A-Z]")
    out["digit_count"] = text.str.count(r"\d")

    # The +1 keeps the ratio defined for empty strings.
    out["upper_ratio"] = out["upper_count"] / (out["char_len"] + 1)
    out["digit_ratio"] = out["digit_count"] / (out["char_len"] + 1)

    out["multi_exclaim"] = text.str.contains(r"!!+").astype(int)
    out["multi_question"] = text.str.contains(r"\?\?+").astype(int)
    return out

# Attach the handcrafted features to both splits.
train_df = add_text_features(train_df, "text")
test_df = add_text_features(test_df, "text")

# Structured columns passed on to the model.
struct_cols = [
    "char_len",
    "word_count",
    "exclaim_count",
    "question_count",
    "upper_ratio",
    "digit_ratio",
    "multi_exclaim",
    "multi_question",
]

# Force every structured column to numeric; unparseable values become 0.
for frame in (train_df, test_df):
    for name in struct_cols:
        frame[name] = pd.to_numeric(frame[name], errors="coerce").fillna(0)

# -----------------------------
# D) TF-IDF -> SVD embeddings
# -----------------------------
# Character n-gram TF-IDF: fitted on the training text, then applied to test.
tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), min_df=2,
                        max_features=200000, sublinear_tf=True)
X_tfidf_train = tfidf.fit_transform(train_df["text"])
X_tfidf_test = tfidf.transform(test_df["text"])

# Compress the sparse TF-IDF matrix into dense SVD components.
svd_dim = 300
svd = TruncatedSVD(n_components=svd_dim, random_state=42)
X_svd_train = svd.fit_transform(X_tfidf_train)
X_svd_test = svd.transform(X_tfidf_test)

svd_cols = [f"svd_{i}" for i in range(svd_dim)]
svd_train_df = pd.DataFrame(X_svd_train, columns=svd_cols)
svd_test_df = pd.DataFrame(X_svd_test, columns=svd_cols)

# -----------------------------
# E) Combine (multimodal fusion via features)
# -----------------------------
# Column order: [structured] + [svd embeddings] + label.
train_final = pd.concat(
    [part.reset_index(drop=True)
     for part in (train_df[struct_cols], svd_train_df, train_df[["flagged"]])],
    axis=1,
)
test_final = pd.concat(
    [part.reset_index(drop=True)
     for part in (test_df[struct_cols], svd_test_df, test_df[["flagged"]])],
    axis=1,
)

# -----------------------------
# F) H2O AutoML
# -----------------------------
# Bring up the H2O cluster.
h2o.init(max_mem_size="8G", nthreads=-1)

hf_train = h2o.H2OFrame(train_final)
hf_test = h2o.H2OFrame(test_final)

y_col = "flagged"

# Make the label categorical so the task is treated as classification.
for h2o_frame in (hf_train, hf_test):
    h2o_frame[y_col] = h2o_frame[y_col].asfactor()

# Every column except the label is a predictor.
x_cols = [name for name in hf_train.columns if name != y_col]

# Search capped at 20 models; leaderboard ranked by AUC.
aml = H2OAutoML(max_models=20, seed=42, sort_metric="AUC", nfolds=5)
aml.train(training_frame=hf_train, x=x_cols, y=y_col)

print(aml.leaderboard.head())

# -----------------------------
# G) Evaluate
# -----------------------------
# Score the test split with the leader and pull the predictions into pandas.
pred = aml.leader.predict(hf_test)
pred = pred.as_data_frame()

# Compare labels as 0/1 integers; the "predict" column holds the predicted class.
y_true = test_final["flagged"].astype(int).to_numpy()
y_pred = pred["predict"].astype(str).eq("1").astype(int).to_numpy()

for caption, score in (
    ("\nAccuracy:", accuracy_score(y_true, y_pred)),
    ("F1:", f1_score(y_true, y_pred)),
):
    print(caption, score)
print("\nReport:\n", classification_report(y_true, y_pred, digits=4))

# Persist the leader model to disk.
model_path = h2o.save_model(aml.leader, path="h2o_models_multimodal", force=True)
print("\n✅ Saved leader model to:", model_path)


Train labels: (array([0, 1]), array([4993, 4933]))
Test labels : (array([0, 1]), array([1212, 1271]))
Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 days 2 hours 5 mins
H2O_cluster_timezone:,Europe/Rome
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.9
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_michael_9wlyyc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.625 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
13:55:12.246: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

█
13:55:15.348: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

██
13:55:20.326: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

██
13:55:27.793: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

█
13:55:31.468: _train param, Dropping bad and constant columns: [question_count, exclaim_count, upper_ratio, multi_exclaim, multi_question]

███
13:55:57.75: _train param, Dropping bad and constant columns: [question_count

In [1]:
# Summarise the AutoML winner. This relies on `aml` from the training cell
# still being alive in the current kernel; on a fresh kernel it raises
# NameError, so run the AutoML cell first.
leader = aml.leader
for caption, value in (("Leader:", leader.model_id), ("Algo:", leader.algo)):
    print(caption, value)


NameError: name 'aml' is not defined

In [4]:
import matplotlib.pyplot as plt

# Schematic of the AutoML leader: leader box -> base models -> metalearner ->
# final prediction. Requires `aml` from the AutoML training cell.
leader = aml.leader

# base_models / metalearner() are stacked-ensemble attributes. The leaderboard
# also holds plain models (GLM, GBM, ...); if one of those is the leader the
# diagram falls back to "leader -> prediction" instead of failing.
is_stacked = leader.algo == "stackedensemble"
base_models = list(leader.base_models or []) if is_stacked else []
metalearner = leader.metalearner() if is_stacked else None

plt.figure(figsize=(12, 6))
plt.axis("off")
# All coordinates below assume a unit square. Without fixed limits the axes
# autoscale to the connector lines only and the text boxes land outside them.
plt.xlim(0, 1)
plt.ylim(0, 1)

# Leader box
leader_kind = "StackedEnsemble" if is_stacked else leader.algo.upper()
plt.text(0.5, 0.9, f"{leader_kind} (Leader)\n{leader.model_id}",
         ha="center", va="center", bbox=dict(boxstyle="round", pad=0.5))

# Base models: show the first 6 if many, spread evenly and centred on x = 0.5
y = 0.65
x_start = 0.1
shown = base_models[:6]
x_step = 0.8 / max(1, len(shown))

for i, bm in enumerate(shown):
    # The half-step offset centres each box in its slot (a single box sits at 0.5).
    x = x_start + (i + 0.5) * x_step
    plt.text(x, y, f"Base model\n{bm}",
             ha="center", va="center", bbox=dict(boxstyle="round", pad=0.3))
    plt.plot([x, 0.5], [y-0.05, 0.78], linewidth=1)

if len(base_models) > 6:
    plt.text(0.5, 0.55, f"... + {len(base_models)-6} more base models ...",
             ha="center", va="center")

if is_stacked:
    # Metalearner box
    plt.text(0.5, 0.35, f"Metalearner\n{metalearner.algo.upper()}\n{metalearner.model_id}",
             ha="center", va="center", bbox=dict(boxstyle="round", pad=0.5))
    plt.plot([0.5, 0.5], [0.78, 0.45], linewidth=2)
else:
    # Single-model leader: connect it straight to the prediction box.
    plt.plot([0.5, 0.5], [0.82, 0.18], linewidth=2)

plt.text(0.5, 0.1, "Final Prediction (flagged 0/1)",
         ha="center", va="center", bbox=dict(boxstyle="round", pad=0.4))

plt.show()


NameError: name 'aml' is not defined

In [3]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, f1_score, classification_report

import h2o
from h2o.automl import H2OAutoML