In [1]:
# Cell 1: load dataframes and filter to the target book
import pandas as pd, numpy as np

df = pd.read_csv("books.csv")
reviews = pd.read_csv("reviews.csv")

target_asin = "000833739X"
book_reviews = reviews[
    (reviews["asin"].astype(str) == target_asin) |
    (reviews["parent_asin"].astype(str) == target_asin)
].copy().reset_index(drop=True)

print("Total reviews for book:", len(book_reviews))
book_reviews.head(3)


Total reviews for book: 2


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,date,year
0,5.0,Top Recommendation,perfect 2E01B6AA amazing experience,,000833739X,,,0,0,True,,0
1,5.0,Brilliant,Classic suspense from Alastair MacLean,[],000833739X,000833739X,AEQNV2PUYC4MB7EDSQHOV75SBMCA,1611019186345,0,True,2021-01-19 01:19:46.345,2021


In [2]:
# Cell 2: compute simple features and weak labels
book_reviews["text"] = book_reviews["text"].fillna("").astype(str)
book_reviews["text_len"] = book_reviews["text"].str.split().str.len()

super_words = ["best","amazing","perfect","awesome","incredible","fantastic","mustread","unreal"]
book_reviews["has_super"] = book_reviews["text"].str.lower().str.contains("|".join(super_words), na=False)

book_reviews["suspicious"] = (
    (book_reviews["rating"].astype(float) == 5.0) &
    (book_reviews["text_len"] < 20) &
    (book_reviews["has_super"])
).astype(int)

book_reviews["likely_genuine"] = (
    (book_reviews["rating"].astype(float) == 5.0) &
    (book_reviews["text_len"] >= 50)
).astype(int)

print("suspicious count:", int(book_reviews["suspicious"].sum()))
print("likely_genuine count:", int(book_reviews["likely_genuine"].sum()))

book_reviews[['rating','text_len','has_super','suspicious','likely_genuine']].head(6)


suspicious count: 1
likely_genuine count: 0


Unnamed: 0,rating,text_len,has_super,suspicious,likely_genuine
0,5.0,4,True,1,0
1,5.0,5,False,0,0


In [3]:
# Cell 3: vectorize and train
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

texts = book_reviews["text"].values
labels = book_reviews["suspicious"].values.copy()

# fallback: if only one class or too few positives
if labels.sum() == 0 or len(np.unique(labels)) == 1:
    print("Applying fallback suspicious labeling...")
    fallback_mask = (book_reviews["rating"].astype(float) == 5.0) & (book_reviews["text_len"] < 30)
    book_reviews.loc[fallback_mask, "suspicious"] = 1
    labels = book_reviews["suspicious"].values
    print("New suspicious count:", int(labels.sum()))

vec = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
X = vec.fit_transform(texts)

clf = LogisticRegression(max_iter=2000, class_weight='balanced')
clf.fit(X, labels)

joblib.dump((vec, clf), "suspicion_pipeline.joblib")
print("Model trained. Final suspicious positives:", int(labels.sum()))


Model trained. Final suspicious positives: 1


In [4]:
# Cell 4: score and select genuine reviews
vec, clf = joblib.load("suspicion_pipeline.joblib")
scores = clf.predict_proba(vec.transform(book_reviews["text"]))[:,1]
book_reviews["suspicion_score"] = scores

# With only 2 reviews, relax threshold aggressively
genuine = book_reviews[(book_reviews["suspicion_score"] < 0.8)].copy()

print("Genuine count:", len(genuine))
genuine = genuine.reset_index(drop=True)
genuine.head(5)


Genuine count: 2


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,date,year,text_len,has_super,suspicious,likely_genuine,suspicion_score
0,5.0,Top Recommendation,perfect 2E01B6AA amazing experience,,000833739X,,,0,0,True,,0,4,True,1,0,0.598938
1,5.0,Brilliant,Classic suspense from Alastair MacLean,[],000833739X,000833739X,AEQNV2PUYC4MB7EDSQHOV75SBMCA,1611019186345,0,True,2021-01-19 01:19:46.345,2021,5,False,0,0,0.401062


In [5]:
# Cell 5: SHAP for top negative features
import shap, numpy as np

Xg = vec.transform(genuine["text"].values)
bg = vec.transform(book_reviews["text"].values[:2])  # small background

explainer = shap.LinearExplainer(clf, bg, feature_perturbation="interventional")
shap_vals = explainer.shap_values(Xg)

mean_shap = np.mean(shap_vals, axis=0)
feature_names = vec.get_feature_names_out()

pairs = list(zip(feature_names, mean_shap))
pairs_sorted = sorted(pairs, key=lambda x: x[1])  # most negative first
top3 = pairs_sorted[:3]

print("Top 3 features:")
for token, val in top3:
    print(token, val)


Top 3 features:
2e01b6aa 0.0
2e01b6aa amazing 0.0
alastair 0.0


