In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
!pip install fuzzywuzzy python-Levenshtein



In [5]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Added Naive Bayes
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz

# Download NLTK resources if not already downloaded
nltk.download("stopwords")
nltk.download("vader_lexicon")
nltk.download("punkt")
nltk.download("punkt_tab") # Added punkt_tab download

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
#Configuration
REVIEWS_PATH = "/content/drive/MyDrive/bigdata-all-beauty-amazon/first10kreviewsall_beauty.jsonl"
META_PATH = "/content/drive/MyDrive/bigdata-all-beauty-amazon/first10kmetaall_beauty.jsonl"
OUTPUT_DIR = "/content/drive/MyDrive/bigdata-all-beauty-amazon/results2-afterchangingnames/"
#REVIEWS_PATH = "/content/drive/MyDrive/bigdata-all-beauty-amazon/All_Beauty.jsonl"
#META_PATH = "/content/drive/MyDrive/bigdata-all-beauty-amazon/meta_All_Beauty.jsonl"
#OUTPUT_DIR = "/content/drive/MyDrive/bigdata-all-beauty-amazon/part_c_results/"


# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Data Loading
print("Loading data...")
reviews_df = pd.read_json(REVIEWS_PATH, lines=True)
meta_df = pd.read_json(META_PATH, lines=True)

#  Data Preprocessing and Merging
print("Preprocessing and merging data")

# Function to clean price, handling various formats and NaNs
def clean_price(price_val):
    if isinstance(price_val, (int, float)):
        return float(price_val)
    if isinstance(price_val, str):
        # Remove currency symbols, commas, and then convert to float
        cleaned_price = price_val.replace('$', '').replace(',', '')
        try:
            return float(cleaned_price)
        except ValueError:
            return np.nan
    return np.nan

meta_df["price_cleaned"] = meta_df["price"].apply(clean_price)
meta_df["price_cleaned"].fillna(meta_df["price_cleaned"].median(), inplace=True)

# Convert 'categories' column to string representation
# Assuming categories is a list of lists, take the first element of the first list
meta_df["categories_str"] = meta_df["categories"].apply(lambda x: str(x[0][0]) if isinstance(x, list) and len(x) > 0 and len(x[0]) > 0 else "Unknown")

# Merge dataframes on parent_asin
merged_df = pd.merge(reviews_df, meta_df, on="parent_asin", how="left", suffixes=("_review", "_meta"))

# Convert helpful_vote to numeric, handling non-numeric values by coercing to NaN
merged_df["helpful_vote"] = pd.to_numeric(merged_df["helpful_vote"], errors="coerce").fillna(0)

# Fill NaN prices in merged_df using the cleaned price column from meta_df
merged_df["price_cleaned"].fillna(merged_df["price_cleaned"].median(), inplace=True)

# Drop rows where essential columns are missing for analysis
merged_df.dropna(subset=["rating", "text"], inplace=True)

# Feature engineering
print("Performing feature engineering.")

# 1. Sentiment Analysis
sia = SentimentIntensityAnalyzer()
merged_df["sentiment_score"] = merged_df["text"].apply(lambda x: sia.polarity_scores(x)["compound"])

# 2. Price Anomaly Detection (using IQR for robustness)
def detect_price_anomaly(df):
    if len(df) < 2: # Ensure there are enough data points for IQR calculation
        df["price_anomaly"] = 0
        return df
    Q1 = df["price_cleaned"].quantile(0.25)
    Q3 = df["price_cleaned"].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df["price_anomaly"] = ((df["price_cleaned"] < lower_bound) | (df["price_cleaned"] > upper_bound)).astype(int)
    return df

# Apply price anomaly detection globally if 'categories_str' is problematic or has too many unique values
if 'categories_str' in merged_df.columns and merged_df['categories_str'].nunique() > 1:
    try:
        merged_df = merged_df.groupby("categories_str", group_keys=False).apply(detect_price_anomaly)
    except Exception as e:
        print(f"Warning: Groupby by categories_str failed ({e}). Applying price anomaly detection globally.")
        merged_df = detect_price_anomaly(merged_df)
else:
    print("Warning: 'categories_str' column not suitable for groupby. Applying price anomaly detection globally.")
    merged_df = detect_price_anomaly(merged_df)

merged_df["price_anomaly"].fillna(0, inplace=True) # Fill NaN for categories with too few data points or if applied globally

# 3. Brand Consistency Check (NLP-based)
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", "", text) # Remove punctuation
    tokens = word_tokenize(text)
    return " ".join([word for word in tokens if word not in stop_words])

merged_df["cleaned_title_meta"] = merged_df["title_meta"].apply(clean_text)
merged_df["cleaned_store"] = merged_df["store"].apply(clean_text)

def check_brand_consistency(row):
    title_meta = str(row["cleaned_title_meta"])
    store = str(row["cleaned_store"])

    if not store or store == "nan": # Handle empty or NaN store names
        return 0 # Assume consistent if no store info

    # Check if store name is present in product title (fuzzy matching)
    if fuzz.partial_ratio(store, title_meta) > 80: # Threshold for fuzzy matching
        return 0 # Consistent
    return 1 # Inconsistent

merged_df["brand_inconsistency"] = merged_df.apply(check_brand_consistency, axis=1)

# 4. Feature/Description Consistency
def check_feature_description_consistency(row):
    features = str(row["features"])
    description = str(row["description"])

    if not features or features == "nan" or not description or description == "nan":
        return 0 # Assume consistent if missing info

    # Simple check: if common contradictory terms are present in both
    # This is a simplified example, a more robust solution would involve NLP models
    contradictory_pairs = [
        ("waterproof", "not waterproof"),
        ("durable", "fragile"),
        ("new", "used"),
        ("genuine", "fake"),
        ("authentic", "replica")
    ]

    for pair in contradictory_pairs:
        if pair[0] in description.lower() and pair[1] in description.lower():
            return 1 # Inconsistent
        if pair[0] in features.lower() and pair[1] in features.lower():
            return 1 # Inconsistent
    return 0 # Consistent

merged_df["feature_description_inconsistency"] = merged_df.apply(check_feature_description_consistency, axis=1)

# 5. Genuine Score (Target Variable)
# A lower genuine score indicates higher suspicion
# We will define "suspicious" as a binary target for classification models
# For simplicity, let's define suspicious based on a combination of flags

# Create a binary target variable: 1 for suspicious, 0 for genuine
# Introduce noise to the target variable to make it less perfectly separable
np.random.seed(42) # for reproducibility

# Initial suspicious flag based on existing rules
merged_df["initial_suspicious"] = (
    (merged_df["price_anomaly"] == 1) |
    (merged_df["brand_inconsistency"] == 1) |
    (merged_df["feature_description_inconsistency"] == 1)
).astype(int)

# Introduce noise: flip some labels based on a probability
# For example, flip 10% of the labels
flip_probability = 0.10

merged_df["is_suspicious"] = merged_df["initial_suspicious"].apply(
    lambda x: 1 - x if np.random.rand() < flip_probability else x
)

# Add review length as a feature
merged_df["review_length"] = merged_df["text"].apply(len)

# Select features for models
features = [
    "rating",
    "helpful_vote",
    "sentiment_score",
    "price_cleaned",
    "price_anomaly",
    "brand_inconsistency",
    "feature_description_inconsistency",
    "review_length",
    "verified_purchase"
]

X = merged_df[features].copy()
y = merged_df["is_suspicious"]

# Handle missing values in features (e.g., fill with median or mean)
X["price_cleaned"].fillna(X["price_cleaned"].median(), inplace=True)
X["helpful_vote"].fillna(X["helpful_vote"].median(), inplace=True)
X["review_length"].fillna(X["review_length"].median(), inplace=True)
X["verified_purchase"] = X["verified_purchase"].astype(int) # Ensure it's numeric

# Scale numerical features
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[["rating", "helpful_vote", "sentiment_score", "price_cleaned", "review_length"]] = scaler.fit_transform(X_scaled[["rating", "helpful_vote", "sentiment_score", "price_cleaned", "review_length"]])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# --- Model Training and Evaluation with Hyperparameter Tuning ---
print("Training and evaluating models with hyperparameter tuning...")

# Define models and their parameter grids for GridSearchCV
models_and_params = {
    "Logistic Regression": {
        "model": LogisticRegression(random_state=42, solver='liblinear'),
        "params": {
            "C": [0.1, 1.0, 10.0],
            "penalty": ['l1', 'l2']
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2]
        }
    },
    "Support Vector Machine": {
        "model": SVC(probability=True, random_state=42),
        "params": {
            "C": [0.1, 1.0, 10.0],
            "kernel": ["linear", "rbf"]
        }
    },
    "K-Nearest Neighbors": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"]
        }
    },
    "Naive Bayes": {
        "model": GaussianNB(),
        "params": {}
    }
}

results = {}
roc_curves = {}

plt.figure(figsize=(10, 8))
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

for name, mp in models_and_params.items():
    print(f"\n--- Tuning and Training {name} ---")

    # Default (bad) configuration evaluation
    default_model = mp["model"]
    default_model.fit(X_train, y_train)
    y_pred_default = default_model.predict(X_test)
    y_proba_default = default_model.predict_proba(X_test)[:, 1]

    results[f"{name} (Default)"] = {
        "accuracy": accuracy_score(y_test, y_pred_default),
        "precision_0": precision_score(y_test, y_pred_default, pos_label=0),
        "recall_0": recall_score(y_test, y_pred_default, pos_label=0),
        "f1_0": f1_score(y_test, y_pred_default, pos_label=0),
        "precision_1": precision_score(y_test, y_pred_default, pos_label=1),
        "recall_1": recall_score(y_test, y_pred_default, pos_label=1),
        "f1_1": f1_score(y_test, y_pred_default, pos_label=1),
        "roc_auc": roc_auc_score(y_test, y_proba_default)
    }

    # Hyperparameter tuning with GridSearchCV
    if mp["params"]:
        grid_search = GridSearchCV(mp["model"], mp["params"], cv=3, scoring='f1', n_jobs=-1, verbose=0)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best parameters for {name}: {grid_search.best_params_}")
    else:
        best_model = mp["model"] # No tuning for Naive Bayes for now

    # Optimal configuration evaluation
    y_pred_optimal = best_model.predict(X_test)
    y_proba_optimal = best_model.predict_proba(X_test)[:, 1]

    results[f"{name} (optimal)"] = {
        "accuracy": accuracy_score(y_test, y_pred_optimal),
        "precision_0": precision_score(y_test, y_pred_optimal, pos_label=0),
        "recall_0": recall_score(y_test, y_pred_optimal, pos_label=0),
        "f1_0": f1_score(y_test, y_pred_optimal, pos_label=0),
        "precision_1": precision_score(y_test, y_pred_optimal, pos_label=1),
        "recall_1": recall_score(y_test, y_pred_optimal, pos_label=1),
        "f1_1": f1_score(y_test, y_pred_optimal, pos_label=1),
        "roc_auc": roc_auc_score(y_test, y_proba_optimal)
    }

    # ROC Curve for Optimal Model
    fpr, tpr, _ = roc_curve(y_test, y_proba_optimal)
    roc_auc = auc(fpr, tpr)
    roc_curves[name] = (fpr, tpr, roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=.8, label=f'{name} (AUC = {roc_auc:.2f})')

    # Confusion Matrix for Optimal Model
    cm = confusion_matrix(y_test, y_pred_optimal)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=["genuine", "suspicious"], yticklabels=["genuine", "suspicious"])
    plt.title(f'{name} confusion matrix (optimal)')
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.savefig(os.path.join(OUTPUT_DIR, f'{name.replace(" ", "_")}_confusion_matrix_optimal.png'))
    plt.show()
    plt.close()

    # Confusion Matrix for Default Model
    cm_default = confusion_matrix(y_test, y_pred_default)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm_default, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=["genuine", "suspicious"], yticklabels=["genuine", "suspicious"])
    plt.title(f'{name} confusion matrix (default)')
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.savefig(os.path.join(OUTPUT_DIR, f'{name.replace(" ", "_")}_confusion_matrix_default.png'))
    plt.show()
    plt.close()

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver operating characteristic (ROC) curves (optimal models)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
plt.savefig(os.path.join(OUTPUT_DIR, 'roc_curves_comparison_optimal.png'))
plt.close()

print("\n--- Model Performance Summary ---")
with open(os.path.join(OUTPUT_DIR, "model_performance_summary_with_tuning.txt"), "w") as f:
    for name, metrics in results.items():
        print(f"\n{name}:\n")
        f.write(f"\n{name}:\n")
        for metric_name, value in metrics.items():
            metric_display = metric_name.replace("_", " ").title()
            print(f"  {metric_display}: {value:.4f}")
            f.write(f"  {metric_display}: {value:.4f}\n")

print("Script finished. Results and visualizations saved to", OUTPUT_DIR)

Loading data...
Preprocessing and merging data...
Performing feature engineering...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  meta_df["price_cleaned"].fillna(meta_df["price_cleaned"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["price_cleaned"].fillna(merged_df["price_cleaned"].median(), inplace=True)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["price_anomaly"].fillna(0, inplace=True) # Fill NaN for categories with too few data points or if applied globally


Training and evaluating models with hyperparameter tuning...

--- Tuning and Training Logistic Regression ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["price_cleaned"].fillna(X["price_cleaned"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["helpful_vote"].fillna(X["helpful_vote"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

Best parameters for Logistic Regression: {'C': 10.0, 'penalty': 'l1'}

--- Tuning and Training Random Forest ---
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 200}

--- Tuning and Training Gradient Boosting ---
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'n_estimators': 50}

--- Tuning and Training Support Vector Machine ---
Best parameters for Support Vector Machine: {'C': 1.0, 'kernel': 'linear'}

--- Tuning and Training K-Nearest Neighbors ---
Best parameters for K-Nearest Neighbors: {'n_neighbors': 5, 'weights': 'uniform'}

--- Tuning and Training Naive Bayes ---

--- Model Performance Summary ---

Logistic Regression (Default):

  Accuracy: 0.8910
  Precision 0: 0.8915
  Recall 0: 0.9913
  F1 0: 0.9387
  Precision 1: 0.8842
  Recall 1: 0.3552
  F1 1: 0.5068
  Roc Auc: 0.6559

Logistic Regression (optimal):

  Accuracy: 0.8917
  Precision 0: 0.8921
  Recall 0: 0.9913
  F1 0: 0.9391
  Precision 1: 0.8854
  Recall 1: 0.3594
  F1 1: 0.5113
  Ro