In [None]:
# Reading the data and splitting columns

import pandas as pd
# Load the raw review export. Malformed CSV rows are reported as warnings
# (on_bad_lines="warn") instead of aborting the whole read.
df = pd.read_csv(
    "Amazon_Reviews.csv",
    engine="python",
    on_bad_lines="warn"
)

# Split the composite text columns into their parts.
# `n` caps the number of splits so a row containing an unexpected extra
# separator cannot yield more columns than there are target names (which
# would make the assignment raise "Columns must be same length as key").
df[['year_posted', 'month_posted', 'daytime_posted']] = df['Review Date'].str.split('-', n=2, expand=True)
df[['month_ex', 'day_ex', 'year_ex']] = df['Date of Experience'].str.split(' ', n=2, expand=True)
df[['review count', 'extra']] = df['Review Count'].str.split(' ', n=1, expand=True)

# The second whitespace-separated token of 'Rating' is used as the score.
df['rating'] = df['Rating'].str.split().str[1]

# Coerce unparseable values to NaN rather than raising on the first bad row;
# the later cleaning/modeling cells already coerce these columns again and
# drop or fill the resulting NaNs.
df['rating'] = pd.to_numeric(df['rating'], errors="coerce")
df['review count'] = pd.to_numeric(df['review count'], errors="coerce")

In [None]:
# Cleaning data based on analysis plan

import numpy as np
import pandas as pd

# Build the analysis frame in one chain:
#   1) keep only the columns the analysis plan needs
#   2) make year / rating numeric (unparseable values become NaN)
#   3) drop rows where either of those could not be parsed
#   4) keep reviews posted in 2023-2024 and discard neutral ratings (3)
#   5) add the binary target: 1 = high (rating >= 4), 0 = low
clean_df = (
    df.loc[:, [
        'Country',
        'Review Title',
        'Review Text',
        'year_posted',
        'month_posted',
        'month_ex',
        'year_ex',
        'review count',
        'rating',
    ]]
    .copy()
    .assign(
        year_posted=lambda frame: pd.to_numeric(frame["year_posted"], errors="coerce"),
        rating=lambda frame: pd.to_numeric(frame["rating"], errors="coerce"),
    )
    .dropna(subset=["year_posted", "rating"])
    .loc[lambda frame: frame["year_posted"].isin([2023, 2024]) & (frame["rating"] != 3)]
    .assign(rating_binary=lambda frame: (frame["rating"] >= 4).astype(int))
)

# Modeling needs the review body, so rows without text are removed here.
model_df = clean_df.dropna(subset=["Review Text"]).copy()

model_df.head()


Unnamed: 0,Country,Review Title,Review Text,year_posted,month_posted,month_ex,year_ex,review count,rating,rating_group
0,US,A Store That Doesn't Want to Sell Anything,"I registered on the website, tried to order a ...",2024,09,September,2024,1.0,1.0,low
1,GB,Had multiple orders one turned up and…,Had multiple orders one turned up and driver h...,2024,09,September,2024,9.0,1.0,low
2,GB,I informed these reprobates,I informed these reprobates that I WOULD NOT B...,2024,09,September,2024,90.0,1.0,low
3,AU,Advertise one price then increase it on website,I have bought from Amazon before and no proble...,2024,09,September,2024,5.0,1.0,low
4,GB,If I could give a lower rate I would,If I could give a lower rate I would! I cancel...,2024,09,September,2024,8.0,1.0,low
...,...,...,...,...,...,...,...,...,...,...
12244,GB,Update on previous.,I just wanted to update on my previous review ...,2024,04,December,2020,8.0,1.0,low
17109,US,Amazon Selling Dangerous Product 12-22-2018,I have only written two reviews in the past. I...,2024,01,December,2018,18.0,2.0,low
17614,LT,"Had minor problems, but overall - great","I had some problems with them, such as fake se...",2023,12,August,2018,36.0,5.0,high
18465,US,Being flooded with far too many Chinese …,Amazon has become similar to a flea market of ...,2024,02,July,2017,6.0,1.0,low


In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# Text + Sentiment + Metadata Logistic Regression (High vs Low)
# Goal: Predict if a review gives a "high" or "low" rating by:
#  - using TF-IDF features from review text
#  - adding VADER sentiment scores as extra numeric features
#  - using class_weight="balanced" to account for imbalance
#  - tuning the probability threshold for the "high" class

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score


# 1) VADER sentiment features
# One shared analyzer instance is reused for every review.
analyzer = SentimentIntensityAnalyzer()


def vader_scores(text: str) -> pd.Series:
    """Score one piece of text with VADER.

    The input is passed through ``str()`` first. The result is a Series
    holding the four VADER scores under the labels ``sent_compound``,
    ``sent_pos``, ``sent_neu`` and ``sent_neg`` (in that order).
    """
    polarity = analyzer.polarity_scores(str(text))
    return pd.Series({
        "sent_compound": polarity["compound"],
        "sent_pos": polarity["pos"],
        "sent_neu": polarity["neu"],
        "sent_neg": polarity["neg"],
    })


# Score every review and store the four scores as new columns.
model_df[["sent_compound", "sent_pos", "sent_neu", "sent_neg"]] = model_df["Review Text"].apply(vader_scores)

# 2) Metadata features
# NOTE(review): the medians and the top-5 country list below are computed on
# all of model_df, i.e. before any train/test split is made.

# Numeric metadata: parse (invalid -> NaN), then fill gaps with the column median.
for meta_col in ("month_posted", "review count"):
    model_df[meta_col] = pd.to_numeric(model_df[meta_col], errors="coerce")
for meta_col in ("month_posted", "review count"):
    column_median = model_df[meta_col].median()
    model_df[meta_col] = model_df[meta_col].fillna(column_median)

# Country: keep the 5 most frequent values, collapse everything else to "Other".
top_countries = model_df["Country"].value_counts().nlargest(5).index
is_top_country = model_df["Country"].isin(top_countries)
model_df["Country_grouped"] = model_df["Country"].mask(~is_top_country, "Other")


# 3) Model inputs (X) and target (y)
# The column list mixes three kinds of input that are preprocessed differently
# later on: free text, one categorical column, and numeric columns.
sentiment_cols = ["sent_compound", "sent_pos", "sent_neu", "sent_neg"]
feature_cols = [
    "Review Text",       # free text
    "Country_grouped",   # categorical
    "review count",      # numeric
    "month_posted",      # numeric
    *sentiment_cols,     # numeric VADER scores
]

X = model_df.loc[:, feature_cols].copy()
y = model_df["rating_binary"].astype(int)


# 4) Hold out 20% for testing.
# stratify=y keeps the high/low proportion similar in both parts;
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 5) Preprocessing + model pipeline
# ColumnTransformer applies a different transformer to each group of columns:
#  - TF-IDF (uni- and bi-grams, English stop words removed) on "Review Text"
#  - standard scaling for the numeric features
#  - one-hot encoding for the grouped country (unknown categories are ignored)
# Any column not listed is dropped (remainder="drop").
preprocess = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=8000,
            min_df=2
        ), "Review Text"),

        ("num", StandardScaler(), [
            "review count", "month_posted",
            "sent_compound", "sent_pos", "sent_neu", "sent_neg"
        ]),

        ("cat", OneHotEncoder(handle_unknown="ignore"), ["Country_grouped"]),
    ],
    remainder="drop"
)

# Logistic Regression:
# - class_weight="balanced" reweights classes to account for the imbalance
# - solver="saga" is used on the sparse TF-IDF matrix
# - random_state is fixed because saga shuffles the data while fitting;
#   without a seed the coefficients and metrics change from run to run
clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("lr", LogisticRegression(
        max_iter=3000,
        solver="saga",
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])

# Train model
clf.fit(X_train, y_train)


# 6) Threshold tuning (optimize F1 for HIGH class)
# By default, predict() uses threshold = 0.50 for class 1.
# For imbalanced classification, adjusting the threshold can
# improve the "high" class metrics (precision/recall tradeoff).
#
# The threshold is chosen on out-of-fold predictions of the TRAINING data
# only. Picking it on the test set and then scoring on that same test set
# would report optimistically biased metrics.
from sklearn.model_selection import cross_val_predict

# Each training row is scored by a clone of the pipeline that was fitted on
# the other folds; column 1 is the probability of class 1 ("high").
oof_probs = cross_val_predict(
    clf, X_train, y_train,
    cv=5,
    method="predict_proba"
)[:, 1]

thresholds = np.arange(0.20, 0.91, 0.01)
f1_by_threshold = [
    f1_score(y_train, (oof_probs >= t).astype(int), pos_label=1)
    for t in thresholds
]

# argmax returns the first maximum, so ties resolve to the lowest threshold.
best_idx = int(np.argmax(f1_by_threshold))
best_t = float(thresholds[best_idx])
best_f1 = f1_by_threshold[best_idx]  # cross-validated F1 on the training data

# Apply the selected threshold once to the untouched test set.
probs = clf.predict_proba(X_test)[:, 1]  # probability of class 1 ("high")
final_preds = (probs >= best_t).astype(int)

# 7) Evaluation (test set)
print(f"Best threshold for HIGH (by F1): {best_t:.2f}\n")

print("High precision:", precision_score(y_test, final_preds, pos_label=1))
print("High recall:   ", recall_score(y_test, final_preds, pos_label=1))
print("High F1:       ", f1_score(y_test, final_preds, pos_label=1))

print("\nClassification report:")
print(classification_report(y_test, final_preds, target_names=["low (0)", "high (1)"]))


# 8) Feature importance
import matplotlib.pyplot as plt

# Pull the two fitted pipeline stages out once.
fitted_preprocess = clf.named_steps["preprocess"]
lr_model = clf.named_steps["lr"]

# Names of the non-text inputs, in the order the ColumnTransformer emits them:
# first the scaled numeric columns, then the one-hot country columns.
num_features = [
    "review count",
    "month_posted",
    "sent_compound",
    "sent_pos",
    "sent_neu",
    "sent_neg",
]
cat_features = fitted_preprocess.named_transformers_["cat"].get_feature_names_out(["Country_grouped"])
selected_features = np.concatenate([num_features, cat_features])

# The TF-IDF block is listed first in the ColumnTransformer, so its columns
# come first in the coefficient vector; everything after them belongs to the
# numeric + categorical features.
text_feature_count = len(fitted_preprocess.named_transformers_["text"].get_feature_names_out())
coefs_non_text = lr_model.coef_[0][text_feature_count:]

# Rank the non-text features by coefficient magnitude.
importance_df = (
    pd.DataFrame({
        "Feature": selected_features,
        "Coefficient": coefs_non_text,
        "Abs_Coefficient": np.abs(coefs_non_text),
    })
    .sort_values("Abs_Coefficient", ascending=False)
)

importance_df


Best threshold for HIGH (by F1): 0.70

High precision: 0.8154761904761905
High recall:    0.7784090909090909
High F1:        0.7965116279069767

Classification report:
              precision    recall  f1-score   support

     low (0)       0.97      0.97      0.97      1139
    high (1)       0.82      0.78      0.80       176

    accuracy                           0.95      1315
   macro avg       0.89      0.88      0.88      1315
weighted avg       0.95      0.95      0.95      1315



Unnamed: 0,Feature,Coefficient,Abs_Coefficient
3,sent_pos,0.941839,0.941839
5,sent_neg,-0.894738,0.894738
6,Country_grouped_CA,-0.831768,0.831768
7,Country_grouped_GB,-0.826486,0.826486
11,Country_grouped_US,-0.770949,0.770949
9,Country_grouped_NL,-0.64926,0.64926
10,Country_grouped_Other,-0.288671,0.288671
2,sent_compound,0.253928,0.253928
4,sent_neu,-0.150877,0.150877
8,Country_grouped_IN,-0.105956,0.105956
