
# 01 — EDA (Exploratory Data Analysis)

**Project:** Google Reviews NLP  
**Goal:** Explore the raw dataset to understand data quality, distributions, and guide cleaning/splitting decisions.

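> Tip: Run this notebook with the project's virtual environment active and the repository root as the working directory (the paths below are resolved from `Path.cwd()`). Keep outputs light before committing.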


In [None]:

# Imports & setup
import os
from pathlib import Path
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Display options
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.max_rows", 20)

# Ensure plots show inline (typical in Jupyter)
%matplotlib inline

# Paths (adjust if needed)
PROJECT_ROOT = Path.cwd()  # if running from the repo root
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW DATA PATH:", DATA_RAW)
print("PROCESSED PATH:", DATA_PROCESSED)


## Load Data

In [None]:

from glob import glob

def robust_read_csv(path: Path, sep=",", encoding="utf-8", **kwargs) -> pd.DataFrame:
    """Read CSV with common fallbacks for encoding/separators."""
    try:
        return pd.read_csv(path, sep=sep, encoding=encoding, **kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(path, sep=sep, encoding="latin-1", **kwargs)
    except pd.errors.ParserError:
        # try semicolon as fallback
        return pd.read_csv(path, sep=";", encoding=encoding, **kwargs)

def load_all_raw(data_dir: Path) -> pd.DataFrame:
    """Concatenate every ``*.csv`` directly inside ``data_dir`` into one frame.

    Each successfully read file contributes its rows plus a ``__source_file``
    column holding the file name. Files that fail to load are reported with a
    warning and skipped. An empty DataFrame is returned when there are no CSVs
    or none could be read.
    """
    csv_paths = sorted(data_dir.glob("*.csv"))
    if len(csv_paths) == 0:
        print(f"[WARN] No CSVs found in {data_dir}. Place raw files there.")
        return pd.DataFrame()

    loaded = []
    for csv_path in csv_paths:
        try:
            frame = robust_read_csv(csv_path)
            frame["__source_file"] = csv_path.name
        except Exception as e:
            # Best effort: one unreadable file should not abort the whole load.
            print(f"[WARN] Failed to read {csv_path}: {e}")
        else:
            loaded.append(frame)

    if not loaded:
        return pd.DataFrame()
    return pd.concat(loaded, ignore_index=True)

# Pull every raw CSV into a single frame, report its size, and preview the top rows
df = load_all_raw(DATA_RAW)
print("Rows:", df.shape[0])
df.iloc[:10]


## Quick Inspection

In [None]:

# Column dtypes and per-column missing-value counts
print("dtypes:\n", df.dtypes)
print("\nNulls per column:\n", df.isna().sum())

# Numeric/overall describe
# describe() raises ValueError on a frame with no columns, which is what
# load_all_raw returns when no CSV could be loaded, so skip it in that case.
if len(df.columns) == 0:
    print("[INFO] DataFrame has no columns; skipping describe().")
else:
    display(df.describe(include="all").transpose().head(30))


## Light Normalization (in-notebook helpers)

In [None]:

# Expected columns (adjust to your scraper schema):
# name | rating | number_of_photos | message
expected_cols = ["name", "rating", "number_of_photos", "message"]
for _missing in [col for col in expected_cols if col not in df.columns]:
    print(f"[WARN] Column missing: {_missing}")

# Text columns: always present afterwards, as stripped strings with "" for missing values
for _text_col in ("name", "message"):
    if _text_col in df.columns:
        _raw_text = df[_text_col]
    else:
        _raw_text = pd.Series([None] * len(df))
    df[_text_col] = _raw_text.fillna("").astype(str).str.strip()

# rating: keep the untouched values in 'rating_raw', then coerce to numeric
# (anything unparseable becomes NaN); an absent column becomes all-NaN
_has_rating = "rating" in df.columns
if _has_rating:
    df["rating_raw"] = df["rating"]
df["rating"] = pd.to_numeric(df["rating"], errors="coerce") if _has_rating else np.nan

# photos: normalize to string "0" when empty/non-numeric (to match your test expectations)
if "number_of_photos" not in df.columns:
    df["number_of_photos_norm"] = "0"
else:
    photos = df["number_of_photos"].fillna("").astype(str).str.strip()
    _is_blank = photos.eq("")
    # Blank -> "0", then drop a trailing ".0"/".00" left over from float parsing
    photos = photos.mask(_is_blank, "0").str.replace(r"\.0+$", "", regex=True)
    # Whatever is still not a plain run of digits is treated as "0"
    _is_digits = photos.str.fullmatch(r"\d+")
    photos = photos.where(_is_digits, "0")
    df["number_of_photos_norm"] = photos

# Basic sanity: show the raw photo column next to the normalized one when it exists
_preview_cols = ["name", "rating", "number_of_photos_norm", "message"]
if "number_of_photos" in df.columns:
    _preview_cols.insert(2, "number_of_photos")
display(df.head(10)[_preview_cols])


## Rating Distribution

In [None]:

# Bar chart of how many reviews carry each numeric rating value
if df["rating"].isna().all():
    print("[INFO] No numeric ratings found.")
else:
    # NaN ratings are left out; sort_index orders bars by rating, not by frequency
    vc = df["rating"].value_counts(dropna=True).sort_index()
    print(vc)

    plt.figure()
    vc.plot.bar()
    plt.title("Rating Distribution")
    plt.xlabel("Rating")
    plt.ylabel("Count")
    plt.show()


## Message Length & Text Stats

In [None]:

def word_count(s: str) -> int:
    """Return the number of whitespace-separated tokens in ``s``.

    Anything that is not a ``str`` counts as zero words.
    """
    if isinstance(s, str):
        # split() without arguments breaks on runs of whitespace and never
        # produces empty tokens, so the list length is the word count.
        return len(s.split())
    return 0

def _char_count(text):
    """Length of ``text`` in characters; non-strings count as 0."""
    return len(text) if isinstance(text, str) else 0


# Per-message features: character length, word count, and whether a URL is present
_messages = df["message"]
df["len_chars"] = _messages.apply(_char_count)
df["len_words"] = _messages.apply(word_count)
df["has_url"] = _messages.str.contains(r"http[s]?://", case=False, regex=True, na=False)

_feature_cols = ["name", "rating", "len_chars", "len_words", "has_url"]
display(df[_feature_cols].head(10))

# Histogram of message length measured in words
plt.figure()
df["len_words"].hist(bins=30)
plt.title("Message Length (words)")
plt.xlabel("Words")
plt.ylabel("Count")
plt.show()

if df["rating"].notna().any():
    # Boxplot of len_words by rating.
    # groupby drops NaN keys by default, so every group key is a real number.
    grouped = []
    labels = []
    for _rating_value, _word_counts in df.groupby("rating")["len_words"]:
        grouped.append(_word_counts.dropna().values)
        # ":g" prints 5.0 as "5" but keeps 4.5 as "4.5", so fractional
        # ratings no longer collapse onto the same truncated label.
        labels.append(f"{_rating_value:g}")

    plt.figure()
    plt.boxplot(grouped, showfliers=False)
    # Boxes sit at positions 1..N by default. Labelling through xticks avoids
    # the `labels=` keyword, which newer matplotlib releases deprecate.
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.title("Message Length by Rating")
    plt.xlabel("Rating")
    plt.ylabel("Words")
    plt.show()


## Authors & Potential Spam Signals

In [None]:

if "name" not in df.columns:
    print("[INFO] 'name' column not available.")
else:
    # The 20 most frequent author names
    author_counts = df["name"].value_counts().head(20)
    print("Top authors by number of reviews:")
    print(author_counts)

    # Suspects: ultra-short messages (3 words or fewer) or messages containing a URL
    _is_very_short = df["len_words"].le(3)
    suspects = df[_is_very_short | df["has_url"]]
    print("\nSuspect rows (first 10):")
    display(suspects.head(10)[["name","rating","len_words","has_url","message"]])


## Duplicate Checks

In [None]:

# Simple dupe check by exact message (can be expanded later)
if not df.empty:
    # Blank messages are all equal to each other, so they would be reported as
    # duplicates and, sorting first, crowd real repeats out of the 20-row
    # preview. Only messages with actual text are considered here.
    _has_text = df["message"].fillna("").astype(str).str.strip().ne("")
    _is_repeated = df.duplicated(subset=["message"], keep=False)
    dup_by_msg = df[_has_text & _is_repeated].sort_values("message").head(20)
    print("Potential duplicate messages (first 20):")
    display(dup_by_msg[["name","rating","message"]])
else:
    print("[INFO] DataFrame is empty.")


## Save a Lightweight Processed Preview (Optional)

In [None]:

# Write a slim CSV of the engineered columns for quick reuse elsewhere
PREVIEW_PATH = DATA_PROCESSED / "reviews_preview.csv"
cols = ["name","rating","number_of_photos_norm","len_chars","len_words","has_url","message"]
# Only export the columns that were actually produced by the cells above
existing_cols = [wanted for wanted in cols if wanted in df.columns]

if df.empty or not existing_cols:
    print("[INFO] Nothing to save (no data or columns missing).")
else:
    # Create data/processed (and parents) on first run; no error if it already exists
    DATA_PROCESSED.mkdir(parents=True, exist_ok=True)
    preview = df[existing_cols]
    preview.to_csv(PREVIEW_PATH, index=False)
    print(f"Saved preview -> {PREVIEW_PATH}")



## Conclusions & Next Steps

- Validate rating distribution and class balance.
- Confirm text cleaning rules (e.g., empty → drop? min length threshold?).
- Standardize `number_of_photos` as **string "0"** when empty to satisfy unit tests.
- Inspect authors with many reviews for potential spam/bots.
- Decide language handling (Portuguese vs. others) if needed.
- Proceed to `src/data/preprocess.py` (production-grade cleaning) and `src/data/split.py` (stratified + grouped split).

> Tip: Keep this notebook focused on **exploration**. Production logic should live in `src/` and be covered by tests in `tests/`.
