# Implementation of Basic Statistics

In [None]:
# 📦 Imports
import json
import pandas as pd
from collections import Counter

# NOTE(review): absolute, user-specific path — this only runs on the author's machine.
file_path = '/Users/santiornsan/UserIntelligence/data/raw/luxury_beauty_reviews.json'

# Load JSON Lines with safety: one JSON document per line. Lines that fail to
# parse are skipped, but counted so that silent data loss becomes visible.
data = []
skipped_lines = 0
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # Blank lines carry no record
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1  # Skip malformed lines

if skipped_lines:
    print(f"⚠️ Skipped {skipped_lines} malformed lines.")

df = pd.DataFrame(data)

# ✅ Clean and prep the data
# NOTE(review): reviewer_segmentation further down groups by 'reviewerID', which
# this column filter drops — confirm that function is fed a different frame.
required_columns = ['asin', 'overall', 'verified', 'reviewTime', 'reviewText']
# .copy() yields an independent frame, so the column assignments below never
# write into a slice of another frame (avoids pandas' SettingWithCopyWarning).
df = df[[col for col in required_columns if col in df.columns]].copy()
df = df.dropna(subset=['asin', 'overall', 'reviewText', 'reviewTime'])
# Unparseable dates become NaT and are removed on the next line.
df['reviewTime'] = pd.to_datetime(df['reviewTime'], errors='coerce')
df = df[df['reviewTime'].notnull()].copy()
# A bare astype(bool) maps NaN to True; a missing flag must not count as a
# verified purchase, so it is forced to False here.
df['verified'] = df['verified'].notna() & df['verified'].astype(bool)
# Review length is measured in whitespace-separated words; non-strings count as 0.
df['reviewLength'] = df['reviewText'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

print(f"✅ Cleaned and loaded {len(df)} reviews.")
df.head()

# 📊 Define Stats Function
def get_basic_stats(group):
    """
    Summarise one group of reviews (used per product via groupby('asin').apply).

    Reads the columns 'overall', 'verified', 'reviewLength' and 'reviewTime'
    from `group` and returns a pd.Series with:
      - total_reviews: number of rows in the group
      - avg_rating: mean of 'overall', rounded to 2 decimals
      - avg_rating_weighted: mean of 'overall' where truthy 'verified' rows
        weigh 2 and all other rows weigh 1, rounded to 2 decimals
      - percent_verified: mean of 'verified' (a 0-1 fraction), rounded to 2 decimals
      - rating_dist: dict mapping int(rating) -> count, ordered by rating
      - avg_review_length: mean of 'reviewLength', rounded to 2 decimals
      - most_common_review_month: 'YYYY-MM' string of the most frequent review
        month, or None when the group has no valid review date
    """
    total_reviews = len(group)
    ratings = group['overall']
    avg_rating = ratings.mean()

    # Verified purchases count double in the weighted average.
    weights = group['verified'].apply(lambda x: 2 if x else 1)
    avg_rating_weighted = (ratings * weights).sum() / weights.sum()

    percent_verified = group['verified'].mean()
    rating_dist = {int(k): v for k, v in ratings.value_counts().sort_index().to_dict().items()}
    avg_review_length = group['reviewLength'].mean()

    # Drop missing dates before counting: otherwise NaT rows are stringified to
    # 'NaT' and can win the vote as the "most common month".
    valid_dates = group['reviewTime'].dropna()
    if not valid_dates.empty:
        month_year = valid_dates.dt.to_period("M").astype(str)
        # Counter keeps first-seen order among ties.
        most_common_month = Counter(month_year).most_common(1)[0][0]
    else:
        most_common_month = None

    return pd.Series({
        'total_reviews': total_reviews,
        'avg_rating': round(avg_rating, 2),
        'avg_rating_weighted': round(avg_rating_weighted, 2),
        'percent_verified': round(percent_verified, 2),
        'rating_dist': rating_dist,
        'avg_review_length': round(avg_review_length, 2),
        'most_common_review_month': most_common_month
    })

# 🔁 Compute Stats Per Product
# One row of statistics per 'asin'; reset_index turns the group key back into a column.
grouped = df.groupby('asin')
basic_stats_df = grouped.apply(get_basic_stats).reset_index()

# 📤 Convert to JSON (records = list of dicts)
basic_stats_json = basic_stats_df.to_dict(orient='records')

# 👁️ Optional: Preview the first result
from pprint import pprint
# Guard the preview: indexing [0] on an empty result would raise IndexError.
if basic_stats_json:
    pprint(basic_stats_json[0])
else:
    print("No product statistics to preview.")


In [7]:
# src/analytics/group_metrics.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple


def temporal_grouping(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Summarise reviews per calendar year, per month number, and per month-year.

    Side effect: the columns 'review_year', 'review_month' and
    'review_month_year' (derived from 'reviewTime') are written onto the
    DataFrame that was passed in.

    Each returned frame holds the mean of 'overall', 'verified' and
    'reviewLength' plus a 'review_count' column (count of 'asin' values).
    The head of every frame is printed before returning
    (by-year, by-month, by-month-year).
    """
    review_time = df['reviewTime']
    df['review_year'] = review_time.dt.year
    df['review_month'] = review_time.dt.month
    df['review_month_year'] = review_time.dt.to_period('M').astype(str)

    aggregations = {
        'overall': 'mean',
        'verified': 'mean',
        'reviewLength': 'mean',
        'asin': 'count',
    }

    def _summarise(key: str) -> pd.DataFrame:
        # Same aggregation for every time bucket; 'asin' count becomes the review count.
        summary = df.groupby(key).agg(aggregations)
        return summary.rename(columns={'asin': 'review_count'})

    reviews_by_year, reviews_by_month, reviews_by_month_year = (
        _summarise(key) for key in ('review_year', 'review_month', 'review_month_year')
    )

    print("\n📆 Temporal Grouping Completed:")
    for label, frame in (
        ("Years:\n", reviews_by_year),
        ("Months:\n", reviews_by_month),
        ("Month-Year:\n", reviews_by_month_year),
    ):
        print(label, frame.head(), "\n")

    return reviews_by_year, reviews_by_month, reviews_by_month_year


def plot_temporal_grouping(month_year_df: pd.DataFrame):
    """
    Draw three stacked line charts against the frame's index: the
    'review_count', 'overall' and 'verified' columns, in that order.

    The 'verified' values are plotted exactly as stored in the frame.
    The figure is displayed with plt.show(); nothing is returned.
    """
    print("\n📊 Generating temporal trend plots...")
    plt.figure(figsize=(16, 12))

    # (column to plot, panel title, y-axis label), top to bottom.
    panels = (
        ('review_count', '📈 Total Reviews Over Time', 'Review Count'),
        ('overall', '⭐ Average Rating Over Time', 'Average Rating'),
        ('verified', '🔒 % Verified Purchases Over Time', '% Verified'),
    )
    last_row = len(panels)

    for row, (column, title, y_label) in enumerate(panels, start=1):
        plt.subplot(3, 1, row)
        sns.lineplot(x=month_year_df.index, y=month_year_df[column])
        plt.title(title)
        plt.ylabel(y_label)
        if row == last_row:
            # Only the bottom panel gets an explicit x-axis label.
            plt.xlabel('Month-Year')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


def reviewer_segmentation(df: pd.DataFrame, divergence_threshold: float = 1.0) -> pd.DataFrame:
    """
    Label each reviewer by how far their ratings sit from the product averages.

    For every review the product's mean 'overall' (per 'asin') is subtracted
    from the review's 'overall'; these differences are averaged per
    'reviewerID'. A reviewer whose absolute average difference is strictly
    greater than `divergence_threshold` is labelled 'divergent', otherwise
    'aligned'.

    Prints the label counts and the first rows, then returns a frame with the
    columns 'reviewerID', 'avg_rating_diff' and 'classification'. The frame
    passed in is not modified.
    """
    def _label(diff):
        # Strict comparison: a difference exactly at the threshold stays 'aligned'.
        if abs(diff) > divergence_threshold:
            return 'divergent'
        return 'aligned'

    per_product_mean = df.groupby('asin')['overall'].mean().rename('product_avg')
    enriched = df.join(per_product_mean, on='asin')
    enriched['rating_diff'] = enriched['overall'] - enriched['product_avg']

    reviewer_seg = (
        enriched.groupby('reviewerID')['rating_diff']
        .mean()
        .rename('avg_rating_diff')
        .to_frame()
    )
    reviewer_seg['classification'] = reviewer_seg['avg_rating_diff'].apply(_label)

    print("\n🧑‍⚖️ Reviewer Segmentation Summary:")
    print(reviewer_seg['classification'].value_counts(), "\n")
    print(reviewer_seg.head())

    return reviewer_seg.reset_index()


def plot_reviewer_segmentation(reviewer_df: pd.DataFrame):
    """
    Show two side-by-side charts for a reviewer frame: a count plot of the
    'classification' column and a 30-bin histogram (with KDE curve) of the
    'avg_rating_diff' column.

    The figure is displayed with plt.show(); nothing is returned.
    """
    def _draw_classification_counts():
        sns.countplot(x='classification', data=reviewer_df)
        plt.title('👥 Reviewer Classification')
        plt.ylabel('Number of Reviewers')

    def _draw_rating_diff_histogram():
        sns.histplot(data=reviewer_df, x='avg_rating_diff', bins=30, kde=True)
        plt.title('📊 Avg Rating Difference per Reviewer')
        plt.xlabel('Average Rating Difference')

    print("\n📊 Plotting reviewer segmentation results...")
    plt.figure(figsize=(12, 6))

    # Left panel first, then right panel.
    panel_drawers = (_draw_classification_counts, _draw_rating_diff_histogram)
    for slot, draw_panel in enumerate(panel_drawers, start=1):
        plt.subplot(1, 2, slot)
        draw_panel()

    plt.tight_layout()
    plt.show()