# Implementation of Basic Statistics

In [None]:
# 📦 Imports
import json
import pandas as pd

file_path = '/Users/santiornsan/UserIntelligence/data/raw/luxury_beauty_reviews.json'

# Load the JSON Lines file (one JSON document per line) on a best-effort basis:
# lines that fail to parse are skipped, but they are counted and reported so
# that dropped records do not go unnoticed.
data = []
skipped_lines = 0
with open(file_path, 'r', encoding='utf-8') as f:
    for raw_line in f:
        record_text = raw_line.strip()
        if not record_text:
            # Blank lines carry no record; do not count them as malformed.
            continue
        try:
            record = json.loads(record_text)
        except json.JSONDecodeError:
            skipped_lines += 1  # Malformed line: skip it, but keep a tally
        else:
            data.append(record)

if skipped_lines:
    print(f"⚠️ Skipped {skipped_lines} malformed line(s) in {file_path}.")

df = pd.DataFrame(data)

# ✅ Clean and prep the data
# Keep only the columns used by the statistics below (those that are present).
required_columns = ['asin', 'overall', 'verified', 'reviewTime', 'reviewText']
present_columns = [col for col in required_columns if col in df.columns]

# Rows missing any of these fields are dropped. The explicit .copy() makes the
# result an independent frame, so the column assignments below never write
# into a view of the raw data (the pattern behind SettingWithCopyWarning).
df = df[present_columns].dropna(
    subset=['asin', 'overall', 'reviewText', 'reviewTime']
).copy()

# Unparseable dates become NaT (errors='coerce') and those rows are removed.
df['reviewTime'] = pd.to_datetime(df['reviewTime'], errors='coerce')
df = df[df['reviewTime'].notna()].copy()

# A missing 'verified' flag is treated as "not verified". A bare astype(bool)
# would turn NaN into True and silently count such reviews as verified.
if 'verified' in df.columns:
    df['verified'] = df['verified'].notna() & df['verified'].astype(bool)
else:
    df['verified'] = False

# Review length in whitespace-separated words; non-string text counts as 0.
df['reviewLength'] = df['reviewText'].apply(
    lambda text: len(text.split()) if isinstance(text, str) else 0
)

print(f"✅ Cleaned and loaded {len(df)} reviews.")
df.head()

# 📊 Define Stats Function
def get_basic_stats(group):
    """Summarise the reviews in one group (one product when used with groupby).

    Args:
        group: DataFrame with the columns 'overall' (numeric rating),
            'verified' (boolean flag), 'reviewTime' (datetime) and
            'reviewLength' (word count).

    Returns:
        pd.Series with the entries:
            total_reviews: number of rows in the group.
            avg_rating: mean of 'overall', rounded to 2 decimals.
            avg_rating_weighted: mean of 'overall' where verified reviews
                weigh 2 and unverified reviews weigh 1, rounded to 2 decimals.
            percent_verified: fraction (0..1) of verified reviews, rounded
                to 2 decimals.
            rating_dist: dict mapping each integer rating to its count.
            avg_review_length: mean of 'reviewLength', rounded to 2 decimals.
            most_common_review_month: 'YYYY-MM' string of the month with the
                most reviews, or None when no row has a valid 'reviewTime'.
    """
    # Imported here so the function does not depend on a Counter import
    # existing elsewhere in the notebook.
    from collections import Counter

    total_reviews = len(group)
    ratings = group['overall']
    avg_rating = ratings.mean()

    # Verified reviews count double: weight 2 if verified, otherwise 1.
    weights = group['verified'].astype(bool).astype(int) + 1
    avg_rating_weighted = (ratings * weights).sum() / weights.sum()

    percent_verified = group['verified'].mean()
    rating_dist = {
        int(rating): int(count)
        for rating, count in ratings.value_counts().sort_index().items()
    }
    avg_review_length = group['reviewLength'].mean()

    # Drop missing timestamps first; otherwise they are stringified to 'NaT'
    # and could be reported as the most common month.
    valid_times = group['reviewTime'].dropna()
    if valid_times.empty:
        most_common_month = None
    else:
        month_year = valid_times.dt.to_period("M").astype(str)
        most_common_month = Counter(month_year).most_common(1)[0][0]

    return pd.Series({
        'total_reviews': total_reviews,
        'avg_rating': round(avg_rating, 2),
        'avg_rating_weighted': round(avg_rating_weighted, 2),
        'percent_verified': round(percent_verified, 2),
        'rating_dist': rating_dist,
        'avg_review_length': round(avg_review_length, 2),
        'most_common_review_month': most_common_month
    })

# 🔁 Compute Stats Per Product
# Hand get_basic_stats only the columns it reads. Without the explicit
# selection, apply() also passes the grouping column ('asin') into the
# function, which newer pandas versions flag with a deprecation warning.
stat_columns = ['overall', 'verified', 'reviewTime', 'reviewLength']
grouped = df.groupby('asin')
basic_stats_df = grouped[stat_columns].apply(get_basic_stats).reset_index()

# 📤 Convert to JSON-style records (a list of dicts, one per product)
basic_stats_json = basic_stats_df.to_dict(orient='records')

# 👁️ Optional: Preview the first result
from pprint import pprint
if basic_stats_json:
    pprint(basic_stats_json[0])
else:
    # No product survived cleaning; indexing [0] would raise IndexError.
    print("No product statistics to preview.")

✅ Cleaned and loaded 34264 reviews.
{'asin': 'B00004U9V2',
 'avg_rating': 4.75,
 'avg_rating_weighted': 4.76,
 'avg_review_length': 22.0,
 'most_common_review_month': '2017-02',
 'percent_verified': 0.88,
 'rating_dist': {3: 2, 4: 2, 5: 20},
 'total_reviews': 24}


  basic_stats_df = grouped.apply(get_basic_stats).reset_index()
