In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, mode
import gzip

class AmazonReviewAnalyzer:
    """Sample image-bearing reviews from JSON-lines files and summarise their ratings.

    Parameters
    ----------
    file_paths : list of str
        Paths of the JSON-lines review files (gzip-compressed or plain).
    sample_size : int, default 1000
        Maximum number of filtered reviews kept per file.
    file_info : dict, optional
        Mapping of file path -> category label. A path that is absent from the
        mapping gets a label derived from its file name instead.

    Attributes
    ----------
    df_with_images : pandas.DataFrame or None
        Combined sample; ``None`` until ``load_all_samples`` has been called.
    """

    def __init__(self, file_paths, sample_size=1000, file_info=None):
        self.file_paths = file_paths
        self.sample_size = sample_size
        self.file_info = file_info or {}
        self.df_with_images = None

    @staticmethod
    def _category_from_path(file_path):
        """Return the file-name part before ``_5`` (e.g. ``Appliances_5 (1).json.gz`` -> ``Appliances``)."""
        file_name = str(file_path).replace('\\', '/').split('/')[-1]
        return file_name.split('_5')[0]

    def read_and_filter(self, file_path):
        """Read one review file and keep a sample of the reviews that have images and text.

        The file is first read as gzip-compressed JSON lines; if that fails it is
        retried as plain JSON lines. Rows are kept only when ``image`` is a
        non-empty list and ``reviewText`` is a string. A ``category`` column is
        added from ``file_info`` (or derived from the file name when the path is
        not mapped), and at most ``sample_size`` rows are returned.

        Returns
        -------
        pandas.DataFrame
            The filtered sample, or an empty frame when the file cannot be read
            or does not contain the ``image`` / ``reviewText`` columns.
        """
        try:
            df = pd.read_json(file_path, lines=True, compression='gzip')
        except (OSError, gzip.BadGzipFile, ValueError):
            try:
                df = pd.read_json(file_path, lines=True)
            except Exception as e:
                print(f"[ERROR] Failed to read {file_path}: {e}")
                return pd.DataFrame()

        # A file without any image (or text) field has no such column at all;
        # treat that as "nothing to sample" rather than failing with KeyError.
        missing = [col for col in ('image', 'reviewText') if col not in df.columns]
        if missing:
            print(f"[WARNING] {file_path} has no {missing} column(s); skipping.")
            return pd.DataFrame()

        # astype(bool) keeps the masks boolean even when the frame has no rows.
        has_image = df['image'].map(lambda x: isinstance(x, list) and len(x) > 0).astype(bool)
        has_text = df['reviewText'].map(lambda x: isinstance(x, str)).astype(bool)

        # Copy so the column assignment below writes to an independent frame
        # instead of a slice of the parsed data (avoids SettingWithCopyWarning).
        df = df[has_image & has_text].copy()
        df['category'] = self.file_info.get(file_path, self._category_from_path(file_path))

        if len(df) > self.sample_size:
            # Fixed seed so repeated reads of the same file give the same sample.
            df = df.sample(n=self.sample_size, random_state=42)

        return df

    def load_all_samples(self):
        """Read every file in ``file_paths`` and concatenate the non-empty samples.

        Stores the result in ``df_with_images``, prints its shape, columns and
        first rows, and returns it. The result is an empty frame when no file
        yields any rows.
        """
        all_samples = []
        for fp in self.file_paths:
            print(f"Reading file: {fp}")
            sample = self.read_and_filter(fp)
            if not sample.empty:
                all_samples.append(sample)
        if all_samples:
            self.df_with_images = pd.concat(all_samples, ignore_index=True)
        else:
            self.df_with_images = pd.DataFrame()
        print(f"Final shape: {self.df_with_images.shape}")
        print("Columns:", self.df_with_images.columns.tolist())
        print(self.df_with_images.head())
        return self.df_with_images

    def compute_rating_features(self, df=None):
        """Return summary statistics of the ``overall`` rating column.

        Parameters
        ----------
        df : pandas.DataFrame, optional
            Frame to summarise; defaults to ``df_with_images``.

        Returns
        -------
        dict
            Keys ``mean``, ``variance``, ``std_dev``, ``min``, ``max``,
            ``median``, ``mode``, ``skewness``, ``kurtosis`` and
            ``review_count``. An empty dict is returned (after printing a
            message) when there is no frame, no ``overall`` column, or no
            non-null rating.
        """
        if df is None:
            df = self.df_with_images
        if df is None or 'overall' not in df.columns or df.empty:
            print("[ERROR] DataFrame is empty or missing 'overall' column.")
            return {}
        ratings = df['overall'].dropna()
        if ratings.empty:
            print("[WARNING] No ratings available.")
            return {}
        return {
            'mean': ratings.mean(),
            'variance': ratings.var(),
            'std_dev': ratings.std(),
            'min': ratings.min(),
            'max': ratings.max(),
            'median': ratings.median(),
            # Series.mode() is sorted, so ties resolve to the smallest value.
            'mode': ratings.mode().iloc[0],
            'skewness': skew(ratings),
            'kurtosis': kurtosis(ratings),
            'review_count': len(ratings)
        }

    def compute_features_by_category(self, category_col='category'):
        """Return ``compute_rating_features`` for each distinct value of ``category_col``.

        When the loaded frame has no such column, labels are derived from the
        file names and assigned by file order; this re-reads every file to learn
        how many rows each one contributed, and adds the column to
        ``df_with_images``.

        Returns
        -------
        dict
            Mapping of category -> statistics dict; empty when no data is loaded.

        Raises
        ------
        ValueError
            If the file-order fallback cannot be applied because the re-read
            row counts do not add up to the number of loaded rows.
        """
        if self.df_with_images is None or self.df_with_images.empty:
            print("[ERROR] No data loaded.")
            return {}
        if category_col not in self.df_with_images.columns:
            print(f"[INFO] No '{category_col}' column found. Assigning by file order.")
            cat_list = []
            for fp in self.file_paths:
                cat_list.extend([self._category_from_path(fp)] * len(self.read_and_filter(fp)))
            if len(cat_list) != len(self.df_with_images):
                raise ValueError(
                    f"Cannot assign '{category_col}' by file order: the files yield "
                    f"{len(cat_list)} rows but {len(self.df_with_images)} rows are loaded."
                )
            self.df_with_images[category_col] = cat_list
        features_by_cat = {}
        for cat in self.df_with_images[category_col].unique():
            subdf = self.df_with_images[self.df_with_images[category_col] == cat]
            features_by_cat[cat] = self.compute_rating_features(subdf)
        return features_by_cat


In [3]:
# Review file -> category label attached to every row sampled from that file.
FILE_INFO = {
    "amazon_hackon/Data/Reviews with images/Cell_Phones_and_Accessories_5.json.gz": "Cell_Phones_and_Accessories",
    "amazon_hackon/Data/Reviews with images/Magazine_Subscriptions_5.json.gz": "Magazine_Subscriptions",
    "amazon_hackon/Data/Reviews with images/Appliances_5 (1).json.gz": "Appliances",
    "amazon_hackon/Data/Reviews with images/All_Beauty_5 (1).json.gz": "All_Beauty",
    "amazon_hackon/Data/Reviews with images/AMAZON_FASHION_5 (1).json.gz": "AMAZON_FASHION"
}

# The files to load are exactly the mapped ones, in the order listed above
# (dict keys keep their insertion order).
file_paths = list(FILE_INFO)

analyzer = AmazonReviewAnalyzer(file_paths, file_info=FILE_INFO, sample_size=1000)
df_combined = analyzer.load_all_samples()

# Statistics over the combined sample of all categories.
print("\n--- Overall Rating Statistics ---")
overall_features = analyzer.compute_rating_features()
for k, v in overall_features.items():
    print(f"{k}: {v}")

# The same statistics, broken down per category.
print("\n--- Per-Category Rating Statistics ---")
features_by_category = analyzer.compute_features_by_category()
for cat, feats in features_by_category.items():
    print(f"\nCategory: {cat}")
    for k, v in feats.items():
        print(f"  {k}: {v}")


Reading file: amazon_hackon/Data/Reviews with images/Cell_Phones_and_Accessories_5.json.gz


Reading file: amazon_hackon/Data/Reviews with images/Magazine_Subscriptions_5.json.gz
Reading file: amazon_hackon/Data/Reviews with images/Appliances_5 (1).json.gz
Reading file: amazon_hackon/Data/Reviews with images/All_Beauty_5 (1).json.gz
Reading file: amazon_hackon/Data/Reviews with images/AMAZON_FASHION_5 (1).json.gz
Final shape: (2025, 13)
Columns: ['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote', 'image', 'category']
   overall  verified   reviewTime      reviewerID        asin  \
0        5     False  01 21, 2016   AD44H3YP65YHL  B00L9ICE9M   
1        2      True  08 20, 2016  A2EAW9C31JE4L6  B00OS9E6AO   
2        5     False  11 30, 2015  A1DJ9ZJH1RKQIE  B017U7FQSG   
3        4     False   10 5, 2015  A2XOUBTCDPFMBB  B0126SWVPA   
4        5     False  08 16, 2016  A2PNMOCPBH09K8  B013OZ6J5C   

                                               style    reviewerName  \
0         {'Size:': ' i

In [4]:
# One row per category; the category label moves from the index into a
# regular 'category' column.
stats_df = (
    pd.DataFrame.from_dict(features_by_category, orient='index')
    .rename_axis('category')
    .reset_index()
)

print(stats_df)

                      category      mean  variance   std_dev  min  max  \
0  Cell_Phones_and_Accessories  4.031000  1.953993  1.397853    1    5   
1       Magazine_Subscriptions  5.000000       NaN       NaN    5    5   
2                   Appliances  4.748792  0.188330  0.433970    4    5   
3                   All_Beauty  4.602041  1.004944  1.002469    1    5   
4               AMAZON_FASHION  4.020408  2.989270  1.728951    1    5   

   median  mode  skewness  kurtosis  review_count  
0     5.0     5 -1.199197 -0.034661          1000  
1     5.0     5       NaN       NaN             1  
2     5.0     5 -1.147281 -0.683747           828  
3     5.0     5 -2.658339  6.184140            98  
4     5.0     5 -1.186447 -0.592342            98  


In [5]:
# Persist the per-category statistics table as a pickle file; the relative
# path resolves against the current working directory.
stats_df.to_pickle("Stats_Final_DF.pkl")