In [4]:
import pandas as pd
import numpy as np
import os
import gzip

WINDOW_SIZE = 30
SAMPLE_SIZE = 1000
# PATHSTR = "/teamspace/studios/this_studio/amazon_hackon/Data/Reviews with images/"

FILE_INFO = {
    "amazon_hackon/Data/Reviews with images/Cell_Phones_and_Accessories_5.json.gz": "Cell_Phones_and_Accessories",
    "amazon_hackon/Data/Reviews with images/Magazine_Subscriptions_5.json.gz": "Magazine_Subscriptions",
    "amazon_hackon/Data/Reviews with images/Appliances_5 (1).json.gz": "Appliances",
    "amazon_hackon/Data/Reviews with images/All_Beauty_5 (1).json.gz": "All_Beauty",
    "amazon_hackon/Data/Reviews with images/AMAZON_FASHION_5 (1).json.gz": "AMAZON_FASHION"
}

# --- Data Loading (No Changes) ---
def read_and_filter(file_path, category, sample_size=1000):
    print(f"-> Reading {os.path.basename(file_path)}...")
    try:
        compression = 'gzip' if file_path.endswith('.gz') else 'infer'
        df = pd.read_json(file_path, lines=True, compression=compression)
        df = df[df['image'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

    except Exception as e:
        print(f"[ERROR] Failed to read {file_path}: {e}")
        return pd.DataFrame()

    if 'reviewText' in df.columns:
            df = df[df['reviewText'].apply(lambda x: isinstance(x, str))]
    else:
        print(f"[WARNING] No 'reviewText' column in {file_path}")
        return pd.DataFrame()
    df['category'] = category

    if len(df) > SAMPLE_SIZE:
            df = df.sample(n=SAMPLE_SIZE, random_state=42)
    return df

# --- Feature Engineering Helper Functions (No Changes) ---
def _get_slope(series):
    if len(series) < 2: return np.nan
    return np.polyfit(np.arange(len(series)), series, 1)[0]

def _pos_neg_ratio(series):
    pos = np.sum(series >= 4)
    neg = np.sum(series <= 2)
    if neg == 0: return np.nan
    return pos / neg

# --- MODIFIED: Temporal Feature Creation (Reviewer Features Removed) ---
def create_temporal_features(df, window_size):
    """
    Generates temporal features for products and categories only.
    All reviewer-level calculations have been removed.
    """
    print(f"\nGenerating temporal features with window size: {window_size} (Product & Category only)...")
    
    # Sort by product and category time, removing reviewerID
    df = df.sort_values(by=['asin', 'category', 'unixReviewTime']).reset_index(drop=True)
    df['time_in_days'] = df['unixReviewTime'] / (24 * 3600)

    # Group-wise Rolling Objects
    product_rolling = df.groupby('asin').rolling(window=window_size, min_periods=1)
    category_rolling = df.groupby('category').rolling(window=window_size, min_periods=1)

    # --- Feature Creation (Product-Level) ---
    print("Calculating product-level features...")
    time_since_last_review = df.groupby('asin')['time_in_days'].diff()
    df['review_arrival_rate'] = 1 / time_since_last_review
    
    prod_stats = product_rolling['overall'].agg(['mean', 'std'])
    df['product_rolling_mean_rating'] = prod_stats['mean'].reset_index(level=0, drop=True)
    df['product_rolling_std_rating'] = prod_stats['std'].reset_index(level=0, drop=True)
    
    df['product_rating_trend'] = product_rolling['overall'].apply(_get_slope, raw=False).reset_index(level=0, drop=True)
    df['product_pos_neg_ratio'] = product_rolling['overall'].apply(_pos_neg_ratio, raw=True).reset_index(level=0, drop=True)
    df['product_cumulative_reviews'] = df.groupby('asin').cumcount() + 1
    
    # --- Feature Creation (Category-Level) ---
    print("Calculating category-level features...")
    cat_stats = category_rolling['overall'].agg(['mean', 'std'])
    df['category_rolling_mean_rating'] = cat_stats['mean'].reset_index(level=0, drop=True)
    df['category_rolling_std_rating'] = cat_stats['std'].reset_index(level=0, drop=True)
    df['category_rating_trend'] = category_rolling['overall'].apply(_get_slope, raw=False).reset_index(level=0, drop=True)
    
    # Clean up temporary columns
    df = df.drop(columns=['time_in_days'])
    
    return df

# --- Imputation (No Changes) ---
def impute_features(df):
    print("\nImputing NaN values...")
    # Impute rate and frequency related columns
    for col in df.columns:
        if 'rate' in col or 'frequency' in col:
            df[col].fillna(0, inplace=True)
            df[col].replace([np.inf, -np.inf], 0, inplace=True)
    # Impute rolling stats with median
    for col in df.columns:
        if 'rolling' in col or 'trend' in col or 'ratio' in col:
             if pd.api.types.is_numeric_dtype(df[col]):
                df[col].fillna(df[col].median(), inplace=True)
    print("Imputation complete.")
    return df

# --- NEW: Function to Prepare Data for MLP ---
def prepare_data_for_mlp(df):
    """
    Prepares the final dataframe for MLP input.
    - Selects only the engineered features and the target variable.
    - One-hot encodes the 'category' column.
    - Separates features (X) from the target (y).
    """
    print("\nPreparing data for MLP input...")
    
    # Define target and features
    target_col = 'overall'
    feature_cols = [
        'review_arrival_rate', 'product_rolling_mean_rating', 'product_rolling_std_rating',
        'product_rating_trend', 'product_pos_neg_ratio', 'product_cumulative_reviews',
        'category_rolling_mean_rating', 'category_rolling_std_rating', 'category_rating_trend'
    ]
    
    # Add 'category' to be one-hot encoded
    df_for_mlp = df[feature_cols + ['category', target_col]].copy()
    
    # One-hot encode the 'category' column
    df_for_mlp = pd.get_dummies(df_for_mlp, columns=['category'], prefix='cat')
    
    # Separate features (X) and target (y)
    y_mlp = df_for_mlp[target_col]
    X_mlp = df_for_mlp.drop(columns=[target_col])
    
    print("MLP data preparation complete.")
    return X_mlp, y_mlp

   

In [5]:
all_samples = []
for file_path, category_name in FILE_INFO.items():
    # file_path = os.path.join(PATHSTR, file_name)
    if os.path.exists(file_path):
        sample = read_and_filter(file_path, category_name, sample_size=SAMPLE_SIZE)
        if not sample.empty:
            all_samples.append(sample)
    else:
        print(f"[ERROR] File does not exist: {file_path}")

if all_samples:
    df_combined = pd.concat(all_samples, ignore_index=True)
    print(f"\nSuccessfully loaded and combined data. Shape: {df_combined.shape}")
    
    # 1. Create temporal features (product and category only)
    final_df = create_temporal_features(df_combined, window_size=WINDOW_SIZE)
    
    # 2. Impute missing values
    final_df = impute_features(final_df)
    print(f"\nDataFrame with temporal features is ready. Shape: {final_df.shape}")
    
    # 3. Prepare the data for the MLP model
    X_mlp, y_mlp = prepare_data_for_mlp(final_df)
    
    print("\n--- MLP-Ready Features (X_mlp) ---")
    print(f"Shape: {X_mlp.shape}")
    print(X_mlp.head())
    
    print("\n--- MLP-Ready Target (y_mlp) ---")
    print(f"Shape: {y_mlp.shape}")
    print(y_mlp.head())
    
    print("\n--- NaN Check in MLP Features ---")
    print(X_mlp.isnull().sum().to_string())
else:
    print("No data was loaded. Exiting.")


-> Reading Cell_Phones_and_Accessories_5.json.gz...


-> Reading Magazine_Subscriptions_5.json.gz...
-> Reading Appliances_5 (1).json.gz...
-> Reading All_Beauty_5 (1).json.gz...
-> Reading AMAZON_FASHION_5 (1).json.gz...

Successfully loaded and combined data. Shape: (2025, 13)

Generating temporal features with window size: 30 (Product & Category only)...
Calculating product-level features...
Calculating category-level features...

Imputing NaN values...
Imputation complete.

DataFrame with temporal features is ready. Shape: (2025, 22)

Preparing data for MLP input...
MLP data preparation complete.

--- MLP-Ready Features (X_mlp) ---
Shape: (2025, 14)
   review_arrival_rate  product_rolling_mean_rating  \
0              0.00000                     5.000000   
1              0.03125                     3.000000   
2              0.04000                     3.666667   
3              1.00000                     4.000000   
4              1.00000                     4.200000   

   product_rolling_std_rating  product_rating_trend  product_

In [6]:
final_df["category"].value_counts()

category
Cell_Phones_and_Accessories    1000
Appliances                      828
All_Beauty                       98
AMAZON_FASHION                   98
Magazine_Subscriptions            1
Name: count, dtype: int64

In [7]:
final_df.shape

(2025, 22)

In [8]:
final_df.to_pickle("Temporal_Final_DF.pkl")

In [9]:
final_df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,...,category,review_arrival_rate,product_rolling_mean_rating,product_rolling_std_rating,product_rating_trend,product_pos_neg_ratio,product_cumulative_reviews,category_rolling_mean_rating,category_rolling_std_rating,category_rating_trend
0,5,True,"02 23, 2018",A22V1MD93T2FW9,B00006L9LC,{'Size:': ' Small'},Heather Sharp,I bought this for my husband. Hed been having ...,Really great shampoo for sensitive skin that h...,1519344000,...,All_Beauty,0.00000,5.000000,0.000000,-1.282377e-16,0.0,1,5.000000,1.093345,-1.282377e-16
1,1,True,"03 27, 2018",A2V608ILSK1M5R,B00006L9LC,{'Size:': ' Small'},CDART815,My product was not sealed and either used or s...,Beware,1522108800,...,All_Beauty,0.03125,3.000000,2.828427,-4.000000e+00,1.0,2,3.000000,2.828427,-4.000000e+00
2,5,True,"04 21, 2018",A1VN560NNZQIR0,B00006L9LC,{'Size:': ' Small'},Shablinska,Cleansing properties are above any praise! Sup...,The best treat for my hair!,1524268800,...,All_Beauty,0.04000,3.666667,2.309401,1.640150e-16,2.0,3,3.666667,2.309401,1.640150e-16
3,5,True,"04 22, 2018",A1L0QECT7J93ZP,B00006L9LC,{'Size:': ' Small'},Elena,Got this product for me and my daughter. I ca...,For any type of hair,1524355200,...,All_Beauty,1.00000,4.000000,2.000000,4.000000e-01,3.0,4,4.000000,2.000000,4.000000e-01
4,5,True,"04 23, 2018",AX0ZEGHH0H525,B00006L9LC,{'Size:': ' Small'},Aida A,Suffered from itchiness under my hair for coup...,Scalp-healing,1524441600,...,All_Beauty,1.00000,4.200000,1.788854,4.000000e-01,4.0,5,4.200000,1.788854,4.000000e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,5,False,"05 15, 2017",AYKW6E1FFQAOA,B01HC81MT0,,Jeff,This is a great screen protector. Installation...,BEST WET INSTALL SCREEN PROTECTOR EVER!,1494806400,...,Cell_Phones_and_Accessories,0.00000,5.000000,0.000000,-1.282377e-16,0.0,1,4.000000,1.462167,1.557286e-02
2021,4,True,"01 25, 2017",A3H1XY9QEPSML7,B01HCH03HS,{'Color:': ' Black/Clear'},Desiree,So love the mean face case,Don't Touch Me !,1485302400,...,Cell_Phones_and_Accessories,0.00000,4.000000,0.000000,-1.282377e-16,0.0,1,3.966667,1.449931,2.246941e-02
2022,5,False,"07 19, 2016",A20P5W3NEE7CQ3,B01HGSOZFY,{'Color:': ' White & Blue'},JL,I received this set of two USB 2.0 wall charge...,Charges fine without any whine and doesn't get...,1468886400,...,Cell_Phones_and_Accessories,0.00000,5.000000,0.000000,-1.282377e-16,0.0,1,4.066667,1.412587,1.557286e-02
2023,5,True,"01 30, 2017",A3M8S9Z2LJLYPJ,B01HIJESIK,,Aisha,Does what it's supposed to do.\n\nI saw a lot ...,What can I say,1485734400,...,Cell_Phones_and_Accessories,0.00000,5.000000,0.000000,-1.282377e-16,0.0,1,4.066667,1.412587,2.803115e-02
