### Setup

In [1]:
import sys
import os
import numpy as np

# Put the parent directory on the import path so the `src` package imported below resolves.
# Assumes the kernel's working directory sits one level below the project root — TODO confirm.
sys.path.append(os.path.abspath(".."))

from src.data_processing import (
    process_csv_to_numpy, 
    save_processed_data,
    filter_k_core,
    split_train_test,
    add_time_features,
    standardize_ratings,
    perform_hypothesis_test
)

### Process raw data

In [2]:
# Input CSV and output directory, both relative to the kernel's working directory.
RAW_DATA_PATH = "../data/raw/ratings_Beauty.csv"
PROCESSED_FOLDER = "../data/processed"

# Parse the raw ratings file into an array, two id maps, and a collection of
# corrupted rows. The check below treats a `None` array as a failed run.
data, user_map, product_map, corrupted_rows = process_csv_to_numpy(RAW_DATA_PATH)

# Write the array and the maps to disk only when parsing produced data;
# otherwise report the failure and leave the processed folder untouched.
if data is not None:
    save_processed_data(data, user_map, product_map, PROCESSED_FOLDER)
else:
    print("Data processing failed")

Starting to process CSV file: ../data/raw/ratings_Beauty.csv
Converting data to NumPy array.
Saved data matrix to ../data/processed\data_processed.npy
Saved user and product maps to ../data/processed


### Check corrupted_rows

In [3]:
# Number of entries in `corrupted_rows`, as returned by process_csv_to_numpy.
print(f"Corrupted size: {len(corrupted_rows)}")

Corrupted size: 0


In [4]:
# Show the corrupted-row entries themselves for inspection.
# NOTE(review): this prints the whole collection; slice it if it can grow large.
print(f"Corrupted: {corrupted_rows}")

Corrupted: []


### Check data

In [5]:
# Row count of the parsed data (length of the first dimension of `data`).
print(f"Total data rows: {len(data)}")

Total data rows: 2023070


In [6]:
# Baseline (rows, columns) shape, for comparison with the shape after k-core filtering.
print(f"Data before k-core filtering: {data.shape}")

Data before k-core filtering: (2023070, 4)


### K-Core Filtering

In [7]:
# Apply k-core filtering with k=5 and bind the result to a new name.
# Presumably this drops users/products with fewer than k interactions —
# confirm the exact rule in src.data_processing.filter_k_core.
cleaned_data = filter_k_core(data, k=5)

In [8]:
# Shape after filtering, to see how many rows the k-core step kept.
print(f"Data after k-core filtering shape: {cleaned_data.shape}")

Data after k-core filtering shape: (198502, 4)


In [9]:
# Preview the first five filtered rows.
# Columns are presumably [user_id, product_id, rating, timestamp] — verify against the loader.
print(f"Data after k-core filtering: {cleaned_data[:5]}")

Data after k-core filtering: [[2.9300000e+02 1.1500000e+02 1.0000000e+00 1.3910400e+09]
 [3.0000000e+02 1.1500000e+02 3.0000000e+00 1.3977792e+09]
 [3.0200000e+02 1.1500000e+02 4.0000000e+00 1.3784256e+09]
 [3.1300000e+02 1.1500000e+02 2.0000000e+00 1.3864608e+09]
 [3.1400000e+02 1.1500000e+02 3.0000000e+00 1.3821408e+09]]


### Feature Engineering

In [10]:
# Derive time-based columns from the filtered data; the result has two extra columns.
# NOTE(review): in the printed sample the `year` column is fractional (e.g. 2014.11),
# not an integer year — keep that in mind when filtering by year.
enhanced_data = add_time_features(cleaned_data) # [user_id, product_id, rating, timestamp, year, weight]

In [11]:
# Shape after feature engineering; the column count should grow while rows stay the same.
print(f"Data shape after feature addition: {enhanced_data.shape}")

Data shape after feature addition: (198502, 6)


In [12]:
# Preview the first five rows, including the newly added time-feature columns.
print(f"Data after feature addition: {enhanced_data[:5]}")

Data after feature addition: [[2.93000000e+02 1.15000000e+02 1.00000000e+00 1.39104000e+09
  2.01410959e+03 9.68535262e-01]
 [3.00000000e+02 1.15000000e+02 3.00000000e+00 1.39777920e+09
  2.01432329e+03 9.82640145e-01]
 [3.02000000e+02 1.15000000e+02 4.00000000e+00 1.37842560e+09
  2.01370959e+03 9.42133816e-01]
 [3.13000000e+02 1.15000000e+02 2.00000000e+00 1.38646080e+09
  2.01396438e+03 9.58951175e-01]
 [3.14000000e+02 1.15000000e+02 3.00000000e+00 1.38214080e+09
  2.01382740e+03 9.49909584e-01]]


### Hypothesis testing

In [13]:
# Compare ratings between 2013 (year_a) and 2014 (year_b) at a 95% confidence level.
# NOTE(review): with samples this large, a statistically significant result can still
# reflect a very small difference in means — read the effect size, not only the verdict.
perform_hypothesis_test(enhanced_data, year_a=2013, year_b=2014, confidence_level=0.95)


Performing Z-Test: Ratings in 2014 > 2013?
Hypothesis: H0: Mean_B <= Mean_A | H1: Mean_B > Mean_A
Confidence Level: 95.0%
   2013: Mean=4.1963, N=84706
   2014: Mean=4.2118, N=60945
   Z-Score: 2.5212
   Result: Reject H0. Statistically Significant.


### Standardization

In [18]:
# Standardize the ratings; judging by the bound names, the helper also returns the
# mean and standard deviation it applied.
# NOTE(review): this runs on the full dataset before the train/test split in a later
# cell, so the statistics presumably include rows that end up in the test set —
# confirm this is intended, or compute them on the training portion only.
final_data, mean_rating, std_rating = standardize_ratings(enhanced_data)

In [19]:
# Preview the first five rows after standardization (compare the rating column with the earlier preview).
print(f"Final data after standardization: {final_data[:5]}")

Final data after standardization: [[ 2.93000000e+02  1.15000000e+02 -2.73483207e+00  1.39104000e+09
   2.01410959e+03  9.68535262e-01]
 [ 3.00000000e+02  1.15000000e+02 -1.02041396e+00  1.39777920e+09
   2.01432329e+03  9.82640145e-01]
 [ 3.02000000e+02  1.15000000e+02 -1.63204913e-01  1.37842560e+09
   2.01370959e+03  9.42133816e-01]
 [ 3.13000000e+02  1.15000000e+02 -1.87762302e+00  1.38646080e+09
   2.01396438e+03  9.58951175e-01]
 [ 3.14000000e+02  1.15000000e+02 -1.02041396e+00  1.38214080e+09
   2.01382740e+03  9.49909584e-01]]


In [20]:
# Show the statistics returned by standardize_ratings.
# NOTE(review): these are presumably needed to map standardized values back to the
# original rating scale, yet no cell visible here writes them to disk — confirm
# that downstream code can recover them.
print(f"Mean rating: {mean_rating}, Std rating: {std_rating}")

Mean rating: 4.190391028805755, Std rating: 1.1665765757272069


### Split train/test set

In [21]:
# Split the standardized data into train and test sets using train_ratio=0.8.
# NOTE(review): no seed is passed; if split_train_test shuffles randomly, the split
# may differ between runs — confirm how the helper orders or samples rows.
train_data, test_data = split_train_test(final_data, train_ratio=0.8)

In [22]:
# Report both partition shapes with one call; the newline separator puts each
# message on its own line, matching two consecutive print statements.
print(
    f"Train set shape: {train_data.shape}",
    f"Test set shape: {test_data.shape}",
    sep="\n",
)

Train set shape: (158801, 6)
Test set shape: (39701, 6)


### Save data

In [23]:
# Write each partition to its own .npy file inside the processed-data folder.
np.save(
    file=os.path.join(PROCESSED_FOLDER, "train_data.npy"),
    arr=train_data,
)
np.save(
    file=os.path.join(PROCESSED_FOLDER, "test_data.npy"),
    arr=test_data,
)