### Setup

In [None]:
import sys
import os
import numpy as np

# Put the parent of the working directory on the import path so the `src`
# package imported below can be found when the notebook is run from its own
# folder. The membership check keeps re-runs of this cell from piling up
# duplicate entries in sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data_processing import (
    process_csv_to_numpy, 
    save_processed_data,
    filter_k_core  
)

### Process raw data

In [2]:
# Input CSV and output folder, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw/ratings_Beauty.csv"
PROCESSED_FOLDER = "../data/processed"

# Parse the raw CSV. The helper hands back four values; `data` is None when
# processing did not succeed.
data, user_map, product_map, corrupted_rows = process_csv_to_numpy(RAW_DATA_PATH)

# Fail fast with a clear message: every later cell needs a real `data` array,
# so continuing after a failure would only surface as an unrelated TypeError.
# The names above are already assigned, so `corrupted_rows` can still be
# inspected manually after this error.
if data is None:
    raise RuntimeError(f"Data processing failed for {RAW_DATA_PATH}")

# Persist the parsed array and the two lookup maps to the processed folder.
save_processed_data(data, user_map, product_map, PROCESSED_FOLDER)

Starting to process CSV file: ../data/raw/ratings_Beauty.csv
Converting data to NumPy array.
Saved data matrix to ../data/processed\ratings_data.npy
Saved user and product maps to ../data/processed


### Check corrupted_rows

In [3]:
# How many input rows were reported as corrupted.
print("Corrupted size:", len(corrupted_rows))

Corrupted size: 0


In [4]:
# Show only a bounded preview of the corrupted rows; the full count is printed
# in the previous cell, and dumping every row could flood the notebook output.
print(f"Corrupted: {corrupted_rows[:20]}")

Corrupted: []


### Check data

In [5]:
# Number of rows in the parsed data.
print("Total data rows:", len(data))

Total data rows: 2023070


In [6]:
# Dimensions of the parsed data array.
print("Data shape:", data.shape)

Data shape: (2023070, 4)


### K-Core Filtering

In [8]:
# Threshold passed to the k-core filter.
# NOTE(review): presumably the filter keeps only users/products with at least
# this many ratings — confirm against src.data_processing.filter_k_core.
K_CORE = 5

cleaned_data = filter_k_core(data, k=K_CORE)

### Split train/test set

In [9]:
# Split configuration: fixed seed for a reproducible shuffle, 80% of the rows
# go to the training set.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Draw a shuffled row index instead of shuffling `cleaned_data` in place.
# This leaves the filtered array in its original order, so re-running this
# cell always produces the same split rather than reshuffling an
# already-shuffled array.
np.random.seed(SPLIT_SEED)
shuffled_idx = np.random.permutation(len(cleaned_data))

train_size = int(TRAIN_FRACTION * len(cleaned_data))
train_data = cleaned_data[shuffled_idx[:train_size]]
test_data = cleaned_data[shuffled_idx[train_size:]]

# Sanity check: the two subsets together account for every filtered row.
assert len(train_data) + len(test_data) == len(cleaned_data)

In [10]:
# Report the dimensions of both subsets produced by the split.
print("Train set shape:", train_data.shape)
print("Test set shape:", test_data.shape)

Train set shape: (158801, 4)
Test set shape: (39701, 4)


### Save final datasets

In [11]:
# Destination files for the two subsets, both inside the processed-data folder.
train_path, test_path = (
    os.path.join(PROCESSED_FOLDER, filename)
    for filename in ("train_data.npy", "test_data.npy")
)

In [12]:
# Write each subset to its .npy file.
np.save(file=train_path, arr=train_data)
np.save(file=test_path, arr=test_data)