In [64]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
RAW_PATH = PROJECT_ROOT / "data"
PROCESSED_PATH = RAW_PATH / "processed"
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

# ratings.dat
ratings = pd.read_csv(RAW_PATH / "ratings.dat", delimiter=',')
gender = pd.read_csv(RAW_PATH / "gender.dat", delimiter=',')
ratings_test = pd.read_csv(RAW_PATH / "ratings-Test.dat", delimiter=',')

# Data types


## Raw memory usage 

In [65]:
print("ratings")
ratings.info(memory_usage='deep')

print("\ngender")
gender.info(memory_usage='deep')

ratings
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220037 entries, 0 to 3220036
Data columns (total 3 columns):
 #   Column     Dtype
---  ------     -----
 0   userID     int64
 1   profileID  int64
 2   rating     int64
dtypes: int64(3)
memory usage: 73.7 MB

gender
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220969 entries, 0 to 220968
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   1       220969 non-null  int64 
 1   F       220969 non-null  object
dtypes: int64(1), object(1)
memory usage: 13.9 MB


## Defining a custom data type
Trying to define a custom data type to better fit the data with less memory usage. 
However, the custom data type must be compatible torch.

In [66]:
dtype_map = {
    "userID": "int64",        # required by nn.Embedding → torch.long ~ 8 bytes
    "profileID": "int64",     # same ~ 8 bytes
    "rating": "float32",      # for training with MSELoss → torch.float32 ~ 4 bytes
    "gender": "category"      # for training with CrossEntropyLoss → torch.int64 ~ 8 bytes
}

In [67]:
ratings = pd.read_csv(RAW_PATH / "ratings.dat", delimiter=',', dtype=dtype_map)
gender = pd.read_csv(RAW_PATH / "gender.dat", header=None, names=["userID", "gender"], delimiter=',', dtype=dtype_map)
ratings_test = pd.read_csv(RAW_PATH / "ratings-Test.dat", delimiter=',', dtype=dtype_map)

In [68]:
ratings.info(memory_usage='deep')
print("\n")
gender.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220037 entries, 0 to 3220036
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userID     int64  
 1   profileID  int64  
 2   rating     float32
dtypes: float32(1), int64(2)
memory usage: 61.4 MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220970 entries, 0 to 220969
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   userID  220970 non-null  int64   
 1   gender  220970 non-null  category
dtypes: category(1), int64(1)
memory usage: 1.9 MB


# Save the pickle files

In [None]:
ratings.to_pickle(PROCESSED_PATH / "proc_ratings.pkl")
gender.to_pickle(PROCESSED_PATH / "proc_gender.pkl")
ratings_test.to_pickle(PROCESSED_PATH / "proc_ratings_test.pkl")