In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import datetime

In [4]:
import logging
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [5]:
RAW_DATA_FOLDER = Path("../data/raw/")
PROCESSED_DATA_FOLDER = Path("../data/processed/")

spain_sales_raw = RAW_DATA_FOLDER / "spanish_sales.csv"

In [6]:
sales_df = pd.read_csv(spain_sales_raw, parse_dates=True,dtype={"customer_id": int, "product_id": int})

In order to have a uniformed way with rating let us add a dummy data raw `ratings`

In [7]:
sales_df['rating'] = 1.0
sales_df.date = pd.to_datetime(sales_df.date)

In [8]:
sales_df.rename(columns={'customer_id':'uid',
                          'product_id':'mid',
                          'date':'timestamp'}, 
                 inplace=True)

In [9]:
sales_df.head()

Unnamed: 0,uid,mid,timestamp,rating
0,9134527,386652,2019-05-06,1.0
1,9134527,386652,2019-05-06,1.0
2,9134527,464466,2019-05-06,1.0
3,9134527,55815,2019-05-06,1.0
4,9134527,55815,2019-05-06,1.0


In [10]:
sales_df.dtypes

uid                   int64
mid                   int64
timestamp    datetime64[ns]
rating              float64
dtype: object

In [11]:
def load_data(data_dir, MIN_RATINGS=4):
    ml_dir = data_dir
    sales_df = pd.read_csv(spain_sales_raw, parse_dates=True,dtype={"customer_id": int, "product_id": int})
    sales_df['ratings'] = 1.0
    sales_df.date = pd.to_datetime(sales_df.date)
    sales_df.rename(columns={'customer_id':'uid',
                          'product_id':'mid',
                          'date':'timestamp'}, 
                 inplace=True)
    df = sales_df.copy()

    # first let us filter out the users with less than MIN_RATINGS interations
    logger.info(
        "Filtering out users with less than {} ratings".format(MIN_RATINGS))
    grouped = df.groupby('uid')
    df = grouped.filter(lambda x: len(x) >= MIN_RATINGS).copy()

    # now let us factoriyze (re-index users)
    logger.info("Mapping original user and item IDs to new sequential IDs")
    df['userId'] = pd.factorize(df['uid'])[0]
    df['itemId'] = pd.factorize(df['mid'])[0]

    logger.info('Range of userId is [{}, {}]'.format(
        df.userId.min(), df.userId.max()))
    logger.info('Range of itemId is [{}, {}]'.format(
        df.userId.min(), df.itemId.max()))

    num_users = len(df['userId'].unique())
    num_items = len(df['itemId'].unique())
    logger.info("num_users is {}, num_items is {}".format(
        num_users, num_items))

    return df, num_users, num_items


In [12]:
data, num_users, num_items = load_data(spain_sales_raw)

INFO:__main__:Filtering out users with less than 4 ratings
INFO:__main__:Mapping original user and item IDs to new sequential IDs
INFO:__main__:Range of userId is [0, 204100]
INFO:__main__:Range of itemId is [0, 6112]
INFO:__main__:num_users is 204101, num_items is 6113


In [13]:
data.shape

(3299776, 6)

In [27]:
item_pool = set(data['itemId'].unique())
item_pool_np = np.array(data['itemId'].unique())

In [40]:
len(item_pool)

6113

In [34]:
import random

In [35]:
def sample_negatives_low_mem(ratings):
    """return all negative items & 100 sampled negative items"""
    logger.info("sampling negatives with low mem")
    # raitings = dd.from_pandas(ratings,npartitions=4)

    def sample_negatives_per_user(x):
        set_interacted = set(x)
        set_non_interacted = item_pool.difference(set_interacted)
        if len(set_interacted) < len(item_pool):
            print(set_interacted)
        return random.sample(set_interacted, 99)

    interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
        columns={'itemId': 'interacted_items'})
    # interact_status['negative_items'] = interact_status['interacted_items'].apply(
    #     lambda x: self.item_pool - x)
    interact_status['negative_samples'] = interact_status['interacted_items'].apply(sample_negatives_per_user)
    return interact_status[['userId', 'negative_samples']]

In [36]:
sample_negatives_low_mem(data)

INFO:__main__:sampling negatives with low mem


ValueError: Sample larger than population or is negative

In [14]:
data['rank_latest'] = data.groupby(['userId'])['timestamp'].rank(
    method='first', ascending=False)

In [15]:
from tqdm import tqdm

In [16]:
tqdm.pandas()

In [17]:
interact_status = data.groupby('userId')['itemId'].apply(np.unique).reset_index().rename(
    columns={'itemId': 'interacted_items'})

In [19]:
item_pool_np

array([   0,    1,    2, ..., 6110, 6111, 6112])

In [20]:
interact_status_np = interact_status.to_numpy()

In [21]:
interact_status_np

array([[0,
        array([   0,    1,    2,    3,  131,  319,  389,  551,  703, 1189, 1330,
       1547, 2030, 3083, 3636, 3741, 3822, 4062, 4240, 4426, 4836, 5027,
       5387])],
       [1, array([  4, 254, 280, 372, 934])],
       [2,
        array([   5,    6,    7,    8,   64,   75,   77,   80,   84,  131,  140,
        172,  294,  345,  441,  450,  514,  649,  665,  698,  765,  777,
        892, 1037, 1189, 1253, 1324, 1341, 1345, 1351, 1384, 1870, 1892,
       2022, 2435, 2524, 2958, 2996, 3333, 3575, 3672, 3923, 3957, 4059,
       4894, 5059, 5242, 5377, 5636])],
       ...,
       [204098, array([  49,  605,  696, 1321, 1826, 2279, 2718])],
       [204099, array([ 152, 1180, 1955, 5274])],
       [204100, array([ 282, 1247, 2710, 3871, 4262])]], dtype=object)

In [22]:
item_pool_np

array([   0,    1,    2, ..., 6110, 6111, 6112])

In [24]:
#!!!!!!!
np.repeat(item_pool_np[np.newaxis, :], interact_status_np.shape[0], axis=0)

array([[   0,    1,    2, ..., 6110, 6111, 6112],
       [   0,    1,    2, ..., 6110, 6111, 6112],
       [   0,    1,    2, ..., 6110, 6111, 6112],
       ...,
       [   0,    1,    2, ..., 6110, 6111, 6112],
       [   0,    1,    2, ..., 6110, 6111, 6112],
       [   0,    1,    2, ..., 6110, 6111, 6112]])

In [25]:
np.setdiff1d(interact_status_np[:,2], np.repeat(item_pool_np[np.newaxis, :], interact_status_np.shape[0], axis=0))

IndexError: index 2 is out of bounds for axis 1 with size 2

In [None]:
# blows out of 32 GB
np.hstack((interact_status_np, np.repeat(item_pool_np[np.newaxis, :], interact_status_np.shape[0], axis=0)))