In [1]:
import sys

sys.path.append("../")

In [2]:
import os

try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

from tqdm.auto import tqdm

In [3]:
import h5py
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

#### Loading data stored in HDF5 format

In [4]:
# Base URL that dataset file names are appended to when downloading.
URL_BASE = "https://github.com/benfred/recommender_data/releases/download/v1.0/"
# Directory (relative to the current working directory) where downloaded
# datasets are cached between runs.
LOCAL_CACHE_DIR = os.path.join("../", "data")

In [6]:
def download_file(url, local_filename):
    """Download ``url`` to ``local_filename`` while showing a tqdm progress bar.

    The payload is first written to a sibling ``<local_filename>.part`` file and
    only moved to its final name once the transfer completed, so an interrupted
    or failed download never leaves a truncated file at ``local_filename``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    local_filename : str
        Destination path. Missing parent directories are created.

    Returns
    -------
    tuple
        ``(local_filename, headers)`` where ``local_filename`` is the absolute
        destination path and ``headers`` is the header object returned by
        ``urlretrieve``.

    Raises
    ------
    Exception
        Whatever ``urlretrieve`` raises is propagated after the partial file
        has been removed.
    """
    local_filename = os.path.abspath(local_filename)
    path = os.path.dirname(local_filename)
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path, exist_ok=True)

    # Same directory as the target so the final os.replace stays on one filesystem.
    partial_filename = local_filename + ".part"

    with tqdm(unit="B", unit_scale=True) as progress:

        def report(chunk, chunksize, total):
            # urlretrieve passes (block number, block size, total size); the
            # total is not positive when the server did not report a size.
            downloaded = chunk * chunksize
            if total > 0:
                progress.total = total
                # The last block is usually shorter than chunksize.
                downloaded = min(downloaded, total)
            # Move the bar to the absolute position instead of adding a full
            # block per call (the hook is also invoked once for block 0).
            progress.update(downloaded - progress.n)

        try:
            _, headers = urlretrieve(url, partial_filename, reporthook=report)
        except BaseException:
            # Also covers KeyboardInterrupt: never keep a half-written file around.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
            raise

    os.replace(partial_filename, local_filename)
    return local_filename, headers

In [11]:
def get_movielens(variant="20m"):
    """Gets movielens datasets

    The dataset is read from the local cache directory; when no cached copy
    exists it is downloaded first.

    Parameters
    ----------
    variant : string
        Which version of the movielens dataset to download. Should be one of '20m', '10m',
        '1m' or '100k'.

    Returns
    -------
    movies : ndarray
        An array of the movie titles.
    ratings : csr_matrix
        A sparse matrix where the row is the movieId, the column is the userId and the value is
        the rating.

    Raises
    ------
    ValueError
        If ``variant`` is not one of the documented dataset names.
    KeyError
        If the HDF5 file does not contain the expected groups/datasets.
    """
    # Fail early with a clear message instead of a late download error.
    known_variants = ("20m", "10m", "1m", "100k")
    if variant not in known_variants:
        raise ValueError(
            f"Unknown movielens variant {variant!r}; expected one of {known_variants}"
        )

    filename = f"movielens_{variant}.hdf5"

    path = os.path.join(LOCAL_CACHE_DIR, filename)
    if not os.path.isfile(path):
        print(f"Downloading dataset to {path}")
        download_file(URL_BASE + filename, path)
    else:
        print(f"Using cached dataset at {path}")

    with h5py.File(path, "r") as f:
        # Indexing (rather than .get) raises KeyError on a missing entry
        # instead of silently handing back None.
        m = f["movie_user_ratings"]
        # The group stores the three raw CSR arrays.
        ratings = csr_matrix((m["data"], m["indices"], m["indptr"]))
        return np.array(f["movie"].asstr()[:]), ratings

In [12]:
# Load the 20M variant; the result is a (movie titles, ratings matrix) tuple.
ml_20 = get_movielens(variant="20m")

Using cached dataset at ../data\movielens_20m.hdf5


In [13]:
# Print the whole (titles, ratings) tuple to see what was loaded.
print(ml_20)

(array(['', 'Toy Story (1995)', 'Jumanji (1995)', ...,
       'Rentun Ruusu (2001)', '', 'Innocence (2014)'], dtype=object), <131263x138494 sparse matrix of type '<class 'numpy.float32'>'
	with 20000263 stored elements in Compressed Sparse Row format>)


#### Exploring movielens in CSR format

In [15]:
# Second element of the (titles, ratings) tuple: the sparse ratings matrix.
ratings = ml_20[1]

# data / indices / indptr are the three raw arrays of the CSR representation.
print(f"Number of stored non-zero elements: {ratings.size}")
print(f"Stored data: {ratings.data} (of length {len(ratings.data)})")
print(f"Column indices of stored elements: {ratings.indices}")
print(f"Cumulative counter of non-zero elements over rows: {ratings.indptr}")

Number of stored non-zero elements: 20000263
Stored data: [4.  5.  4.  ... 2.5 3.  4. ] (of lenght 20000263)
Сolumn indices of stored elements: [     3      6      8 ...  28906  65409 133047]
Cumulative counter of non-zero elements over rows: [       0        0    49695 ... 20000262 20000262 20000263]


In [24]:
# Slicing: take the top-left 10x10 corner; the slice is still a sparse matrix.

ratings_s = ratings[:10, :10]
print(f"Size: {ratings_s.size}")
ratings_s.toarray()             # dense numpy ndarray; .todense() holds the same values but returns np.matrix

Size: 13


array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 4. , 0. , 0. , 5. , 0. , 4. , 0. ],
       [0. , 3.5, 0. , 0. , 0. , 3. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 4. , 0. , 0. , 0. , 3. , 3. , 5. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 3. , 0. , 0. , 0. , 3. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 5. , 3. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]], dtype=float32)

In [30]:
# Iterating over the matrix directly walks it row by row: each item is a
# single-row sparse matrix, so every printed entry shows row index 0 and rows
# without stored values print as blank lines.

for el in ratings_s:
    print(el)


  (0, 3)	4.0
  (0, 6)	5.0
  (0, 8)	4.0
  (0, 1)	3.5
  (0, 5)	3.0
  (0, 2)	4.0
  (0, 6)	3.0
  (0, 7)	3.0
  (0, 8)	5.0


  (0, 4)	3.0
  (0, 8)	3.0
  (0, 6)	5.0
  (0, 7)	3.0




In [34]:
# Iterating (a clearer way): yield explicit (row, col, value) triples

from itertools import chain, repeat


def iter_csr(matrix):
    """Iterate over the stored entries of a CSR matrix as (row, col, value) tuples.

    Parameters
    ----------
    matrix : csr_matrix
        Sparse matrix exposing the CSR arrays ``indptr``, ``indices`` and ``data``.

    Yields
    ------
    tuple
        ``(row, col, value)`` for every stored entry, in storage order
        (row by row).
    """
    # Number of stored entries per row: difference of consecutive indptr offsets.
    row_lengths = matrix.indptr[1:] - matrix.indptr[:-1]
    # Repeat each row index once per stored entry. from_iterable consumes the
    # generator lazily instead of unpacking one repeat object per row into an
    # argument tuple up front.
    row_ids = chain.from_iterable(
        repeat(row, count) for row, count in enumerate(row_lengths)
    )
    yield from zip(row_ids, matrix.indices, matrix.data)

In [35]:
# Print every stored entry of the slice as a (row, col, value) tuple.
for el in iter_csr(ratings_s):
    print(el)

(1, 3, 4.0)
(1, 6, 5.0)
(1, 8, 4.0)
(2, 1, 3.5)
(2, 5, 3.0)
(3, 2, 4.0)
(3, 6, 3.0)
(3, 7, 3.0)
(3, 8, 5.0)
(6, 4, 3.0)
(6, 8, 3.0)
(7, 6, 5.0)
(7, 7, 3.0)


In [39]:
# Binarizing: entries with a rating strictly greater than 3 become 1, everything else 0

(ratings_s > 3).astype(int).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])