In [1]:
from functools import partial
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import polars as pl

# Source archive for the MovieLens "latest small" dataset and the URL of its
# published MD5 file.
DATASET_LOCATION = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
DATASET_HASH = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip.md5"
# Local data directory: a sibling `data` folder of the current working directory.
dst_d = Path.cwd().joinpath('..', 'data')

In [2]:
def download_lens_data(source, destination, timeout=60):
    """Download a MovieLens archive into a local directory.

    Parameters
    ----------
    source : str
        URL of the file to download. The last path component of the URL is
        used as the local file name.
    destination : str or pathlib.Path
        Directory in which the file is stored. It is created (including
        parents) when it does not exist yet.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pathlib.Path
        Path of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Raising here prevents
        returning a path to a file that was never written.
    """
    dst_dir = Path(destination)
    # Without this, open() below fails on a fresh checkout with no data folder.
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst_f = dst_dir / Path(source).name

    r = requests.get(source, timeout=timeout)
    # Fail loudly instead of silently skipping the write on an error response.
    r.raise_for_status()

    with open(dst_f, "wb") as f:
        f.write(r.content)
    return dst_f

# Fetch the archive into the local data directory; the result is the path of
# the single downloaded zip file.
downloaded_files = download_lens_data(source=DATASET_LOCATION, destination=dst_d)

def unzip_files(zipped_file,destination):
    """Extract a MovieLens archive and return the paths of its key CSV files.

    The whole archive is extracted into ``destination``. The returned paths
    are built as ``destination / <archive name without suffix> / <csv name>``;
    they are not checked for existence.

    Returns a list with the path of ``ratings.csv`` first and ``movies.csv``
    second.
    """
    # The files are expected inside a folder named after the archive itself.
    extracted_root = Path(destination) / Path(zipped_file).stem
    csv_paths = [extracted_root / csv_name for csv_name in ("ratings.csv", "movies.csv")]

    with zipfile.ZipFile(zipped_file, mode='r') as archive:
        archive.extractall(path=destination)

    return csv_paths

# Unpack the archive into the data directory and keep the paths of the two
# CSV files used below.
needed_files = unzip_files(zipped_file=downloaded_files, destination=dst_d)
# Cell result: one boolean per expected CSV, telling whether it is on disk.
list(map(os.path.exists, needed_files))

[True, True]

In [3]:
def generate_individual_rating_list(ratings_csv, movies_csv):
    """Build one table of individual ratings annotated with movie titles.

    Parameters
    ----------
    ratings_csv : str or pathlib.Path
        CSV file of ratings; its ``timestamp`` column is discarded.
    movies_csv : str or pathlib.Path
        CSV file of movies; its ``genres`` column is discarded.

    Returns
    -------
    polars.DataFrame
        The ratings joined with the movies on ``movieId`` (inner join), i.e.
        the remaining ratings columns followed by the remaining movie columns.
    """
    ratings_df = pl.read_csv(ratings_csv).drop("timestamp")
    movies_df = pl.read_csv(movies_csv).drop("genres")
    combined_df = ratings_df.join(movies_df, on="movieId", how="inner")
    # shrink_to_fit() returns the shrunken frame unless in_place=True is
    # passed, so its result has to be used rather than discarded.
    return combined_df.shrink_to_fit()



# needed_files lists the ratings CSV first and the movies CSV second.
user_ratings = generate_individual_rating_list(
    ratings_csv=needed_files[0],
    movies_csv=needed_files[1],
)
user_ratings.head()
    

userId,movieId,rating,title
i64,i64,f64,str
1,1,4.0,"""Toy Story (199…"
1,3,4.0,"""Grumpier Old M…"
1,6,4.0,"""Heat (1995)"""
1,47,5.0,"""Seven (a.k.a. …"
1,50,5.0,"""Usual Suspects…"


In [None]:
def generate_movie_encoding(length, set):
    """Encode one or more 1-based movie ids as a 0/1 vector.

    Parameters
    ----------
    length : int
        Size of the resulting vector (the largest id that can be encoded).
    set : int or iterable of int
        A single movie id or any iterable of ids (list, tuple, set, range,
        numpy array, ...). Id ``k`` switches on position ``k - 1``.
        The parameter name shadows the builtin and is kept only so existing
        keyword callers keep working.

    Returns
    -------
    list of int
        ``length`` entries, 1 at the positions of the given ids, 0 elsewhere.
        An empty iterable yields an all-zero vector.

    Raises
    ------
    IndexError
        If any id lies outside ``1..length``. Without the explicit check an
        id of 0 would turn into index -1 and silently mark the last position.
    """
    movie_ids = set
    try:
        id_list = list(movie_ids)
    except TypeError:
        # Not iterable: treat the value as a single id.
        id_list = [movie_ids]

    # Shift the 1-based ids to 0-based positions.
    positions = np.asarray(id_list, dtype=np.intp).ravel() - 1
    if positions.size and (positions.min() < 0 or positions.max() >= length):
        raise IndexError(
            "movie ids must lie between 1 and {}, got {}".format(length, id_list)
        )

    encoding = np.zeros(length, np.int8)
    encoding[positions] = 1
    return encoding.tolist()

# The encoding length is the highest movieId found in the ratings table.
largest_id = (
    user_ratings
    .select(pl.max('movieId'))
    .to_numpy()[0, 0]
)

# Encoder with the vector length pre-bound; callers only pass the movie id(s).
encode_movie = partial(generate_movie_encoding, largest_id)


In [None]:
def generate_user_encoding(user_id):
    """Return the first ``user_by_movie`` row for ``user_id`` as a plain list.

    The ``userId`` column itself is left out of the returned values.

    NOTE(review): ``user_by_movie`` is a notebook-level frame that is not
    defined in the cells visible here; it must exist before this is called.
    Indexing the first row fails when no row matches ``user_id``.
    """
    rows_for_user = user_by_movie.filter(pl.col("userId") == user_id)
    values_only = rows_for_user.drop("userId")
    first_row = values_only.to_numpy()[0]
    return first_row.tolist()
# One entry per distinct userId: the key is the id as a string, the value is
# that user's encoding. Transposing turns each entry into a row, with the
# former keys in a "userId" column that is then cast to Int16.
uid_encoding_table = (
    pl.from_dict(
        {
            str(u): generate_user_encoding(u)
            for u in user_by_movie.select(["userId"]).unique().to_dict().get('userId').to_list()
        }
    )
    .transpose(include_header=True, header_name="userId")
    .with_columns(pl.col("userId").cast(pl.Int16).alias("userId"))
)