In [1]:
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import polars as pl

from sklearn.neighbors import NearestNeighbors

# Where the MovieLens "latest small" archive is published.
DATASET_LOCATION = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# Published MD5 for the archive.
# NOTE(review): not referenced by any cell visible here -- the download is not verified.
DATASET_HASH = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip.md5'
# Local data directory: a sibling `data` folder next to the notebook's working directory.
dst_d = Path.cwd().joinpath('..', 'data')

In [2]:
def download_lens_data(source, destination):
    """Download a MovieLens archive into a local directory.

    Args:
        source: URL of the file to fetch; its final path component becomes
            the local file name.
        destination: Directory to store the file in. It is created (including
            parents) when it does not exist yet.

    Returns:
        Path of the local file (``destination / <name of source>``).

    Raises:
        requests.HTTPError: The server answered with an error status and no
            previously downloaded copy exists at the target path.
        requests.RequestException: The request itself failed (connection
            error, timeout, ...).
    """
    dst_dir = Path(destination)
    # Without this, open() below fails on a fresh checkout where the data
    # directory has not been created yet.
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst_f = dst_dir / Path(source).name

    # requests applies no timeout by default; bound the wait so a stalled
    # server cannot hang the notebook forever.
    r = requests.get(source, timeout=60)
    if not r.ok:
        if dst_f.exists():
            # Keep the original tolerance: fall back to an earlier download.
            return dst_f
        # No usable file on disk -- fail here instead of returning a path
        # that does not exist and crashing later while unzipping.
        r.raise_for_status()

    with open(dst_f, "wb") as f:
        f.write(r.content)
    return dst_f

# Fetch the archive into the data directory and remember where it landed.
downloaded_files = download_lens_data(source=DATASET_LOCATION, destination=dst_d)

In [None]:
def unzip_files(zipped_file, destination, needed=("ratings.csv", "movies.csv")):
    """Extract a MovieLens archive and return the paths of the files of interest.

    The whole archive is extracted into ``destination``. The returned paths
    are built as ``destination / <archive stem> / <file name>``, i.e. the
    archive is expected to contain a top-level folder named after the zip
    file itself. The paths are not checked for existence.

    Args:
        zipped_file: Path to the ``.zip`` archive.
        destination: Directory to extract into.
        needed: File names (relative to the archive's top-level folder) whose
            extracted paths should be returned. Defaults to the ratings and
            movies tables.

    Returns:
        list[Path]: One path per entry in ``needed``, in the same order.
    """
    archive = Path(zipped_file)
    extracted_root = Path(destination) / archive.stem

    with zipfile.ZipFile(archive, 'r') as z:
        z.extractall(destination)

    return [extracted_root / name for name in needed]

# Extract the archive, then show whether each expected CSV is really on disk.
needed_files = unzip_files(zipped_file=downloaded_files, destination=dst_d)
list(map(os.path.exists, needed_files))


In [None]:
def generate_user_rating_matrix(ratings_csv, movies_csv):
    """Build the wide user-by-movie rating table.

    Loads the joined rating list via ``generate_individual_rating_list`` and
    pivots it so that each row is one ``userId`` and each further column is
    one ``movieId`` holding that user's rating. Missing ratings are filled
    with 0.

    Args:
        ratings_csv: Path to the ratings CSV.
        movies_csv: Path to the movies CSV.

    Returns:
        The pivoted polars DataFrame.
    """
    combined_df = generate_individual_rating_list(ratings_csv, movies_csv)
    # NOTE(review): polars >= 1.0 renames the `columns` argument to `on`;
    # switch once the pinned polars version is confirmed.
    pivot_table = combined_df.pivot(
        index="userId", columns="movieId", values="rating"
    ).fill_null(0)
    # Release the long-format frame before compacting the wide one.
    del combined_df
    # shrink_to_fit returns a new frame unless in_place=True; the previous
    # bare call threw its result away and therefore did nothing.
    pivot_table.shrink_to_fit(in_place=True)
    return pivot_table

def generate_individual_rating_list(ratings_csv, movies_csv):
    """Load ratings and movies and join them into one long table.

    The ``timestamp`` column of the ratings file and the ``genres`` column of
    the movies file are dropped, then the two tables are joined on
    ``movieId`` using polars' default join strategy.

    Args:
        ratings_csv: Path to the ratings CSV (must contain ``timestamp`` and
            ``movieId``).
        movies_csv: Path to the movies CSV (must contain ``genres`` and
            ``movieId``).

    Returns:
        The joined polars DataFrame, one row per rating.
    """
    ratings_df = pl.read_csv(ratings_csv).drop("timestamp")
    movies_df = pl.read_csv(movies_csv).drop("genres")
    combined_df = ratings_df.join(movies_df, on="movieId")
    # shrink_to_fit only compacts the frame itself when in_place=True;
    # otherwise it returns a new frame, which the old code discarded.
    combined_df.shrink_to_fit(in_place=True)
    # The source frames are locals and are released when the function returns.
    return combined_df



# Wide table: one row per user, one column per movie (plus the userId column).
user_by_movie = generate_user_rating_matrix(needed_files[0], needed_files[1])

# Same ratings flipped: one row per movie. The former column headers become
# a `movieId` column, which is cast from text back to an integer.
movie_by_user = (
    user_by_movie
    .drop('userId')
    .transpose(include_header=True, header_name="movieId")
    .with_columns(pl.col("movieId").cast(pl.Int32).alias("movieId"))
)

user_by_movie.shape, movie_by_user.shape

In [5]:


# Cosine nearest-neighbour index over users. The id column is removed before
# fitting, so the model only knows rows by their position in the matrix.
user_similarity_model = NearestNeighbors(metric='cosine').fit(
    user_by_movie.drop('userId').to_numpy()
)

# Same idea over movies, fitted on the transposed matrix without `movieId`.
movie_similarity_model = NearestNeighbors(metric='cosine').fit(
    movie_by_user.drop('movieId').to_numpy()
)



In [None]:
# Query the user model with the ratings at row position 30 and print the
# (distances, indices) pair for its 30 nearest rows.
print(
    user_similarity_model.kneighbors(
        user_by_movie[30,].drop('userId').to_numpy(),
        n_neighbors=30,
    )
)

In [None]:
# Query the movie model with the ratings at row position 7 and print the
# (distances, indices) pair for its 30 nearest rows.
print(
    movie_similarity_model.kneighbors(
        movie_by_user[7,].drop('movieId').to_numpy(),
        n_neighbors=30,
    )
)

In [None]:
# NOTE(review): this list looks like the index array from the row-7 movie
# kneighbors query (30 entries, starting with 7 itself) -- confirm.
# kneighbors reports row positions in the matrix the model was fitted on
# (movie_by_user without its movieId column), not movieId values, so the
# matching rows are selected by position instead of filtering on movieId.
# Selecting by position also keeps the nearest-first order of the list.
neighbor_rows = [  7,  26, 478,  25,  20,  16, 472, 463,  32, 232,  33,   3,  34,
         17,  22, 121, 621, 549, 464,  42,  19, 587, 166,  28, 531, 471,
        633,  73, 322, 539]
movie_by_user[neighbor_rows]

In [None]:
# Peek at the first movie row: its movieId followed by the quoted list of
# ratings. head(1) restricts the conversion to that single row; calling
# rows() on the full frame built a Python tuple for every movie just to
# print one and break.
for r in movie_by_user.head(1).rows():
    print (r[0], f"'{list(r[1:])}'")

In [35]:
# Round-trip the movie matrix through a Parquet file named `test` in the
# current working directory.
movie_by_user.write_parquet(file='test')

# Load it back so the stored copy can be inspected.
x = pl.read_parquet(source='test')

In [None]:
# Display the first five rows of the frame read back from Parquet.
x.head(n=5)