In [1]:
from functools import partial
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import polars as pl

DATASET_LOCATION='https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
DATASET_HASH='https://files.grouplens.org/datasets/movielens/ml-latest-small.zip.md5'
dst_d = Path.cwd() / '..' / 'data' 

In [2]:
def download_lens_data(source,destination):
    """Downloads movie lens data
    """
    dst_f = Path(destination) / Path(source).name
    r = requests.get(source)
    if r.ok:
        with open(dst_f,"wb") as f:
            f.write(r.content)
    return dst_f

downloaded_files = download_lens_data(DATASET_LOCATION, dst_d)

In [3]:
def unzip_files(zipped_file,destination):
    """Unzips a movie lens data set file
    """
    zip_dir = Path(zipped_file).stem
    needed_files = ["ratings.csv", "movies.csv"]
    needed_files = [ Path(zip_dir) / Path(f) for f in needed_files]
    needed_files = [ Path(destination) / f for f in needed_files ]
    
    with zipfile.ZipFile(zipped_file,'r') as z:
        z.extractall(destination)
    return needed_files

needed_files = unzip_files(downloaded_files, dst_d)
[os.path.exists(f) for f in needed_files]


[True, True]

In [None]:
def generate_user_rating_matrix(ratings_csv, movies_csv):
    combined_df = generate_individual_rating_list(ratings_csv, movies_csv)
    pivot_table = combined_df.pivot(index="userId",columns="movieId",values="rating").fill_null(0)
    del combined_df
    pivot_table.shrink_to_fit()
    return pivot_table

def generate_individual_rating_list(ratings_csv, movies_csv):
    ratings_df = pl.read_csv(ratings_csv)
    ratings_df.drop_in_place("timestamp")
    movies_df = pl.read_csv(movies_csv)
    movies_df.drop_in_place("genres")
    combined_df = ratings_df.join(movies_df, on="movieId")
    combined_df.shrink_to_fit()
    del movies_df
    del ratings_df
    return combined_df



user_by_movie = generate_user_rating_matrix(needed_files[0],needed_files[1])
movie_by_user = user_by_movie.drop('userId') \
                .transpose(include_header=True,header_name = "movieId") \
                .with_columns(pl.col("movieId") \
                .cast(pl.Int32) \
                .alias("movieId"))

user_by_movie.head()

In [53]:
from sklearn.neighbors import NearestNeighbors

user_similarity_model = NearestNeighbors(metric='cosine').fit(user_by_movie.drop('userId').to_numpy())


                    
movie_similarity_model = NearestNeighbors(metric='cosine').fit(movie_by_user.drop('movieId').to_numpy())



(array([[0.        , 0.52704923, 0.53062245, 0.53836875, 0.55042967,
         0.57298059, 0.58257046, 0.58345172, 0.58746518, 0.58981538]]),
 array([[ 30,  22, 532, 314, 659, 472,  33, 479,  37, 321]]))

In [56]:
print(user_similarity_model.kneighbors(user_by_movie[30,].drop('userId').to_numpy(),n_neighbors = 30))

(array([[0.        , 0.61850506, 0.65826204, 0.66724988, 0.66758168,
        0.67610218, 0.67687305, 0.68597053, 0.69128506, 0.69164247,
        0.70532662, 0.70712772, 0.70845024, 0.71377231, 0.72897932,
        0.74213   , 0.74959704, 0.75154782, 0.7515576 , 0.75206405,
        0.7535064 , 0.75435387, 0.76309085, 0.76735666, 0.77140117,
        0.77439913, 0.77702458, 0.78016313, 0.78286103, 0.78565483]]), array([[ 30,  70, 523, 269, 205, 213,  31, 388, 149, 275, 450, 303, 528,
        363, 371, 349, 170, 455, 384, 268, 569, 336,  77,  83, 346, 119,
        276, 313, 150, 219]]))


In [63]:
print(movie_similarity_model.kneighbors(movie_by_user[7,].drop('movieId').to_numpy(),n_neighbors=30))

(array([[0.        , 0.3305461 , 0.340173  , 0.35464099, 0.35691009,
        0.37237884, 0.37266198, 0.4016587 , 0.40257025, 0.4026831 ,
        0.40809108, 0.40957609, 0.42769957, 0.43196691, 0.43407968,
        0.43573732, 0.43931796, 0.44860449, 0.46656139, 0.46906173,
        0.47302041, 0.47342152, 0.47423354, 0.48286965, 0.48640309,
        0.48719359, 0.48843345, 0.49157234, 0.49848574, 0.49983288]]), array([[  7,  26, 478,  25,  20,  16, 472, 463,  32, 232,  33,   3,  34,
         17,  22, 121, 621, 549, 464,  42,  19, 587, 166,  28, 531, 471,
        633,  73, 322, 539]]))


In [64]:
movies_df.filter(pl.col("movieId").is_in([  7,  26, 478,  25,  20,  16, 472, 463,  32, 232,  33,   3,  34,
         17,  22, 121, 621, 549, 464,  42,  19, 587, 166,  28, 531, 471,
        633,  73, 322, 539]))

movieId,title,genres
i64,str,str
3,"""Grumpier Old M…","""Comedy|Romance…"
7,"""Sabrina (1995)…","""Comedy|Romance…"
16,"""Casino (1995)""","""Crime|Drama"""
17,"""Sense and Sens…","""Drama|Romance"""
19,"""Ace Ventura: W…","""Comedy"""
20,"""Money Train (1…","""Action|Comedy|…"
22,"""Copycat (1995)…","""Crime|Drama|Ho…"
25,"""Leaving Las Ve…","""Drama|Romance"""
26,"""Othello (1995)…","""Drama"""
28,"""Persuasion (19…","""Drama|Romance"""


In [None]:
+