In [1]:
from functools import partial
import os
from pathlib import Path
import requests
import zipfile

import numpy as np
import polars as pl

from sklearn.neighbors import NearestNeighbors

DATASET_LOCATION='https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
DATASET_HASH='https://files.grouplens.org/datasets/movielens/ml-latest-small.zip.md5'
dst_d = Path.cwd() / '..' / 'data' 

In [2]:
def download_lens_data(source,destination):
    """Downloads movie lens data
    """
    dst_f = Path(destination) / Path(source).name
    r = requests.get(source)
    if r.ok:
        with open(dst_f,"wb") as f:
            f.write(r.content)
    return dst_f

downloaded_files = download_lens_data(DATASET_LOCATION, dst_d)

In [3]:
def unzip_files(zipped_file,destination):
    """Unzips a movie lens data set file
    """
    zip_dir = Path(zipped_file).stem
    needed_files = ["ratings.csv", "movies.csv"]
    needed_files = [ Path(zip_dir) / Path(f) for f in needed_files]
    needed_files = [ Path(destination) / f for f in needed_files ]
    
    with zipfile.ZipFile(zipped_file,'r') as z:
        z.extractall(destination)
    return needed_files

needed_files = unzip_files(downloaded_files, dst_d)
[os.path.exists(f) for f in needed_files]


[True, True]

In [4]:
def generate_user_rating_matrix(ratings_csv, movies_csv):
    combined_df = generate_individual_rating_list(ratings_csv, movies_csv)
    pivot_table = combined_df.pivot(index="userId",columns="movieId",values="rating").fill_null(0)
    del combined_df
    pivot_table.shrink_to_fit()
    return pivot_table

def generate_individual_rating_list(ratings_csv, movies_csv):
    ratings_df = pl.read_csv(ratings_csv)
    ratings_df.drop_in_place("timestamp")
    movies_df = pl.read_csv(movies_csv)
    movies_df.drop_in_place("genres")
    combined_df = ratings_df.join(movies_df, on="movieId")
    combined_df.shrink_to_fit()
    del movies_df
    del ratings_df
    return combined_df



user_by_movie = generate_user_rating_matrix(needed_files[0],needed_files[1])
movie_by_user = user_by_movie.drop('userId') \
                .transpose(include_header=True,header_name = "movieId") \
                .with_columns(pl.col("movieId") \
                .cast(pl.Int32) \
                .alias("movieId"))

user_by_movie.head()

userId,1,3,6,47,50,70,101,110,151,157,163,216,223,231,235,260,296,316,333,349,356,362,367,423,441,457,480,500,527,543,552,553,590,592,593,596,…,114044,114670,114707,115727,117867,118082,128838,129313,130050,130052,130840,133832,135534,135803,135815,138610,138632,139511,139655,140267,141400,141799,142366,142598,145951,146309,147657,147662,148166,149011,152372,158721,160341,160527,160836,163937,163981
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,3.0,5.0,4.0,5.0,3.0,3.0,5.0,4.0,4.0,5.0,4.0,3.0,4.0,5.0,4.0,3.0,5.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,5.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,3.0,0.0,5.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:


user_similarity_model = NearestNeighbors(metric='cosine').fit(user_by_movie.drop('userId').to_numpy())


                    
movie_similarity_model = NearestNeighbors(metric='cosine').fit(movie_by_user.drop('movieId').to_numpy())



In [6]:
print(user_similarity_model.kneighbors(user_by_movie[30,].drop('userId').to_numpy(),n_neighbors = 30))

(array([[0.        , 0.61850506, 0.65826204, 0.66724988, 0.66758168,
        0.67610218, 0.67687305, 0.68597053, 0.69128506, 0.69164247,
        0.70532662, 0.70712772, 0.70845024, 0.71377231, 0.72897932,
        0.74213   , 0.74959704, 0.75154782, 0.7515576 , 0.75206405,
        0.7535064 , 0.75435387, 0.76309085, 0.76735666, 0.77140117,
        0.77439913, 0.77702458, 0.78016313, 0.78286103, 0.78565483]]), array([[ 30,  70, 523, 269, 205, 213,  31, 388, 149, 275, 450, 303, 528,
        363, 371, 349, 170, 455, 384, 268, 569, 336,  77,  83, 346, 119,
        276, 313, 150, 219]]))


In [7]:
print(movie_similarity_model.kneighbors(movie_by_user[7,].drop('movieId').to_numpy(),n_neighbors=30))

(array([[0.        , 0.3305461 , 0.340173  , 0.35464099, 0.35691009,
        0.37237884, 0.37266198, 0.4016587 , 0.40257025, 0.4026831 ,
        0.40809108, 0.40957609, 0.42769957, 0.43196691, 0.43407968,
        0.43573732, 0.43931796, 0.44860449, 0.46656139, 0.46906173,
        0.47302041, 0.47342152, 0.47423354, 0.48286965, 0.48640309,
        0.48719359, 0.48843345, 0.49157234, 0.49848574, 0.49983288]]), array([[  7,  26, 478,  25,  20,  16, 472, 463,  32, 232,  33,   3,  34,
         17,  22, 121, 621, 549, 464,  42,  19, 587, 166,  28, 531, 471,
        633,  73, 322, 539]]))


In [10]:
movie_by_user.filter(pl.col("movieId").is_in([  7,  26, 478,  25,  20,  16, 472, 463,  32, 232,  33,   3,  34,
         17,  22, 121, 621, 549, 464,  42,  19, 587, 166,  28, 531, 471,
        633,  73, 322, 539]))

movieId,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,…,column_573,column_574,column_575,column_576,column_577,column_578,column_579,column_580,column_581,column_582,column_583,column_584,column_585,column_586,column_587,column_588,column_589,column_590,column_591,column_592,column_593,column_594,column_595,column_596,column_597,column_598,column_599,column_600,column_601,column_602,column_603,column_604,column_605,column_606,column_607,column_608,column_609
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
32,0.0,0.0,0.0,2.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,3.5,3.5,0.0,0.0,0.0,3.5,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,3.0,4.5,0.0,3.0,3.0,4.0,0.0,4.0,0.0,3.5,0.0,4.5
232,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
539,0.0,0.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.5,4.0,0.0,0.0,0.0,0.0,5.0,4.0,5.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0
34,0.0,0.0,0.0,0.0,4.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,4.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,3.0,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,2.0,0.0,1.0,4.0,4.0,0.0,0.0,3.0,3.5,0.0,0.0
531,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.5,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.5,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,1.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.5,3.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,4.5,0.0,4.5
17,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,3.5,3.5,0.0,0.0,3.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0
