In [2]:
import pyarrow.parquet as pq
import pandas as pd
import os
import numpy as np

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [4]:
# list the datasets stored in the local 'parquets' folder
parquet_dir = os.path.join('.', 'parquets')
os.listdir(parquet_dir)

['movies_df', 'movie_title_df', 'netflix_df', 'users_df']

In [5]:
# load every parquet dataset into its own DataFrame
# (paths are read in the same order as the names on the left-hand side)
netflix_df, movie_titles_df, movies_df, users_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'parquets/netflix_df',
        'parquets/movie_title_df',
        'parquets/movies_df',
        'parquets/users_df',
    )
)

In [6]:
# MovieLens: outer-join the movie metadata with the per-user rows on movieId,
# so rows that exist on only one side are kept as well
movielens_df = movies_df.merge(users_df, on='movieId', how='outer')

In [7]:
# preview the merged MovieLens frame (one row per movie/user combination)
movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,userId,rating,tag,timestamp_review,timestamp_tag
0,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,,2000-07-30,
1,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,,1996-11-08,
2,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,,2005-01-25,
3,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,,2017-11-13,
4,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,,2011-05-18,
...,...,...,...,...,...,...,...,...,...,...,...
102879,193581,5476944,432131.0,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,184,4.0,,2018-09-16,
102880,193583,5914996,445030.0,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,184,3.5,,2018-09-16,
102881,193585,6397426,479308.0,Flint,Drama,2017,184,3.5,,2018-09-16,
102882,193587,8391976,483455.0,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,184,3.5,,2018-09-16,


To avoid having to work with millions of rows, we collect the movie review timestamps and ratings into dictionaries per movieId. This leaves far fewer rows, which makes the data easier to work with without having to process it in chunks:

#### Movielens dataset:

In [8]:
def _non_null_by_position(values):
    """Map every non-null value of a group to its position inside that group.

    Positions are counted over all values (nulls included), so the same key
    refers to the same source row across the different packed columns.
    Returns None when the group holds no non-null value at all.
    """
    kept = {}
    for position, value in enumerate(values):
        if pd.notnull(value):
            kept[position] = value
    return kept or None


# descriptive columns keep their first value per movie;
# per-review / per-tag columns are packed into one dict per movie
agg_funcs = {
    'title': 'first',
    'genres': 'first',
    'year': 'first',
    'userId': _non_null_by_position,
    'tag': _non_null_by_position,
    'rating': _non_null_by_position,
    'timestamp_review': _non_null_by_position,
    'timestamp_tag': _non_null_by_position,
    'imdbId': 'first',
    'tmdbId': 'first',
}

# collapse the frame to one row per movieId
movielens_grouped = movielens_df.groupby('movieId')
movielens_df = movielens_grouped.agg(agg_funcs).reset_index()

In [9]:
# turn the pipe-separated genres string into a {genre: True} lookup per movie
genre_lists = movielens_df['genres'].str.split('|')
movielens_df['genres'] = genre_lists.apply(lambda names: dict.fromkeys(names, True))

In [10]:
# preview the aggregated MovieLens frame (one row per movieId)
movielens_df

Unnamed: 0,movieId,title,genres,year,userId,tag,rating,timestamp_review,timestamp_tag,imdbId,tmdbId
0,1,Toy Story,"{'Adventure': True, 'Animation': True, 'Childr...",1995,"{0: 1, 1: 5, 2: 7, 3: 15, 4: 17, 5: 18, 6: 19,...","{121: 'pixar', 164: 'pixar', 193: 'fun'}","{0: 4.0, 1: 4.0, 2: 4.5, 3: 2.5, 4: 4.5, 5: 3....","{0: 2000-07-30, 1: 1996-11-08, 2: 2005-01-25, ...","{121: 2006-02-04, 164: 2006-01-14, 193: 2018-0...",114709,862.0
1,2,Jumanji,"{'Adventure': True, 'Children': True, 'Fantasy...",1995,"{0: 6, 1: 8, 2: 18, 3: 19, 4: 20, 5: 21, 6: 27...","{8: 'fantasy', 9: 'magic board game', 10: 'Rob...","{0: 4.0, 1: 4.0, 2: 3.0, 3: 3.0, 4: 3.0, 5: 3....","{0: 1996-10-17, 1: 1996-08-08, 2: 2016-02-16, ...","{8: 2018-06-12, 9: 2018-06-12, 10: 2018-06-12,...",113497,8844.0
2,3,Grumpier Old Men,"{'Comedy': True, 'Romance': True}",1995,"{0: 1, 1: 6, 2: 19, 3: 32, 4: 42, 5: 43, 6: 44...","{26: 'moldy', 27: 'old'}","{0: 4.0, 1: 5.0, 2: 3.0, 3: 3.0, 4: 4.0, 5: 5....","{0: 2000-07-30, 1: 1996-10-17, 2: 2000-08-08, ...","{26: 2006-03-27, 27: 2006-03-27}",113228,15602.0
3,4,Waiting to Exhale,"{'Comedy': True, 'Drama': True, 'Romance': True}",1995,"{0: 6, 1: 14, 2: 84, 3: 162, 4: 262, 5: 411, 6...",,"{0: 3.0, 1: 3.0, 2: 3.0, 3: 3.0, 4: 1.0, 5: 2....","{0: 1996-10-17, 1: 1996-06-22, 2: 1997-03-19, ...",,114885,31357.0
4,5,Father of the Bride Part II,{'Comedy': True},1995,"{0: 6, 1: 31, 2: 43, 3: 45, 4: 58, 5: 66, 6: 6...","{37: 'pregnancy', 38: 'remake'}","{0: 5.0, 1: 3.0, 2: 5.0, 3: 3.0, 4: 4.0, 5: 4....","{0: 1996-10-17, 1: 1996-12-13, 2: 1996-11-26, ...","{37: 2006-01-16, 38: 2006-01-16}",113041,11862.0
...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"{'Action': True, 'Animation': True, 'Comedy': ...",2017,{0: 184},,{0: 4.0},{0: 2018-09-16},,5476944,432131.0
9738,193583,No Game No Life: Zero,"{'Animation': True, 'Comedy': True, 'Fantasy':...",2017,{0: 184},,{0: 3.5},{0: 2018-09-16},,5914996,445030.0
9739,193585,Flint,{'Drama': True},2017,{0: 184},,{0: 3.5},{0: 2018-09-16},,6397426,479308.0
9740,193587,Bungo Stray Dogs: Dead Apple,"{'Action': True, 'Animation': True}",2018,{0: 184},,{0: 3.5},{0: 2018-09-16},,8391976,483455.0


#### Netflix Prize dataset:

In [11]:
# Regroup the flat Netflix rows into one record per movie.
# Header rows carry the movie id in the userId column as '<movieId>:'; every row
# up to the next header belongs to that movie.
user_col = netflix_df['userId']
rating_col = netflix_df['rating']
date_col = netflix_df['date']

# np.where yields *positions*, so all slicing below is positional (.iloc) and does
# not depend on the frame's index labels. na=False keeps a missing userId from
# being mistaken for a header row.
movie_indices = np.where(user_col.str.endswith(':', na=False))[0]
# a movie's rows end where the next header starts; the last movie runs to the end of the frame
block_ends = list(movie_indices[1:]) + [len(netflix_df)]

data = []
for header_pos, end_pos in zip(movie_indices, block_ends):
    rows = slice(header_pos + 1, end_pos)  # skip the header row itself
    data.append({
        'movieId': user_col.iloc[header_pos].split(':')[0],  # text before the colon
        # each column becomes {position within the movie: value}
        'userId': dict(enumerate(user_col.iloc[rows].tolist())),
        'rating': dict(enumerate(rating_col.iloc[rows].tolist())),
        'date': dict(enumerate(date_col.iloc[rows].tolist())),
    })

# Create DataFrame from processed data
netflix_df = pd.DataFrame(data, columns=['movieId', 'userId', 'rating', 'date'])

After this step the movieId column has dtype object; it needs to be an integer for merging with the movie titles:

In [12]:
# preview the regrouped Netflix frame and check the column dtypes
netflix_df
netflix_df.dtypes

Unnamed: 0,movieId,userId,rating,date
0,1,"{0: '1488844', 1: '822109', 2: '885013', 3: '3...","{0: 3.0, 1: 5.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 3....","{0: '2005-09-06', 1: '2005-05-13', 2: '2005-10..."
1,2,"{0: '2059652', 1: '1666394', 2: '1759415', 3: ...","{0: 4.0, 1: 3.0, 2: 4.0, 3: 5.0, 4: 4.0, 5: 2....","{0: '2005-09-05', 1: '2005-04-19', 2: '2005-04..."
2,3,"{0: '1025579', 1: '712664', 2: '1331154', 3: '...","{0: 4.0, 1: 5.0, 2: 4.0, 3: 3.0, 4: 5.0, 5: 4....","{0: '2003-03-29', 1: '2004-02-01', 2: '2004-07..."
3,4,"{0: '1065039', 1: '1544320', 2: '410199', 3: '...","{0: 3.0, 1: 1.0, 2: 5.0, 3: 3.0, 4: 1.0, 5: 1....","{0: '2005-09-06', 1: '2004-06-28', 2: '2004-10..."
4,5,"{0: '1745265', 1: '885013', 2: '1997470', 3: '...","{0: 5.0, 1: 5.0, 2: 5.0, 3: 1.0, 4: 4.0, 5: 5....","{0: '2005-02-08', 1: '2005-05-15', 2: '2005-05..."
...,...,...,...,...
17765,17766,"{0: '1729814', 1: '888595', 2: '1388502', 3: '...","{0: 4.0, 1: 3.0, 2: 2.0, 3: 4.0, 4: 3.0, 5: 1....","{0: '2004-04-22', 1: '2005-12-07', 2: '2003-08..."
17766,17767,"{0: '1428688', 1: '656399', 2: '1356914', 3: '...","{0: 3.0, 1: 3.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 4....","{0: '2005-08-09', 1: '2005-08-19', 2: '2005-05..."
17767,17768,"{0: '745445', 1: '1018579', 2: '400164', 3: '2...","{0: 3.0, 1: 2.0, 2: 1.0, 3: 2.0, 4: 3.0, 5: 5....","{0: '2004-11-27', 1: '2005-07-28', 2: '2003-06..."
17768,17769,"{0: '1844276', 1: '2289614', 2: '2541216', 3: ...","{0: 1.0, 1: 2.0, 2: 5.0, 3: 2.0, 4: 1.0, 5: 4....","{0: '2004-07-16', 1: '2004-06-06', 2: '2005-07..."


movieId    object
userId     object
rating     object
date       object
dtype: object

In [13]:
# cast movieId to int so it can be used as the merge key with the movie titles
netflix_df = netflix_df.astype({'movieId': int})

In [14]:
# left-join the movie titles so every Netflix movie gets its title and year
netflix_df = netflix_df.merge(movie_titles_df, on='movieId', how='left')
netflix_df

Unnamed: 0,movieId,userId,rating,date,year,title
0,1,"{0: '1488844', 1: '822109', 2: '885013', 3: '3...","{0: 3.0, 1: 5.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 3....","{0: '2005-09-06', 1: '2005-05-13', 2: '2005-10...",2003,Dinosaur Planet
1,2,"{0: '2059652', 1: '1666394', 2: '1759415', 3: ...","{0: 4.0, 1: 3.0, 2: 4.0, 3: 5.0, 4: 4.0, 5: 2....","{0: '2005-09-05', 1: '2005-04-19', 2: '2005-04...",2004,Isle of Man TT 2004 Review
2,3,"{0: '1025579', 1: '712664', 2: '1331154', 3: '...","{0: 4.0, 1: 5.0, 2: 4.0, 3: 3.0, 4: 5.0, 5: 4....","{0: '2003-03-29', 1: '2004-02-01', 2: '2004-07...",1997,Character
3,4,"{0: '1065039', 1: '1544320', 2: '410199', 3: '...","{0: 3.0, 1: 1.0, 2: 5.0, 3: 3.0, 4: 1.0, 5: 1....","{0: '2005-09-06', 1: '2004-06-28', 2: '2004-10...",1994,Paula Abdul's Get Up & Dance
4,5,"{0: '1745265', 1: '885013', 2: '1997470', 3: '...","{0: 5.0, 1: 5.0, 2: 5.0, 3: 1.0, 4: 4.0, 5: 5....","{0: '2005-02-08', 1: '2005-05-15', 2: '2005-05...",2004,The Rise and Fall of ECW
...,...,...,...,...,...,...
17765,17766,"{0: '1729814', 1: '888595', 2: '1388502', 3: '...","{0: 4.0, 1: 3.0, 2: 2.0, 3: 4.0, 4: 3.0, 5: 1....","{0: '2004-04-22', 1: '2005-12-07', 2: '2003-08...",2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,"{0: '1428688', 1: '656399', 2: '1356914', 3: '...","{0: 3.0, 1: 3.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 4....","{0: '2005-08-09', 1: '2005-08-19', 2: '2005-05...",2004,Fidel Castro: American Experience
17767,17768,"{0: '745445', 1: '1018579', 2: '400164', 3: '2...","{0: 3.0, 1: 2.0, 2: 1.0, 3: 2.0, 4: 3.0, 5: 5....","{0: '2004-11-27', 1: '2005-07-28', 2: '2003-06...",2000,Epoch
17768,17769,"{0: '1844276', 1: '2289614', 2: '2541216', 3: ...","{0: 1.0, 1: 2.0, 2: 5.0, 3: 2.0, 4: 1.0, 5: 4....","{0: '2004-07-16', 1: '2004-06-06', 2: '2005-07...",2003,The Company


## Joining Netflix and Movielens data:

Now I will outer-join the Netflix and MovieLens data on movieId, year and title. The ratings and review timestamps from both sources will be appended to each movie's collections:

In [15]:
# show both column sets as they are *before* the rename below
netflix_df.columns
movielens_df.columns
# give the MovieLens review timestamp the same column name as the Netflix one
review_date_rename = {'timestamp_review': 'date'}
movielens_df = movielens_df.rename(columns=review_date_rename)

Index(['movieId', 'userId', 'rating', 'date', 'year', 'title'], dtype='object')

Index(['movieId', 'title', 'genres', 'year', 'userId', 'tag', 'rating',
       'timestamp_review', 'timestamp_tag', 'imdbId', 'tmdbId'],
      dtype='object')

The Netflix and MovieLens dataframes now have the same structure in terms of columns, apart from some MovieLens extras about scores, tags and the timestamp of each tag. Both dataframes will be concatenated and then grouped by movieId, so that each movie's userId, rating and date collections are extended with the entries from the MovieLens dataframe:

In [16]:
# stack the Netflix rows on top of the MovieLens rows and renumber the index
netflix_movielens_df = pd.concat([netflix_df, movielens_df], ignore_index=True)

# before-picture for one movieId: used afterwards to check that dict entries
# are not overwritten when the dictionaries are merged
is_movie_one = netflix_movielens_df['movieId'].eq(1)
netflix_movielens_df.loc[is_movie_one]

Unnamed: 0,movieId,userId,rating,date,year,title,genres,tag,timestamp_tag,imdbId,tmdbId
0,1,"{0: '1488844', 1: '822109', 2: '885013', 3: '3...","{0: 3.0, 1: 5.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 3....","{0: '2005-09-06', 1: '2005-05-13', 2: '2005-10...",2003,Dinosaur Planet,,,,,
17770,1,"{0: 1, 1: 5, 2: 7, 3: 15, 4: 17, 5: 18, 6: 19,...","{0: 4.0, 1: 4.0, 2: 4.5, 3: 2.5, 4: 4.5, 5: 3....","{0: 2000-07-30, 1: 1996-11-08, 2: 2005-01-25, ...",1995,Toy Story,"{'Adventure': True, 'Animation': True, 'Childr...","{121: 'pixar', 164: 'pixar', 193: 'fun'}","{121: 2006-02-04, 164: 2006-01-14, 193: 2018-0...",114709.0,862.0


In [17]:
def agg_concat_dicts_np(series):
    """Merge the dictionaries found in one group into a single dictionary.

    Every key is rewritten as ``'<key>_<row position>'`` (row position = position
    of the source dict within ``series``), so entries coming from different rows
    never overwrite each other.

    Parameters
    ----------
    series : iterable
        The cell values of one group: dicts, or missing markers (None / NaN).

    Returns
    -------
    dict
        All entries of all dicts in ``series`` under their suffixed keys;
        empty when no cell holds a dict.
    """
    merged_dict = {}
    for idx, d in enumerate(series):
        # A missing cell may be None or a float NaN; neither has .items(),
        # so only real dicts are merged and everything else is skipped.
        if not isinstance(d, dict):
            continue
        for key, value in d.items():
            # suffix with the row position to keep keys unique across rows
            merged_dict[f'{key}_{idx}'] = value
    return merged_dict

# Collapse the stacked frame to one row per movieId: descriptive columns keep a
# single value, the per-review dict columns are merged with the custom function.
# NOTE(review): the recorded run of this cell ended in a MemoryError.
# NOTE(review): in the movieId == 1 preview the Netflix row is 'Dinosaur Planet'
# and the MovieLens row is 'Toy Story', so grouping on movieId alone appears to
# merge unrelated films -- confirm the intended join key.
merge_spec = {
    'title': 'first',
    'year': 'first',
    'genres': 'first',
    'imdbId': 'max',
    'tmdbId': 'max',
    **dict.fromkeys(['userId', 'rating', 'date'], agg_concat_dicts_np),
    'tag': 'first',
    'timestamp_tag': 'first',
}
movies_grouped = netflix_movielens_df.groupby('movieId')
recsys_df = movies_grouped.agg(merge_spec).reset_index()

MemoryError: 

In [None]:
# count the missing values per column, then preview the combined frame
recsys_df.isnull().sum()
recsys_df

movieId              0
userId               0
rating              18
date                18
year                13
title                0
genres           17770
tag              25940
timestamp_tag    25940
imdbId           17770
tmdbId           17778
dtype: int64

Unnamed: 0,movieId,userId,rating,date,year,title,genres,tag,timestamp_tag,imdbId,tmdbId
0,1,"{0: '1488844', 1: '822109', 2: '885013', 3: '3...","{0: 3.0, 1: 5.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 3....","{0: '2005-09-06', 1: '2005-05-13', 2: '2005-10...",2003,Dinosaur Planet,,,,,
1,2,"{0: '2059652', 1: '1666394', 2: '1759415', 3: ...","{0: 4.0, 1: 3.0, 2: 4.0, 3: 5.0, 4: 4.0, 5: 2....","{0: '2005-09-05', 1: '2005-04-19', 2: '2005-04...",2004,Isle of Man TT 2004 Review,,,,,
2,3,"{0: '1025579', 1: '712664', 2: '1331154', 3: '...","{0: 4.0, 1: 5.0, 2: 4.0, 3: 3.0, 4: 5.0, 5: 4....","{0: '2003-03-29', 1: '2004-02-01', 2: '2004-07...",1997,Character,,,,,
3,4,"{0: '1065039', 1: '1544320', 2: '410199', 3: '...","{0: 3.0, 1: 1.0, 2: 5.0, 3: 3.0, 4: 1.0, 5: 1....","{0: '2005-09-06', 1: '2004-06-28', 2: '2004-10...",1994,Paula Abdul's Get Up & Dance,,,,,
4,5,"{0: '1745265', 1: '885013', 2: '1997470', 3: '...","{0: 5.0, 1: 5.0, 2: 5.0, 3: 1.0, 4: 4.0, 5: 5....","{0: '2005-02-08', 1: '2005-05-15', 2: '2005-05...",2004,The Rise and Fall of ECW,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
27507,193581,{0: 184},{0: 4.0},{0: 2018-09-16 14:44:42},2017,Black Butler: Book of the Atlantic,"{'Action': True, 'Animation': True, 'Comedy': ...",,,5476944.0,432131.0
27508,193583,{0: 184},{0: 3.5},{0: 2018-09-16 14:52:25},2017,No Game No Life: Zero,"{'Animation': True, 'Comedy': True, 'Fantasy':...",,,5914996.0,445030.0
27509,193585,{0: 184},{0: 3.5},{0: 2018-09-16 14:56:45},2017,Flint,{'Drama': True},,,6397426.0,479308.0
27510,193587,{0: 184},{0: 3.5},{0: 2018-09-16 15:00:21},2018,Bungo Stray Dogs: Dead Apple,"{'Action': True, 'Animation': True}",,,8391976.0,483455.0
