In [24]:
import pyarrow.parquet as pq
import pandas as pd
import os
import numpy as np

In [25]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [26]:
os.listdir(os.path.join('.', 'parquets'))

['movies_df', 'movie_title_df', 'netflix_df', 'users_df']

In [27]:
# create dfs from parquet files
netflix_df = pd.read_parquet('parquets/netflix_df')
movie_titles_df = pd.read_parquet('parquets/movie_title_df')
movies_df = pd.read_parquet('parquets/movies_df')
users_df = pd.read_parquet('parquets/users_df')

In [28]:
# merge movies and reviews from movielens dataset
movielens_df = pd.merge(movies_df,users_df,on='movieId',how='outer')

In [29]:
movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,userId,rating,tag,timestamp_review,timestamp_tag
0,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,,2000-07-30 18:45:03,NaT
1,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,,1996-11-08 06:36:02,NaT
2,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,,2005-01-25 06:52:26,NaT
3,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,,2017-11-13 12:59:30,NaT
4,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,,2011-05-18 05:28:03,NaT
...,...,...,...,...,...,...,...,...,...,...,...
102879,193581,5476944,432131.0,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,184,4.0,,2018-09-16 14:44:42,NaT
102880,193583,5914996,445030.0,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,184,3.5,,2018-09-16 14:52:25,NaT
102881,193585,6397426,479308.0,Flint,Drama,2017,184,3.5,,2018-09-16 14:56:45,NaT
102882,193587,8391976,483455.0,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,184,3.5,,2018-09-16 15:00:21,NaT


To work around the problem of working with millions of rows, we will append the movie review timestamps and ratings into lists per movieid. This way, we can work with less rows which will increase the ease of use with the data without taking chunks:

#### Movielens dataset:

In [30]:
# Define aggregation functions
agg_funcs = {
    'imdbId': 'first',
    'tmdbId': 'first',
    'title': 'first',
    'genres': 'first',
    'year': 'first',
    'userId': 'first',
    'tag': 'first',
    'rating': lambda x: list(x),
    'timestamp_review': lambda x: list(x),
    'timestamp_tag': lambda x: list(x)
}

# Group by 'movieId' and aggregate
movielens_df = movielens_df.groupby('movieId').agg(agg_funcs).reset_index()

In [31]:
movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,userId,tag,rating,timestamp_review,timestamp_tag
0,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,pixar,"[4.0, 4.0, 4.5, 2.5, 4.5, 3.5, 4.0, 3.5, 3.0, ...","[2000-07-30 18:45:03, 1996-11-08 06:36:02, 200...","[NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, ..."
1,2,113497,8844.0,Jumanji,Adventure|Children|Fantasy,1995,6,fantasy,"[4.0, 4.0, 3.0, 3.0, 3.0, 3.5, 4.0, 4.5, 4.0, ...","[1996-10-17 11:58:42, 1996-08-08 00:23:26, 201...","[NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, 2018-..."
2,3,113228,15602.0,Grumpier Old Men,Comedy|Romance,1995,1,moldy,"[4.0, 5.0, 3.0, 3.0, 4.0, 5.0, 3.0, 4.0, 3.0, ...","[2000-07-30 18:20:47, 1996-10-17 12:11:36, 200...","[NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, ..."
3,4,114885,31357.0,Waiting to Exhale,Comedy|Drama|Romance,1995,6,,"[3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.5]","[1996-10-17 12:12:29, 1996-06-22 11:07:33, 199...","[NaT, NaT, NaT, NaT, NaT, NaT, NaT]"
4,5,113041,11862.0,Father of the Bride Part II,Comedy,1995,6,pregnancy,"[5.0, 3.0, 5.0, 3.0, 4.0, 4.0, 2.0, 3.0, 4.0, ...","[1996-10-17 12:05:38, 1996-12-13 08:44:02, 199...","[NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, ..."
...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,5476944,432131.0,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,184,,[4.0],[2018-09-16 14:44:42],[NaT]
9738,193583,5914996,445030.0,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,184,,[3.5],[2018-09-16 14:52:25],[NaT]
9739,193585,6397426,479308.0,Flint,Drama,2017,184,,[3.5],[2018-09-16 14:56:45],[NaT]
9740,193587,8391976,483455.0,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,184,,[3.5],[2018-09-16 15:00:21],[NaT]


In [32]:
# perfrom the same for genres column
movielens_df['genres'] = movielens_df['genres'].str.split('|').apply(lambda x: list(x))

#### Netflix Prize dataset:

In [33]:
# convert DataFrame columns to NumPy arrays to increase computing performance
user_ids = np.array(netflix_df['userId'])
ratings = np.array(netflix_df['rating'])
dates = np.array(netflix_df['date'])

data = []
current_movie_id = None

# convert NumPy arrays to Pandas Series
user_ids_series = pd.Series(user_ids)

# find movieIds based on them ending with a colon, strip anything but the last indice off, append the rest of the data in tuples to order everything per movieid
movie_indices = np.where(user_ids_series.str.endswith(':'))[0]
for i, idx in enumerate(movie_indices):
    if i == len(movie_indices) - 1:
        next_idx = len(netflix_df)
    else:
        next_idx = movie_indices[i+1]
    current_movie_id = user_ids[idx].split(':')[0]
    data.append((current_movie_id, user_ids[idx+1:next_idx], ratings[idx+1:next_idx], dates[idx+1:next_idx]))

# create DataFrame from processed data
netflix_df = pd.DataFrame(data, columns=['movieId', 'userId', 'rating', 'date'])

Doing this the id column will be an object, needs to be integer for merging with movie titles:

In [34]:
netflix_df
netflix_df.dtypes

Unnamed: 0,movieId,userId,rating,date
0,1,"[1488844, 822109, 885013, 30878, 823519, 89398...","[3.0, 5.0, 4.0, 4.0, 3.0, 3.0, 4.0, 3.0, 4.0, ...","[2005-09-06, 2005-05-13, 2005-10-19, 2005-12-2..."
1,2,"[2059652, 1666394, 1759415, 1959936, 998862, 2...","[4.0, 3.0, 4.0, 5.0, 4.0, 2.0, 3.0, 4.0, 4.0, ...","[2005-09-05, 2005-04-19, 2005-04-22, 2005-11-2..."
2,3,"[1025579, 712664, 1331154, 2632461, 44937, 656...","[4.0, 5.0, 4.0, 3.0, 5.0, 4.0, 1.0, 3.0, 3.0, ...","[2003-03-29, 2004-02-01, 2004-07-03, 2005-07-2..."
3,4,"[1065039, 1544320, 410199, 732556, 1201419, 66...","[3.0, 1.0, 5.0, 3.0, 1.0, 1.0, 2.0, 4.0, 4.0, ...","[2005-09-06, 2004-06-28, 2004-10-16, 2005-10-2..."
4,5,"[1745265, 885013, 1997470, 30878, 840543, 2477...","[5.0, 5.0, 5.0, 1.0, 4.0, 5.0, 3.0, 4.0, 5.0, ...","[2005-02-08, 2005-05-15, 2005-05-30, 2005-04-2..."
...,...,...,...,...
17765,17766,"[1729814, 888595, 1388502, 781833, 1214416, 26...","[4.0, 3.0, 2.0, 4.0, 3.0, 1.0, 3.0, 4.0, 3.0, ...","[2004-04-22, 2005-12-07, 2003-08-07, 2005-01-1..."
17766,17767,"[1428688, 656399, 1356914, 1526449, 191481, 13...","[3.0, 3.0, 4.0, 4.0, 3.0, 4.0, 4.0, 2.0, 3.0, ...","[2005-08-09, 2005-08-19, 2005-05-27, 2005-10-2..."
17767,17768,"[745445, 1018579, 400164, 2628220, 1319997, 12...","[3.0, 2.0, 1.0, 2.0, 3.0, 5.0, 1.0, 3.0, 3.0, ...","[2004-11-27, 2005-07-28, 2003-06-02, 2003-02-2..."
17768,17769,"[1844276, 2289614, 2541216, 426397, 1607938, 1...","[1.0, 2.0, 5.0, 2.0, 1.0, 4.0, 4.0, 3.0, 2.0, ...","[2004-07-16, 2004-06-06, 2005-07-31, 2004-07-3..."


movieId    object
userId     object
rating     object
date       object
dtype: object

In [35]:
netflix_df['movieId'] = netflix_df['movieId'].astype(int)

In [36]:
# merge with movietitles to add title and year
netflix_df = pd.merge(netflix_df,movie_titles_df,on='movieId',how='left')
netflix_df

Unnamed: 0,movieId,userId,rating,date,year,title
0,1,"[1488844, 822109, 885013, 30878, 823519, 89398...","[3.0, 5.0, 4.0, 4.0, 3.0, 3.0, 4.0, 3.0, 4.0, ...","[2005-09-06, 2005-05-13, 2005-10-19, 2005-12-2...",2003,Dinosaur Planet
1,2,"[2059652, 1666394, 1759415, 1959936, 998862, 2...","[4.0, 3.0, 4.0, 5.0, 4.0, 2.0, 3.0, 4.0, 4.0, ...","[2005-09-05, 2005-04-19, 2005-04-22, 2005-11-2...",2004,Isle of Man TT 2004 Review
2,3,"[1025579, 712664, 1331154, 2632461, 44937, 656...","[4.0, 5.0, 4.0, 3.0, 5.0, 4.0, 1.0, 3.0, 3.0, ...","[2003-03-29, 2004-02-01, 2004-07-03, 2005-07-2...",1997,Character
3,4,"[1065039, 1544320, 410199, 732556, 1201419, 66...","[3.0, 1.0, 5.0, 3.0, 1.0, 1.0, 2.0, 4.0, 4.0, ...","[2005-09-06, 2004-06-28, 2004-10-16, 2005-10-2...",1994,Paula Abdul's Get Up & Dance
4,5,"[1745265, 885013, 1997470, 30878, 840543, 2477...","[5.0, 5.0, 5.0, 1.0, 4.0, 5.0, 3.0, 4.0, 5.0, ...","[2005-02-08, 2005-05-15, 2005-05-30, 2005-04-2...",2004,The Rise and Fall of ECW
...,...,...,...,...,...,...
17765,17766,"[1729814, 888595, 1388502, 781833, 1214416, 26...","[4.0, 3.0, 2.0, 4.0, 3.0, 1.0, 3.0, 4.0, 3.0, ...","[2004-04-22, 2005-12-07, 2003-08-07, 2005-01-1...",2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,"[1428688, 656399, 1356914, 1526449, 191481, 13...","[3.0, 3.0, 4.0, 4.0, 3.0, 4.0, 4.0, 2.0, 3.0, ...","[2005-08-09, 2005-08-19, 2005-05-27, 2005-10-2...",2004,Fidel Castro: American Experience
17767,17768,"[745445, 1018579, 400164, 2628220, 1319997, 12...","[3.0, 2.0, 1.0, 2.0, 3.0, 5.0, 1.0, 3.0, 3.0, ...","[2004-11-27, 2005-07-28, 2003-06-02, 2003-02-2...",2000,Epoch
17768,17769,"[1844276, 2289614, 2541216, 426397, 1607938, 1...","[1.0, 2.0, 5.0, 2.0, 1.0, 4.0, 4.0, 3.0, 2.0, ...","[2004-07-16, 2004-06-06, 2005-07-31, 2004-07-3...",2003,The Company


## Joining Netflix and Movielens data:

Now, I will outer join Netflix and Movielens by movieId, year and title. Ratings and timestamps for reviews will be appended in each list:

In [37]:
netflix_df.columns
movielens_df.columns

Index(['movieId', 'userId', 'rating', 'date', 'year', 'title'], dtype='object')

Index(['movieId', 'imdbId', 'tmdbId', 'title', 'genres', 'year', 'userId',
       'tag', 'rating', 'timestamp_review', 'timestamp_tag'],
      dtype='object')

In [38]:
# # Merge dataframes on 'movieId', 'year', and 'title' columns
# merged_df = pd.merge(netflix_df, movielens_df, on=['movieId', 'year', 'title'], how='inner')

# # Create separate lists for userId and rating for each dataset
# netflix_userId_rating = list(zip(netflix_df['userId'], netflix_df['rating']))
# movielens_userId_rating = list(zip(movielens_df['userId'], movielens_df['rating']))

# # Concatenate lists for 'userId' and 'rating' columns
# merged_df['userId_rating'] = merged_df.apply(lambda row: [(net_id, mov_id) for net_id, mov_id in zip(row['userId_x'], row['userId_y'])], axis=1)
# merged_df['rating'] = merged_df.apply(lambda row: [(net_rating, mov_rating) for net_rating, mov_rating in zip(row['rating_x'], row['rating_y'])], axis=1)

# # Drop unnecessary columns
# merged_df.drop(columns=['userId_x', 'userId_y', 'rating_x', 'rating_y'], inplace=True)

In [40]:
# # Merge dataframes on 'movieId', 'year', and 'title' columns
# merged_df = pd.merge(netflix_df, movielens_df, on=['movieId', 'year', 'title'], how='outer')

# # Concatenate lists for 'timestamp_review' and 'date' columns
# merged_df['timestamp_review'] = merged_df.apply(lambda row: [row['timestamp_review'], row['date']], axis=1)

# # Drop the 'date' column as it's no longer needed
# merged_df.drop(columns=['date'], inplace=True)

# # Concatenate lists for 'userId' and 'rating' columns
# merged_df['userId'] = merged_df.apply(lambda row: [row['userId_x'], row['userId_y']], axis=1)
# merged_df['rating'] = merged_df.apply(lambda row: [row['rating_x'], row['rating_y']], axis=1)

# # Drop unnecessary columns
# merged_df.drop(columns=['userId_x', 'userId_y', 'rating_x', 'rating_y'], inplace=True)

# # Optionally, you can reset the index
# merged_df.reset_index(drop=True, inplace=True)