In [2]:
import pyarrow.parquet as pq
import pandas as pd
import os
import numpy as np

In [3]:
# Notebook display configuration.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
# NOTE(review): HTML is not referenced in any cell visible here; confirm it is still needed.
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [4]:
# Show what is available in the local 'parquets' directory.
os.listdir(os.path.join('.', 'parquets'))

['movies_df', 'movie_title_df', 'netflix_df', 'users_df']

In [5]:
# Read the four source tables from parquet; the targets on the left are in
# the same order as the paths on the right.
netflix_df, movie_titles_df, movies_df, users_df = map(
    pd.read_parquet,
    (
        'parquets/netflix_df',
        'parquets/movie_title_df',
        'parquets/movies_df',
        'parquets/users_df',
    ),
)

In [6]:
# Outer-join the MovieLens movie table with the per-user table on movieId,
# keeping rows from either side that have no match.
movielens_df = movies_df.merge(users_df, how='outer', on='movieId')

In [7]:
# Inspect the merged MovieLens frame.
movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,userId,rating,tag,date,timestamp_tag
0,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,,2000-07-30,
1,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,,1996-11-08,
2,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,,2005-01-25,
3,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,,2017-11-13,
4,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,,2011-05-18,
...,...,...,...,...,...,...,...,...,...,...,...
102879,193581,5476944,432131.0,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,184,4.0,,2018-09-16,
102880,193583,5914996,445030.0,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,184,3.5,,2018-09-16,
102881,193585,6397426,479308.0,Flint,Drama,2017,184,3.5,,2018-09-16,
102882,193587,8391976,483455.0,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,184,3.5,,2018-09-16,


To work around the problem of working with millions of rows, we will collect each movie's review timestamps and ratings into dictionaries per movieId. This way, we work with fewer rows, which makes the data easier to use without having to process it in chunks:

#### Movielens dataset:

In [8]:
# Columns that together describe one review row.
review_fields = ['userId', 'rating', 'date', 'tag', 'timestamp_tag']


def _pack_review(row):
    """Return the review fields of ``row`` as a dict, or None if all are missing."""
    if not row[review_fields].notna().any():
        return None
    return {field: row[field] for field in review_fields}


# Create a new column 'review_data' holding one dictionary per row.
movielens_df['review_data'] = movielens_df.apply(_pack_review, axis=1)

In [9]:
# Count missing values per column now that 'review_data' has been added.
movielens_df.isnull().sum()

movieId              0
imdbId               0
tmdbId              13
title                0
genres               0
year                20
userId               0
rating             207
tag              99201
date               207
timestamp_tag    99201
review_data          0
dtype: int64

In [10]:
# The individual review columns are already packed into 'review_data'; drop them.
movielens_df = movielens_df.drop(
    columns=['userId', 'rating', 'date', 'tag', 'timestamp_tag']
)

In [11]:
def _index_reviews(reviews):
    """Number the non-null entries of one group; return None if none remain.

    Keys are positions within the group, so a skipped null entry leaves a gap.
    """
    indexed = {}
    for position, review in enumerate(reviews):
        if pd.notnull(review):
            indexed[position] = review
    return indexed or None


# How each column is reduced to a single value per movie.
agg_funcs = {
    'title': 'first',
    'genres': 'first',
    'year': 'first',
    'review_data': _index_reviews,
    'imdbId': 'first',
    'tmdbId': 'first',
}

# One row per movieId, with movieId restored as a regular column.
grouped_by_movie = movielens_df.groupby('movieId')
movielens_df = grouped_by_movie.agg(agg_funcs).reset_index()

In [12]:
# Inspect the per-movie frame produced by the aggregation.
movielens_df

Unnamed: 0,movieId,title,genres,year,review_data,imdbId,tmdbId
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"{0: {'userId': 1, 'rating': 4.0, 'date': 2000-...",114709,862.0
1,2,Jumanji,Adventure|Children|Fantasy,1995,"{0: {'userId': 6, 'rating': 4.0, 'date': 1996-...",113497,8844.0
2,3,Grumpier Old Men,Comedy|Romance,1995,"{0: {'userId': 1, 'rating': 4.0, 'date': 2000-...",113228,15602.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"{0: {'userId': 6, 'rating': 3.0, 'date': 1996-...",114885,31357.0
4,5,Father of the Bride Part II,Comedy,1995,"{0: {'userId': 6, 'rating': 5.0, 'date': 1996-...",113041,11862.0
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,"{0: {'userId': 184, 'rating': 4.0, 'date': 201...",5476944,432131.0
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,"{0: {'userId': 184, 'rating': 3.5, 'date': 201...",5914996,445030.0
9739,193585,Flint,Drama,2017,"{0: {'userId': 184, 'rating': 3.5, 'date': 201...",6397426,479308.0
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,"{0: {'userId': 184, 'rating': 3.5, 'date': 201...",8391976,483455.0


In [13]:
def _genres_to_dict(parts):
    """Map position -> genre for one list of genre names; None if nothing is left."""
    numbered = {}
    for position, genre in enumerate(parts):
        if pd.notnull(genre):
            numbered[position] = genre
    return numbered or None


# Split the pipe-separated genres string and store it as a position-keyed dictionary.
genre_lists = movielens_df['genres'].str.split('|')
movielens_df['genres'] = genre_lists.apply(_genres_to_dict)

In [14]:
# Re-check missing values per column, then show the frame with genres as dictionaries.
movielens_df.isnull().sum()
movielens_df

movieId         0
title           0
genres          0
year           13
review_data     0
imdbId          0
tmdbId          8
dtype: int64

Unnamed: 0,movieId,title,genres,year,review_data,imdbId,tmdbId
0,1,Toy Story,"{0: 'Adventure', 1: 'Animation', 2: 'Children'...",1995,"{0: {'userId': 1, 'rating': 4.0, 'date': 2000-...",114709,862.0
1,2,Jumanji,"{0: 'Adventure', 1: 'Children', 2: 'Fantasy'}",1995,"{0: {'userId': 6, 'rating': 4.0, 'date': 1996-...",113497,8844.0
2,3,Grumpier Old Men,"{0: 'Comedy', 1: 'Romance'}",1995,"{0: {'userId': 1, 'rating': 4.0, 'date': 2000-...",113228,15602.0
3,4,Waiting to Exhale,"{0: 'Comedy', 1: 'Drama', 2: 'Romance'}",1995,"{0: {'userId': 6, 'rating': 3.0, 'date': 1996-...",114885,31357.0
4,5,Father of the Bride Part II,{0: 'Comedy'},1995,"{0: {'userId': 6, 'rating': 5.0, 'date': 1996-...",113041,11862.0
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"{0: 'Action', 1: 'Animation', 2: 'Comedy', 3: ...",2017,"{0: {'userId': 184, 'rating': 4.0, 'date': 201...",5476944,432131.0
9738,193583,No Game No Life: Zero,"{0: 'Animation', 1: 'Comedy', 2: 'Fantasy'}",2017,"{0: {'userId': 184, 'rating': 3.5, 'date': 201...",5914996,445030.0
9739,193585,Flint,{0: 'Drama'},2017,"{0: {'userId': 184, 'rating': 3.5, 'date': 201...",6397426,479308.0
9740,193587,Bungo Stray Dogs: Dead Apple,"{0: 'Action', 1: 'Animation'}",2018,"{0: {'userId': 184, 'rating': 3.5, 'date': 201...",8391976,483455.0


#### Netflix Prize dataset:

In [15]:
# Parse the date column and keep only the calendar-date part of each value.
parsed_dates = pd.to_datetime(netflix_df['date'])
netflix_df['date'] = parsed_dates.dt.date

In [16]:
# Extract the columns as arrays once, so each movie block below is a plain
# positional slice. Slicing the arrays (instead of label-based .loc on the
# DataFrame) keeps this correct even if netflix_df does not have a default
# 0..n-1 index, because the header positions found below are positional.
user_ids = netflix_df['userId'].values
ratings = netflix_df['rating'].values
dates = netflix_df['date'].values

# A row whose userId ends with a colon is a movie header; the rows that follow,
# up to the next header, belong to that movie.
# na=False keeps missing userIds from being treated as headers.
is_header = pd.Series(user_ids).str.endswith(':', na=False)
movie_indices = np.where(is_header)[0]

# Each block stops where the next header starts; the last one runs to the end.
block_ends = list(movie_indices[1:]) + [len(user_ids)]

data = []
for header_pos, end_pos in zip(movie_indices, block_ends):
    review_rows = slice(header_pos + 1, end_pos)
    data.append({
        # Text in front of the colon is the movie id (still a string here).
        'movieId': user_ids[header_pos].split(':')[0],
        # Position-keyed dictionaries, numbered from 0 within each movie.
        'userId': dict(enumerate(user_ids[review_rows].tolist())),
        'rating': dict(enumerate(ratings[review_rows].tolist())),
        'date': dict(enumerate(dates[review_rows].tolist())),
    })

# Create DataFrame from processed data
netflix_df = pd.DataFrame(data, columns=['movieId', 'userId', 'rating', 'date'])

After this step the movieId column has dtype object; it needs to be an integer for merging with the movie titles:

In [17]:
# Cast movieId to integers so it can be used as the merge key.
netflix_df = netflix_df.astype({'movieId': int})

In [18]:
# Left-join the movie titles table on movieId to add title and year,
# keeping every row of netflix_df.
netflix_df = netflix_df.merge(movie_titles_df, how='left', on='movieId')

In [19]:
def _combine_review_dicts(user_ids, ratings, dates):
    """Merge one movie's three position-keyed dicts into a dict of review records.

    Each key of ``user_ids`` becomes a key of the result, mapped to a record
    with that review's 'userId', 'rating' and 'date'.

    Returns None when there are no reviews at all, or when every field of
    every review is None.
    """
    combined = {
        key: {
            'userId': user_ids[key],
            'rating': ratings[key],
            'date': dates[key],
        }
        for key in user_ids
    }
    # Only a literal None counts as "missing" here; NaN/NaT values are treated
    # as present. NOTE(review): confirm that is the intended notion of empty.
    has_value = any(
        value is not None
        for record in combined.values()
        for value in record.values()
    )
    return combined if has_value else None


# Build the combined dictionaries for all movies in one column assignment.
netflix_df['review_data'] = [
    _combine_review_dicts(user_ids, ratings, dates)
    for user_ids, ratings, dates in zip(
        netflix_df['userId'], netflix_df['rating'], netflix_df['date']
    )
]

In [20]:
# The per-field dictionaries are already combined into 'review_data'; drop them.
netflix_df = netflix_df.drop(columns=['userId', 'rating', 'date'])

In [21]:
# Count missing values per column in the reshaped Netflix frame.
netflix_df.isnull().sum()

movieId        0
year           0
title          0
review_data    0
dtype: int64

In [22]:
# Show the second row's review_data from each dataset to compare their structure.
netflix_df['review_data'].iloc[1]
movielens_df['review_data'].iloc[1]

{0: {'userId': '2059652', 'rating': 4.0, 'date': datetime.date(2005, 9, 5)},
 1: {'userId': '1666394', 'rating': 3.0, 'date': datetime.date(2005, 4, 19)},
 2: {'userId': '1759415', 'rating': 4.0, 'date': datetime.date(2005, 4, 22)},
 3: {'userId': '1959936', 'rating': 5.0, 'date': datetime.date(2005, 11, 21)},
 4: {'userId': '998862', 'rating': 4.0, 'date': datetime.date(2004, 11, 13)},
 5: {'userId': '2625420', 'rating': 2.0, 'date': datetime.date(2004, 12, 6)},
 6: {'userId': '573975', 'rating': 3.0, 'date': datetime.date(2005, 7, 21)},
 7: {'userId': '392722', 'rating': 4.0, 'date': datetime.date(2004, 12, 10)},
 8: {'userId': '1401650', 'rating': 4.0, 'date': datetime.date(2005, 2, 24)},
 9: {'userId': '988104', 'rating': 3.0, 'date': datetime.date(2005, 5, 23)},
 10: {'userId': '977632', 'rating': 4.0, 'date': datetime.date(2004, 11, 12)},
 11: {'userId': '2557870', 'rating': 4.0, 'date': datetime.date(2005, 3, 27)},
 12: {'userId': '1793899', 'rating': 5.0, 'date': datetime.date(

{0: {'userId': 6,
  'rating': 4.0,
  'date': datetime.date(1996, 10, 17),
  'tag': None,
  'timestamp_tag': None},
 1: {'userId': 8,
  'rating': 4.0,
  'date': datetime.date(1996, 8, 8),
  'tag': None,
  'timestamp_tag': None},
 2: {'userId': 18,
  'rating': 3.0,
  'date': datetime.date(2016, 2, 16),
  'tag': None,
  'timestamp_tag': None},
 3: {'userId': 19,
  'rating': 3.0,
  'date': datetime.date(2000, 8, 8),
  'tag': None,
  'timestamp_tag': None},
 4: {'userId': 20,
  'rating': 3.0,
  'date': datetime.date(2003, 5, 27),
  'tag': None,
  'timestamp_tag': None},
 5: {'userId': 21,
  'rating': 3.5,
  'date': datetime.date(2014, 12, 28),
  'tag': None,
  'timestamp_tag': None},
 6: {'userId': 27,
  'rating': 4.0,
  'date': datetime.date(2000, 7, 4),
  'tag': None,
  'timestamp_tag': None},
 7: {'userId': 51,
  'rating': 4.5,
  'date': datetime.date(2009, 1, 2),
  'tag': None,
  'timestamp_tag': None},
 8: {'userId': 62,
  'rating': 4.0,
  'date': datetime.date(2018, 6, 12),
  'tag': '

The dictionaries look the same after creation, so they can be concatenated.

## Finalizing and converting to parquet:

To convert to parquet, the keys of the dictionaries need to be converted to strings; otherwise the conversion to parquet does not work. Afterwards, the Netflix and MovieLens dataframes will be converted to parquet files.

In [37]:
# List the columns of both frames before writing them out.
netflix_df.columns
movielens_df.columns

Index(['movieId', 'year', 'title', 'review_data'], dtype='object')

Index(['movieId', 'title', 'genres', 'year', 'review_data', 'imdbId',
       'tmdbId'],
      dtype='object')

In [38]:
def _str_keys(mapping):
    """Copy ``mapping`` with string keys; anything that is not a dict becomes {}."""
    if not isinstance(mapping, dict):
        return {}
    return {str(key): value for key, value in mapping.items()}


def _str_keys_and_values(mapping):
    """Copy ``mapping`` with string keys and string values; non-dicts become {}.

    NOTE(review): str() on a nested review record yields its repr text, so the
    record is stored as one string rather than as a nested structure.
    """
    if not isinstance(mapping, dict):
        return {}
    return {str(key): str(value) for key, value in mapping.items()}


# conversion of keys to strings
movielens_df['genres'] = movielens_df['genres'].apply(_str_keys)
movielens_df['review_data'] = movielens_df['review_data'].apply(_str_keys_and_values)
netflix_df['review_data'] = netflix_df['review_data'].apply(_str_keys_and_values)

In [39]:
# convert to parquet
# Make sure the output folder exists before writing, so the cell also works on
# a fresh checkout; exist_ok=True makes this a no-op when it is already there.
os.makedirs('cleaned', exist_ok=True)
netflix_df.to_parquet('cleaned/netflix_parquet')
movielens_df.to_parquet('cleaned/movielens_parquet')

In [40]:
# netflix_movielens_df = pd.concat([netflix_df,movielens_df],ignore_index=True)

# # show to check afterwards if the indices are not replaced after concatenated dictionaries
# netflix_movielens_df[netflix_movielens_df['movieId']==1]

In [41]:
# def agg_concat_dicts_np(series):
#     # Convert series of dictionaries to a list of dictionaries
#     list_of_dicts = list(series)
#     # Merge dictionaries
#     merged_dict = {}
#     key_counter = 0
#     for d in list_of_dicts:
#         if d is not None:
#             for value in d.values():
#                 merged_dict[str(key_counter)] = value
#                 key_counter += 1
#     return merged_dict

# # group by 'movieId' and aggregate using the custom function
# recsys_df = netflix_movielens_df.groupby('movieId').agg({
#     'title' : 'first',
#     'year' : 'first',
#     'genres' : 'first',
#     'imdbId' : 'max',
#     'tmdbId' : 'max',
#     'review_data': agg_concat_dicts_np,
# }).reset_index()

In [42]:
# recsys_df.isnull().sum()
# recsys_df

In [43]:
# # convert integer keys into strings in the columns with dictionaries
# recsys_df['genres'] = recsys_df['genres'].apply(lambda x: {str(k): v for k, v in x.items()} if isinstance(x, dict) else {})
# recsys_df['review_data'] = recsys_df['review_data'].apply(lambda x: {str(k): str(v) for k, v in x.items()} if isinstance(x, dict) else {})

In [44]:
# # convert to parquet
# recsys_df.to_parquet('cleaned/netflix_movielens_cleaned_parquet')