In [1]:
import pyarrow.parquet as pq
import pandas as pd
import os
import numpy as np

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [3]:
os.listdir(os.path.join('.', 'parquets'))

['movies_df', 'movie_title_df', 'netflix_df', 'users_df']

To work around the problem of working with millions of rows, we will group the dataset by movieId. In order to do so, ratings, timestamps and userId per review of each movie need to be appended into dictionaries. This way, each review is unique and easy to count. Also it will reduce memory usage and allow us to import more movieIds into the training data, as one movie can have multiple reviews. Furthermore, data will also be easily accessible.

For both datasets, the approach will be the same: 

- A new column will be created which combines the information of a review
- The dataset will be grouped by movieId, aggregating each dictionary into a list

#### Movielens dataset:

In [4]:
# create dfs from parquet files
netflix_df = pd.read_parquet('parquets/netflix_df')
movie_titles_df = pd.read_parquet('parquets/movie_title_df')
movies_df = pd.read_parquet('parquets/movies_df')
users_df = pd.read_parquet('parquets/users_df')

In [5]:
# merge movies and reviews from movielens dataset
movielens_df = pd.merge(movies_df,users_df,on='movieId',how='outer')

In [6]:
movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,userId,rating,tag,date,timestamp_tag
0,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,,2008-11-03,
1,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,2,5.0,,1996-06-26,
2,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.0,,2000-11-18,
3,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,10,3.0,,2015-05-03,
4,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,12,5.0,,1997-05-01,
...,...,...,...,...,...,...,...,...,...,...,...
35827120,288967,14418234,845861.0,State of Siege: Temple Attack,Action|Drama,2021,47791,3.5,,2023-07-19,
35827121,288971,11162178,878958.0,Ouija Japan,Action|Horror,2021,98408,0.5,,2023-07-19,
35827122,288975,70199,150392.0,The Men Who Made the Movies: Howard Hawks,Documentary,1973,154483,4.0,,2023-07-20,
35827123,288977,23050520,1102551.0,Skinford: Death Sentence,Crime|Thriller,2023,291389,3.0,,2023-07-20,


In [7]:
movielens_df.isnull().sum()

movieId                 0
imdbId                  0
tmdbId               5327
title                   0
genres                  0
year                42704
userId                  0
rating             599023
tag              33498827
date               599023
timestamp_tag    33498810
dtype: int64

In [8]:
# take fraction of 2 million rows, just like netflix dataset in the import dataset notebook
movielens_df = movielens_df.sample(frac=4000000 / len(movielens_df), random_state=42)

In [9]:
# drop tag and timestamp of tag, as I will not use them in my models.
movielens_df = movielens_df.drop(['tag','timestamp_tag'],axis=1)

In [10]:
movielens_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,userId,rating,date
12039056,1912,120780,1389.0,Out of Sight,Comedy|Crime|Drama|Romance|Thriller,1998,77662,5.0,2000-10-28
35517870,219994,10272386,600354.0,The Father,Drama,2020,203660,4.5,2023-05-02
14350758,2424,128853,9489.0,You've Got Mail,Comedy|Romance,1998,6244,2.0,2000-03-06
3842029,436,109456,2124.0,Color of Night,Drama|Thriller,1994,52700,4.0,2005-06-26
10801322,1569,119738,8874.0,My Best Friend's Wedding,Comedy|Romance,1997,21324,3.0,2017-02-27
...,...,...,...,...,...,...,...,...,...
17738970,3448,93105,801.0,"Good Morning, Vietnam",Comedy|Drama|War,1987,311636,5.0,2001-02-02
5907129,750,57012,935.0,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,1964,194179,3.0,2009-03-06
4236395,491,107501,10502.0,"Man Without a Face, The",Drama,1993,114775,3.5,2004-01-24
27497949,52712,435670,9785.0,"Invisible, The",Crime|Drama|Fantasy|Mystery|Thriller,2007,4763,3.5,2017-04-17


In [11]:
# convert the needed values to NumPy arrays
user_ids = movielens_df['userId'].values
ratings = movielens_df['rating'].values
dates = movielens_df['date'].values

# set up an empty list to store dictionaries
review_data = []

# iterate over the arrays simultaneously to create dictionaries
for user_id, rating, date in zip(user_ids, ratings, dates):
    if not any(pd.isna([user_id, rating, date])):
        review_data.append({'userId': user_id, 'rating': rating, 'date': date})
    else:
        review_data.append(None)

# Add the list of dictionaries as a new column to the DataFrame
movielens_df['review_data'] = review_data

In [12]:
movielens_df.isnull().sum()

movieId            0
imdbId             0
tmdbId           594
title              0
genres             0
year            4858
userId             0
rating         67038
date           67038
review_data    67038
dtype: int64

In [13]:
# drop redundant columns as they are in the dictionary in the review_data column
movielens_df = movielens_df.drop(['userId','rating','date'],axis=1)

In [14]:
# define aggregation functions
agg_funcs = {
    'title': 'first',
    'genres': 'first',
    'year': 'first',
    'review_data': lambda x: [
        {"userId": val['userId'], "rating": val['rating'], "date": val['date']}
        for val in x if pd.notnull(val)
    ] or None,
    'imdbId': 'first',
    'tmdbId': 'first',
}

# group by 'movieId' and aggregate
movielens_df = movielens_df.groupby('movieId').agg(agg_funcs).reset_index()

In [15]:
# Create a new column to store the length of review_data in each row
movielens_df['num_reviews'] = 0

# Loop through each row and calculate the length of review_data
for i in range(len(movielens_df.head(5))):
    print(f"Row {i}: {movielens_df['review_data'].iloc[i]}")
    if isinstance(movielens_df['review_data'].iloc[i], list):
        print(f"Row {i}: review_data is a list")
        movielens_df.at[i, 'num_reviews'] = len(movielens_df['review_data'].iloc[i])
    else:
        print(f"Row {i}: review_data is not a list")
        movielens_df.at[i, 'num_reviews'] = 0

# Check the resulting DataFrame
print(movielens_df['num_reviews'])

Row 0: [{'userId': 220678, 'rating': 5.0, 'date': datetime.date(1996, 11, 9)}, {'userId': 315970, 'rating': 5.0, 'date': datetime.date(2007, 9, 19)}, {'userId': 326312, 'rating': 5.0, 'date': datetime.date(1996, 12, 12)}, {'userId': 317349, 'rating': 4.5, 'date': datetime.date(2015, 10, 19)}, {'userId': 314506, 'rating': 5.0, 'date': datetime.date(2008, 1, 20)}, {'userId': 303922, 'rating': 3.5, 'date': datetime.date(2006, 12, 29)}, {'userId': 8970, 'rating': 5.0, 'date': datetime.date(1997, 5, 2)}, {'userId': 314275, 'rating': 2.0, 'date': datetime.date(2017, 7, 1)}, {'userId': 47366, 'rating': 4.5, 'date': datetime.date(2008, 7, 3)}, {'userId': 61339, 'rating': 3.0, 'date': datetime.date(2005, 2, 16)}, {'userId': 177180, 'rating': 5.0, 'date': datetime.date(1999, 12, 11)}, {'userId': 111947, 'rating': 3.0, 'date': datetime.date(2007, 1, 29)}, {'userId': 323324, 'rating': 4.0, 'date': datetime.date(2016, 3, 24)}, {'userId': 24418, 'rating': 3.5, 'date': datetime.date(2015, 10, 1)}, {'

In [16]:
movielens_df

Unnamed: 0,movieId,title,genres,year,review_data,imdbId,tmdbId,num_reviews
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"[{'userId': 220678, 'rating': 5.0, 'date': 199...",114709,862.0,8668
1,2,Jumanji,Adventure|Children|Fantasy,1995,"[{'userId': 319226, 'rating': 3.5, 'date': 201...",113497,8844.0,3427
2,3,Grumpier Old Men,Comedy|Romance,1995,"[{'userId': 146396, 'rating': 4.0, 'date': 199...",113228,15602.0,1773
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"[{'userId': 146001, 'rating': 3.0, 'date': 199...",114885,31357.0,356
4,5,Father of the Bride Part II,Comedy,1995,"[{'userId': 285075, 'rating': 3.0, 'date': 200...",113041,11862.0,1687
...,...,...,...,...,...,...,...,...
50624,288941,Mixed Baggage,Comedy|Drama|Romance,2023,"[{'userId': 7644, 'rating': 5.0, 'date': 2023-...",21411840,1033258.0,0
50625,288949,Eldorado: Everything the Nazis Hate,Documentary,2023,"[{'userId': 327439, 'rating': 0.5, 'date': 202...",27906298,1133922.0,0
50626,288955,Agata's Friends,Drama,2016,"[{'userId': 308174, 'rating': 2.0, 'date': 202...",4594936,338553.0,0
50627,288965,Камертон,Romance,1979,"[{'userId': 167321, 'rating': 2.5, 'date': 202...",3337244,388606.0,0


I chose to modify the genres column to tuples, as genres per movie will not change, and it is more memory efficient:

In [17]:
# convert genres column to a tuple for efficient memory usage
movielens_df['genres'] = movielens_df['genres'].str.replace('|', ',').apply(lambda x: tuple(x.split(',')) if isinstance(x, str) else None)

In [18]:
movielens_df.isnull().sum()
movielens_df

movieId           0
title             0
genres            0
year            245
review_data    4708
imdbId            0
tmdbId           62
num_reviews       0
dtype: int64

Unnamed: 0,movieId,title,genres,year,review_data,imdbId,tmdbId,num_reviews
0,1,Toy Story,"(Adventure, Animation, Children, Comedy, Fantasy)",1995,"[{'userId': 220678, 'rating': 5.0, 'date': 199...",114709,862.0,8668
1,2,Jumanji,"(Adventure, Children, Fantasy)",1995,"[{'userId': 319226, 'rating': 3.5, 'date': 201...",113497,8844.0,3427
2,3,Grumpier Old Men,"(Comedy, Romance)",1995,"[{'userId': 146396, 'rating': 4.0, 'date': 199...",113228,15602.0,1773
3,4,Waiting to Exhale,"(Comedy, Drama, Romance)",1995,"[{'userId': 146001, 'rating': 3.0, 'date': 199...",114885,31357.0,356
4,5,Father of the Bride Part II,"(Comedy,)",1995,"[{'userId': 285075, 'rating': 3.0, 'date': 200...",113041,11862.0,1687
...,...,...,...,...,...,...,...,...
50624,288941,Mixed Baggage,"(Comedy, Drama, Romance)",2023,"[{'userId': 7644, 'rating': 5.0, 'date': 2023-...",21411840,1033258.0,0
50625,288949,Eldorado: Everything the Nazis Hate,"(Documentary,)",2023,"[{'userId': 327439, 'rating': 0.5, 'date': 202...",27906298,1133922.0,0
50626,288955,Agata's Friends,"(Drama,)",2016,"[{'userId': 308174, 'rating': 2.0, 'date': 202...",4594936,338553.0,0
50627,288965,Камертон,"(Romance,)",1979,"[{'userId': 167321, 'rating': 2.5, 'date': 202...",3337244,388606.0,0


#### Netflix Prize dataset:

In [19]:
netflix_df

Unnamed: 0,userId,rating,date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
100498272,1790158,4.0,2005-11-01
100498273,1608708,3.0,2005-07-19
100498274,234275,1.0,2004-08-07
100498275,255278,4.0,2004-05-28


In [20]:
# take fraction of 10 million rows, just like netflix dataset in the import dataset notebook
netflix_df = netflix_df.head(10000000)
netflix_df

Unnamed: 0,userId,rating,date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
9999995,2242949,3.0,2005-04-06
9999996,1582098,5.0,2005-04-07
9999997,161721,4.0,2005-04-08
9999998,473462,3.0,2005-04-08


In [21]:
netflix_df['date'] = (pd.to_datetime(netflix_df['date'])).dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflix_df['date'] = (pd.to_datetime(netflix_df['date'])).dt.date


In [22]:
netflix_df.isnull().sum()
netflix_df
netflix_df[netflix_df['rating'].isnull()]
print("None values belong to movieids as rows of userid column end with colon. Therefore, they do not need to be handled.")

userId       0
rating    1962
date      1962
dtype: int64

Unnamed: 0,userId,rating,date
0,1:,,NaT
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
9999995,2242949,3.0,2005-04-06
9999996,1582098,5.0,2005-04-07
9999997,161721,4.0,2005-04-08
9999998,473462,3.0,2005-04-08


Unnamed: 0,userId,rating,date
0,1:,,NaT
548,2:,,NaT
694,3:,,NaT
2707,4:,,NaT
2850,5:,,NaT
...,...,...,...
9981398,1958:,,NaT
9981497,1959:,,NaT
9987285,1960:,,NaT
9987403,1961:,,NaT


None values belong to movieids as rows of userid column end with colon. Therefore, they do not need to be handled.


The aggregation approach in the netflix file works slightly different, because in the Netflix data the movieIds were in the userId column. The syntax below will split the movieIds from that column and create a new one. Then the dataset will be grouped per movieId and each part of the review will be appended in a dictionary first. So, we first will get a dataframe where the review parts are not joined yet.

In [23]:
# Extracting arrays directly from DataFrame
user_ids = netflix_df['userId'].values
ratings = netflix_df['rating'].values
dates = netflix_df['date'].values

data = []
current_movie_id = None

# Convert user_ids to a pandas Series to use the .str accessor
user_ids_series = pd.Series(user_ids)

# Find movieIds based on them ending with a colon, strip it off the colon and take that part, 
# and append the rest of the data in tuples to order everything per movieid
movie_indices = np.where(user_ids_series.str.endswith(':'))[0]
for i, idx in enumerate(movie_indices):
    # checks if it's not the last index. If that checks, then next_idx is set to the index of the next movie. Otherwise, it's set to the length of the DataFrame. 
    if i == len(movie_indices) - 1:
        next_idx = len(netflix_df)
    # sets the index to the id of the next movieId, assuming that the ids are growing ascendingly
    else:
        next_idx = movie_indices[i+1]
    current_movie_id = user_ids[idx].split(':')[0]
    # create corresponding dictionaries for the data that has been found
    # the data is extracted by slicing the dataframe between the current index (idx+1) and next_idx-1.
    user_data = {idx: val for idx, val in enumerate(netflix_df.loc[idx+1:next_idx-1, 'userId'].tolist())}
    rating_data = {idx: val for idx, val in enumerate(netflix_df.loc[idx+1:next_idx-1, 'rating'].tolist())}
    date_data = {idx: val for idx, val in enumerate(netflix_df.loc[idx+1:next_idx-1, 'date'].tolist())}
    data.append({
        'movieId': current_movie_id,
        'userId': user_data,
        'rating': rating_data,
        'date': date_data
    })

# Create DataFrame from processed data
netflix_df = pd.DataFrame(data, columns=['movieId', 'userId', 'rating', 'date'])

In [24]:
netflix_df

Unnamed: 0,movieId,userId,rating,date
0,1,"{0: '1488844', 1: '822109', 2: '885013', 3: '3...","{0: 3.0, 1: 5.0, 2: 4.0, 3: 4.0, 4: 3.0, 5: 3....","{0: 2005-09-06, 1: 2005-05-13, 2: 2005-10-19, ..."
1,2,"{0: '2059652', 1: '1666394', 2: '1759415', 3: ...","{0: 4.0, 1: 3.0, 2: 4.0, 3: 5.0, 4: 4.0, 5: 2....","{0: 2005-09-05, 1: 2005-04-19, 2: 2005-04-22, ..."
2,3,"{0: '1025579', 1: '712664', 2: '1331154', 3: '...","{0: 4.0, 1: 5.0, 2: 4.0, 3: 3.0, 4: 5.0, 5: 4....","{0: 2003-03-29, 1: 2004-02-01, 2: 2004-07-03, ..."
3,4,"{0: '1065039', 1: '1544320', 2: '410199', 3: '...","{0: 3.0, 1: 1.0, 2: 5.0, 3: 3.0, 4: 1.0, 5: 1....","{0: 2005-09-06, 1: 2004-06-28, 2: 2004-10-16, ..."
4,5,"{0: '1745265', 1: '885013', 2: '1997470', 3: '...","{0: 5.0, 1: 5.0, 2: 5.0, 3: 1.0, 4: 4.0, 5: 5....","{0: 2005-02-08, 1: 2005-05-15, 2: 2005-05-30, ..."
...,...,...,...,...
1957,1958,"{0: '1001935', 1: '353820', 2: '1848739', 3: '...","{0: 2.0, 1: 3.0, 2: 4.0, 3: 3.0, 4: 3.0, 5: 5....","{0: 2005-06-16, 1: 2005-06-16, 2: 2005-09-22, ..."
1958,1959,"{0: '1527030', 1: '2098867', 2: '765331', 3: '...","{0: 4.0, 1: 4.0, 2: 3.0, 3: 2.0, 4: 3.0, 5: 4....","{0: 2005-07-07, 1: 2005-07-12, 2: 2003-02-27, ..."
1959,1960,"{0: '261048', 1: '617814', 2: '2625420', 3: '9...","{0: 3.0, 1: 1.0, 2: 2.0, 3: 4.0, 4: 1.0, 5: 5....","{0: 2004-09-22, 1: 2004-05-11, 2: 2004-11-17, ..."
1960,1961,"{0: '573364', 1: '247794', 2: '1696725', 3: '1...","{0: 3.0, 1: 3.0, 2: 3.0, 3: 2.0, 4: 4.0, 5: 4....","{0: 2005-08-12, 1: 2005-07-27, 2: 2004-09-29, ..."


Doing this the id column will be an object, needs to be integer for merging with movie titles:

In [25]:
# convert movieId to integers
netflix_df['movieId'] = netflix_df['movieId'].astype(int)

In [26]:
# merge with movietitles to add title and year
netflix_df = pd.merge(netflix_df,movie_titles_df,on='movieId',how='left')

In [27]:
# Create a new column to store the combined dictionaries
netflix_df['review_data'] = None

# Iterate through the rows
for index, row in netflix_df.iterrows():
    combined_reviews = []  # List to store combined dictionaries for each review
    # Iterate through the rows to combine userId, rating, and date
    for i in range(len(row['userId'])):
        review_dict = {}
        # Check if any of the values are not None
        if row['userId'][i] is not None or row['rating'][i] is not None or row['date'][i] is not None:
            review_dict['userId'] = row['userId'][i]
            review_dict['rating'] = row['rating'][i]
            review_dict['date'] = row['date'][i]
            combined_reviews.append(review_dict)
    
    # Set the combined reviews to the 'review_data' column
    netflix_df.at[index, 'review_data'] = combined_reviews

In [28]:
# drop redundant columns
netflix_df = netflix_df.drop(['userId','rating','date'],axis=1)

In [29]:
# exclude anything other than numbers in the year column, as "NULL" was found in that column
netflix_df = netflix_df[netflix_df['year'].str.isdigit()]

In [30]:
netflix_df.isnull().sum()

movieId        0
year           0
title          0
review_data    0
dtype: int64

In [31]:
# convert review_data to values to extract elements from it
review_data = netflix_df['review_data'].values

# convert every element of review_data column dictionary to lists, to assess null values
ratings = [entry['rating'] for row in review_data for entry in row if 'rating' in entry]
dates = [entry['date'] for row in review_data for entry in row if 'date' in entry]
userids = [entry['userId'] for row in review_data for entry in row if 'userId' in entry]

# check for null values in every element of the review_data dictionaries
print('There are {} null values within the netflix prize movie ratings.'.format(ratings.count(None)))
print('There are {} null values within the netflix prize movie dates.'.format(dates.count(None)))
print('There are {} null values within the netflix prize movie userIds.'.format(userids.count(None)))
print("No null values that need to be handled.")

There are 0 null values within the netflix prize movie ratings.
There are 0 null values within the netflix prize movie dates.
There are 0 null values within the netflix prize movie userIds.
No null values that need to be handled.


## Joining with genres found on this gihub link:

https://github.com/tommasocarraro/netflix-prize-with-genres

In [32]:
# import the genres form the download csv from github
netflix_genres = pd.read_csv('netflix2_dataset/netflix_genres.csv')

# merge them with the netflix dataframe by performing a left join, to ensure only matches from the netflix dataset are catched
netflix_df = pd.merge(netflix_df,netflix_genres,on='movieId',how='left')

# convert genres column to a tuple to save memory
netflix_df['genres'] = netflix_df['genres'].str.replace('|', ',').apply(lambda x: tuple(x.split(',')) if isinstance(x, str) else None)

netflix_df

Unnamed: 0,movieId,year,title,review_data,genres
0,1,2003,Dinosaur Planet,"[{'userId': '1488844', 'rating': 3.0, 'date': ...","(Documentary, Animation, Family)"
1,2,2004,Isle of Man TT 2004 Review,"[{'userId': '2059652', 'rating': 4.0, 'date': ...",
2,3,1997,Character,"[{'userId': '1025579', 'rating': 4.0, 'date': ...","(Crime, Drama, Mystery)"
3,4,1994,Paula Abdul's Get Up & Dance,"[{'userId': '1065039', 'rating': 3.0, 'date': ...","(Family,)"
4,5,2004,The Rise and Fall of ECW,"[{'userId': '1745265', 'rating': 5.0, 'date': ...","(Documentary, Sport)"
...,...,...,...,...,...
1957,1958,2002,WWE: Before They Were Superstars 2,"[{'userId': '1001935', 'rating': 2.0, 'date': ...","(Documentary, Sport)"
1958,1959,1961,Splendor in the Grass,"[{'userId': '1527030', 'rating': 4.0, 'date': ...","(Drama, Romance)"
1959,1960,2003,Smack: Vol. 1,"[{'userId': '261048', 'rating': 3.0, 'date': 2...",
1960,1961,1939,Port of Shadows,"[{'userId': '573364', 'rating': 3.0, 'date': 2...",


## Finalizing and converting to parquet:

To convert to parquet the keys of the dictionaries need to be converted to strings, otherwise the conversion to parquet does not work. Afterwards to Netflix and Movielens dataframes will be converted to parquet files.

In [33]:
netflix_df.columns
movielens_df.columns
netflix_df
movielens_df

Index(['movieId', 'year', 'title', 'review_data', 'genres'], dtype='object')

Index(['movieId', 'title', 'genres', 'year', 'review_data', 'imdbId', 'tmdbId',
       'num_reviews'],
      dtype='object')

Unnamed: 0,movieId,year,title,review_data,genres
0,1,2003,Dinosaur Planet,"[{'userId': '1488844', 'rating': 3.0, 'date': ...","(Documentary, Animation, Family)"
1,2,2004,Isle of Man TT 2004 Review,"[{'userId': '2059652', 'rating': 4.0, 'date': ...",
2,3,1997,Character,"[{'userId': '1025579', 'rating': 4.0, 'date': ...","(Crime, Drama, Mystery)"
3,4,1994,Paula Abdul's Get Up & Dance,"[{'userId': '1065039', 'rating': 3.0, 'date': ...","(Family,)"
4,5,2004,The Rise and Fall of ECW,"[{'userId': '1745265', 'rating': 5.0, 'date': ...","(Documentary, Sport)"
...,...,...,...,...,...
1957,1958,2002,WWE: Before They Were Superstars 2,"[{'userId': '1001935', 'rating': 2.0, 'date': ...","(Documentary, Sport)"
1958,1959,1961,Splendor in the Grass,"[{'userId': '1527030', 'rating': 4.0, 'date': ...","(Drama, Romance)"
1959,1960,2003,Smack: Vol. 1,"[{'userId': '261048', 'rating': 3.0, 'date': 2...",
1960,1961,1939,Port of Shadows,"[{'userId': '573364', 'rating': 3.0, 'date': 2...",


Unnamed: 0,movieId,title,genres,year,review_data,imdbId,tmdbId,num_reviews
0,1,Toy Story,"(Adventure, Animation, Children, Comedy, Fantasy)",1995,"[{'userId': 220678, 'rating': 5.0, 'date': 199...",114709,862.0,8668
1,2,Jumanji,"(Adventure, Children, Fantasy)",1995,"[{'userId': 319226, 'rating': 3.5, 'date': 201...",113497,8844.0,3427
2,3,Grumpier Old Men,"(Comedy, Romance)",1995,"[{'userId': 146396, 'rating': 4.0, 'date': 199...",113228,15602.0,1773
3,4,Waiting to Exhale,"(Comedy, Drama, Romance)",1995,"[{'userId': 146001, 'rating': 3.0, 'date': 199...",114885,31357.0,356
4,5,Father of the Bride Part II,"(Comedy,)",1995,"[{'userId': 285075, 'rating': 3.0, 'date': 200...",113041,11862.0,1687
...,...,...,...,...,...,...,...,...
50624,288941,Mixed Baggage,"(Comedy, Drama, Romance)",2023,"[{'userId': 7644, 'rating': 5.0, 'date': 2023-...",21411840,1033258.0,0
50625,288949,Eldorado: Everything the Nazis Hate,"(Documentary,)",2023,"[{'userId': 327439, 'rating': 0.5, 'date': 202...",27906298,1133922.0,0
50626,288955,Agata's Friends,"(Drama,)",2016,"[{'userId': 308174, 'rating': 2.0, 'date': 202...",4594936,338553.0,0
50627,288965,Камертон,"(Romance,)",1979,"[{'userId': 167321, 'rating': 2.5, 'date': 202...",3337244,388606.0,0


In [34]:
# convert to parquet
netflix_df.to_parquet('cleaned/netflix_parquet')
movielens_df.to_parquet('cleaned/movielens_parquet')