In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

### Netflix Prize dataset import:

In [3]:
os.listdir(os.path.join('.','netflix_dataset'))

['combined_data_1.txt',
 'combined_data_2.txt',
 'combined_data_3.txt',
 'combined_data_4.txt',
 'movie_titles.csv',
 'probe.txt',
 'qualifying.txt',
 'README']

In [4]:
# read the CSV file line by line
with open(os.path.join('.', 'netflix_dataset','movie_titles.csv'), 'r', encoding='latin-1') as file:
    lines = file.readlines()

# process each line
data = []
for line in lines:
    parts = line.strip().split(',', 2)  # split the line using first two commas only, to ensure only id, year and title are catched, as they cover two commas
    if len(parts) == 3:  # this way it is checked whether its actually three parts
        # concatenate the result and append it
        data.append((parts[0],parts[1], parts[2]))

movie_title_df = pd.DataFrame(data, columns=['movieId','year','title'])

# convert id column to integer for merging later
movie_title_df['movieId'] = movie_title_df['movieId'].astype(int)

In [5]:
# get the complete list of files in the netflix directory
netflix_files = os.listdir(os.path.join('.', 'netflix_dataset'))

# make a filter which only contains the combined datafiles
combined_files = [file for file in netflix_files if 'combined' in file]

# define a generator expression to yield each file's data and make a pd dataframe where only the combined files are selected
data_generator = (pd.read_csv(os.path.join('.', 'netflix_dataset', file), sep=',', header=None, names=['userId', 'rating', 'date'], nrows=1000000) for file in combined_files)

# concatenate the result
netflix_df = pd.concat(data_generator, ignore_index=True)

### MovieLens data import:

In [6]:
# import each csv
df_links = pd.read_csv('movielens_dataset/links.csv',sep=',')
df_movies = pd.read_csv('movielens_dataset/movies.csv',sep=',')
df_ratings = pd.read_csv('movielens_dataset/ratings.csv',sep=',')
df_tags = pd.read_csv('movielens_dataset/tags.csv',sep=',')

In [7]:
# look at each dataset individually
# df_links
# df_movies
# df_ratings
# df_tags

# by looking at the dataframes, they can be merged into two ones instead of four to create a user and movie dataframe
movies_df = pd.merge(df_links,df_movies,on='movieId',how='outer')
# merge by userId and movieId to rating, timestamp of rating, tag and timestmap of tag per user and the movie the review has been givent to
users_df = pd.merge(df_ratings,df_tags,on=['userId','movieId'],how='outer')
users_df
users_df.isnull().sum()
df_tags['userId'].nunique()
df_ratings['userId'].nunique()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y
0,1,1,4.0,9.649827e+08,,
1,1,3,4.0,9.649812e+08,,
2,1,6,4.0,9.649822e+08,,
3,1,47,5.0,9.649838e+08,,
4,1,50,5.0,9.649829e+08,,
...,...,...,...,...,...,...
102879,610,166534,4.0,1.493848e+09,,
102880,610,168248,5.0,1.493850e+09,Heroic Bloodshed,1.493844e+09
102881,610,168250,5.0,1.494273e+09,,
102882,610,168252,5.0,1.493846e+09,,


userId             0
movieId            0
rating           207
timestamp_x      207
tag            99201
timestamp_y    99201
dtype: int64

58

610

In **tag** and **timestamp_y** form **users_df** column are lots of null values, meaning that not that many users have given a tag to their review. Comparing the amount of unique users in the tags csv compared to the ratings df, it is normal to have that many null values after merging.

### Perform cleaning:

In [8]:
# create dummy columns by splitting by | and subsequently concatenating this df with the original one. Original genre column is dropped
# df_dummies = movies_df['genres'].str.get_dummies('|')
# movies_df = pd.concat([movies_df, df_dummies], axis=1)
# movies_df = movies_df.drop('genres',axis=1)

# extract number between brackets for creating release year column
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')

# cut off the last six characters to clean up the movie titles
movies_df['title'] = movies_df['title'].str[:-6]

# show cleaned result
movies_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year
0,1,114709,862.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,113497,8844.0,Jumanji,Adventure|Children|Fantasy,1995
2,3,113228,15602.0,Grumpier Old Men,Comedy|Romance,1995
3,4,114885,31357.0,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,113041,11862.0,Father of the Bride Part II,Comedy,1995
...,...,...,...,...,...,...
9737,193581,5476944,432131.0,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
9738,193583,5914996,445030.0,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
9739,193585,6397426,479308.0,Flint,Drama,2017
9740,193587,8391976,483455.0,Bungo Stray Dogs: Dead Apple,Action|Animation,2018


In [9]:
# convert to a readable datetime format
users_df['timestamp_review'] = pd.to_datetime(users_df['timestamp_x'],unit='s')
users_df['timestamp_tag'] = pd.to_datetime(users_df['timestamp_y'],unit='s')

# convert to day format and strip off time
users_df['timestamp_review'] = users_df['timestamp_review'].dt.date
users_df['timestamp_tag'] = users_df['timestamp_tag'].dt.date

# drop original columns
users_df = users_df.drop(['timestamp_x','timestamp_y'],axis=1)

In [10]:
checker = netflix_df['userId'].is_unique

if checker is True:
    print("Every value within movieid column of netflix data is unique.")
else:
    print('MovieId in netflix data is not unique, meaning mulitple reviews are in per movie. Therefore, the Movielens dataset can be merged with Netflix, as it will only add more reviews per movie.')

MovieId in netflix data is not unique, meaning mulitple reviews are in per movie. Therefore, the Movielens dataset can be merged with Netflix, as it will only add more reviews per movie.


Convert everything to parquet for performance purposes:

In [11]:
users_df.to_parquet('parquets/users_df')
movies_df.to_parquet('parquets/movies_df')
movie_title_df.to_parquet('parquets/movie_title_df')
netflix_df.to_parquet('parquets/netflix_df')

In [13]:
netflix_df

Unnamed: 0,userId,rating,date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
3999995,609161,2.0,2005-05-06
3999996,2024230,3.0,2005-05-07
3999997,243989,4.0,2005-05-17
3999998,1412520,5.0,2005-05-20
