In [35]:
import pandas as pd
import os
import numpy as np

In [36]:
from IPython.core.interactiveshell import InteractiveShell
# Make Jupyter display the value of every top-level expression in a cell,
# not just the last one
InteractiveShell.ast_node_interactivity = "all"
# HTML is imported to pretty-print pandas DataFrames so they can be copied
# over elsewhere (e.g. to presentation slides)
from IPython.core.display import HTML

### Netflix Prize dataset import:

In [37]:
# Show the current working directory (displayed as the cell's output)
os.getcwd()

'C:\\Users\\Jaume\\Documents\\MDDB\\SDM\\SDfM---Jaume-and-Stijn'

In [38]:
# Make the data root the working directory, so paths given relative to it
# can be resolved from here on
path = r'C:\Users\Jaume\Documents\MDDB\SDM'
os.chdir(path)

# Show which files the 'netflix_dataset' folder under that root contains
netflix_dir = os.path.join(path, 'netflix_dataset')
os.listdir(netflix_dir)


['combined_data_1.txt',
 'combined_data_2.txt',
 'combined_data_3.txt',
 'combined_data_4.txt',
 'movie_titles.csv',
 'probe.txt',
 'qualifying.txt',
 'README']

In [39]:
# Read all raw lines of the movie-titles file (decoded as latin-1)
titles_path = os.path.join('.', 'netflix_dataset','movie_titles.csv')
with open(titles_path, 'r', encoding='latin-1') as file:
    lines = file.readlines()

# Split every line on its first two commas only: a title may itself contain
# commas, so splitting on all of them would break the title apart.
# Lines that do not yield exactly three fields (id, year, title) are skipped.
# All three fields are kept as strings.
fields_per_line = (line.strip().split(',', 2) for line in lines)
data = [tuple(fields) for fields in fields_per_line if len(fields) == 3]

movie_title_df = pd.DataFrame(data, columns=['movieId','year','title'])

In [40]:
# Preview the first rows of movie_title_df to check that the file was parsed correctly
movie_title_df.head()

Unnamed: 0,movieId,year,title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [41]:
# Show the (rows, columns) shape of movie_title_df
movie_title_df.shape

(17770, 3)

In [42]:
import os
import pandas as pd

# Get the complete list of files in the netflix directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# row order of the concatenated frame reproducible across machines.
netflix_files = sorted(os.listdir(os.path.join('.', 'netflix_dataset')))

# Keep only the files whose name contains 'combined' (the rating files)
combined_files = [file for file in netflix_files if 'combined' in file]

# Lazily read the first 100,000 lines of each combined file into its own frame.
# NOTE(review): the files appear to contain header lines such as '1:' that end up
# in the 'MovieID' column with missing Rating/Date; the first field of the
# remaining rows may therefore not be a movie id - TODO confirm against the
# dataset README before relying on this column.
data_generator = (
    pd.read_csv(
        os.path.join('.', 'netflix_dataset', file),
        sep=',',
        header=None,
        names=['MovieID', 'Rating', 'Date'],
        nrows=100000,
    )
    for file in combined_files
)

# Stack the per-file frames into one frame with a fresh 0..n-1 index
netflix_df = pd.concat(data_generator, ignore_index=True)


In [43]:
# Preview the first rows of netflix_df to check that the files were read correctly
netflix_df.head()

Unnamed: 0,MovieID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [44]:
# Show the (rows, columns) shape of netflix_df
netflix_df.shape

(400000, 3)

### MovieLens data import:

In [45]:
import pandas as pd

# Import each MovieLens CSV, capped at its first MOVIELENS_NROWS rows to keep
# the notebook fast. The cap is applied to every file independently, so the
# truncated ratings and tags files do not necessarily cover the same users.
MOVIELENS_NROWS = 100000

df_links = pd.read_csv('movielens_dataset/links.csv', sep=',', nrows=MOVIELENS_NROWS)
df_movies = pd.read_csv('movielens_dataset/movies.csv', sep=',', nrows=MOVIELENS_NROWS)
df_ratings = pd.read_csv('movielens_dataset/ratings.csv', sep=',', nrows=MOVIELENS_NROWS)
df_tags = pd.read_csv('movielens_dataset/tags.csv', sep=',', nrows=MOVIELENS_NROWS)


In [46]:
# Preview the first rows of the MovieLens links table
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [47]:
# Preview the first rows of the MovieLens movies table
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [48]:
# Preview the first rows of the MovieLens ratings table
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [49]:
# Preview the first rows of the MovieLens tags table
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [50]:
# After looking at the four MovieLens tables individually, they can be
# collapsed into two frames: one describing movies, one describing users.

# Movie frame: links joined with movies on movieId (outer join)
movies_df = df_links.merge(df_movies, on='movieId', how='outer')

# User frame: outer-join ratings and tags on (userId, movieId), so each row
# carries the rating with its timestamp and/or the tag with its timestamp
# for that user and the movie the review was given to
users_df = df_ratings.merge(df_tags, on=['userId', 'movieId'], how='outer')

# Inspect the merged frame, its null count per column, and the number of
# distinct users in the tags table and in the ratings table
users_df
users_df.isnull().sum()
df_tags['userId'].nunique()
df_ratings['userId'].nunique()

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y
0,1,296,5.0,1.147880e+09,,
1,1,306,3.5,1.147869e+09,,
2,1,307,5.0,1.147869e+09,,
3,1,665,5.0,1.147879e+09,,
4,1,899,3.5,1.147869e+09,,
...,...,...,...,...,...,...
199338,6550,33755,,,kaiju,1.527373e+09
199339,6550,33760,,,bangkok,1.527401e+09
199340,6550,33760,,,children,1.527401e+09
199341,6550,33760,,,king,1.527401e+09


userId             0
movieId            0
rating         97790
timestamp_x    97790
tag            99343
timestamp_y    99343
dtype: int64

581

757

In [51]:
# Preview the first rows of the merged movies frame
movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,113497,8844.0,Jumanji (1995),Adventure|Children|Fantasy
2,3,113228,15602.0,Grumpier Old Men (1995),Comedy|Romance
3,4,114885,31357.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,113041,11862.0,Father of the Bride Part II (1995),Comedy


The **tag** and **timestamp_y** columns of **users_df** contain many null values, meaning that not many users have added a tag to their review. Since the tags data contains fewer unique users (581) than the ratings data (757), it is expected to see this many null values after the outer merge.

### Perform cleaning:

In [52]:
# Return to the folder where this file is located and confirm the switch
# by printing the resulting working directory
repo_dir = r'C:\Users\Jaume\Documents\MDDB\SDM\SDfM---Jaume-and-Stijn'
os.chdir(repo_dir)
print(os.getcwd())


C:\Users\Jaume\Documents\MDDB\SDM\SDfM---Jaume-and-Stijn


In [53]:
# One-hot encode the pipe-separated genres, extract the release year from the
# title and strip the year from the title. Guarded on the presence of the
# 'genres' column so that re-running the cell is a no-op instead of raising a
# KeyError on the already-dropped column (and re-cleaning cleaned titles).
if 'genres' in movies_df.columns:
    # one dummy column per genre, obtained by splitting on '|'
    df_dummies = movies_df['genres'].str.get_dummies('|')

    # release year: the first four-digit number between brackets in the title
    # (kept as a string; missing where the title has no such number)
    rel_year = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

    # Remove only a trailing "(YYYY)" together with the whitespace around it.
    # Slicing off a fixed six characters left a trailing space behind and
    # truncated titles that do not end in a year.
    clean_title = (
        movies_df['title']
        .str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
        .str.strip()
    )

    # Same column layout as before: original columns without 'genres',
    # then the genre dummies, then rel_year
    movies_df = (
        pd.concat([movies_df.drop('genres', axis=1), df_dummies], axis=1)
        .assign(title=clean_title, rel_year=rel_year)
    )

# show cleaned result
movies_df

Unnamed: 0,movieId,imdbId,tmdbId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rel_year
0,1,114709,862.0,Toy Story,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1995
1,2,113497,8844.0,Jumanji,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,113228,15602.0,Grumpier Old Men,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
3,4,114885,31357.0,Waiting to Exhale,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1995
4,5,113041,11862.0,Father of the Bride Part II,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,209157,6671244,499546.0,We,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2018
62419,209159,297986,63407.0,Window of the Soul,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2001
62420,209163,6755366,553036.0,Bad Poems,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2018
62421,209169,249603,162892.0,A Girl Thing,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2001


In [54]:
# Turn both timestamp columns (values interpreted as seconds, unit='s')
# into readable pandas datetimes
for ts_col in ('timestamp_x', 'timestamp_y'):
    users_df[ts_col] = pd.to_datetime(users_df[ts_col], unit='s')

In [55]:
# is_unique tells whether no value in the 'MovieID' column occurs more than once.
# NOTE(review): this column also holds entries like '1:' (see the head of
# netflix_df), so it may not contain plain movie ids - TODO confirm what this
# check actually measures.
checker = netflix_df['MovieID'].is_unique

# Pick the message that matches the outcome of the check, then report it
message = (
    "Every value within movieid column of netflix data is unique."
    if checker is True
    else 'MovieId in netflix data is not unique, meaning mulitple reviews are in per movie. Therefore, the Movielens dataset can be merged with Netflix, as it will only add more reviews per movie.'
)
print(message)

MovieId in netflix data is not unique, meaning mulitple reviews are in per movie. Therefore, the Movielens dataset can be merged with Netflix, as it will only add more reviews per movie.


Convert everything to parquet for performance purposes:

In [56]:
# Persist the prepared frames as parquet files inside the 'parquets' folder
# (relative to the current working directory). The folder is created first so
# the writes do not fail when it does not exist yet, e.g. on a fresh checkout.
os.makedirs('parquets', exist_ok=True)

users_df.to_parquet('parquets/users_df')
movies_df.to_parquet('parquets/movies_df')
movie_title_df.to_parquet('parquets/movie_title_df')
netflix_df.to_parquet('parquets/netflix_df')

In [57]:
# Display netflix_df to check that it was read correctly and to see what it looks like
netflix_df

Unnamed: 0,MovieID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
399995,261239,3.0,2002-06-02
399996,720145,4.0,2002-01-09
399997,942771,2.0,2002-02-21
399998,104422,5.0,2003-10-04


In [None]:
# We print the 