In [1]:
import pandas as pd
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import sys
from shutil import rmtree

module_path = os.path.abspath(os.path.join('utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_loader import load_imdb_dataset

## Load IMDB dataset

In [2]:
name_basics, title_basics, title_ratings = load_imdb_dataset()

print(name_basics.shape)
print(title_basics.shape)
print(title_ratings.shape)

../data/imdb/name.basics.tsv.gz
../data/imdb/title.basics.tsv.gz
../data/imdb/title.ratings.tsv.gz
Downloading name.basics...
Extracting name.basics...
Downloading title.basics...
Extracting title.basics...
Downloading title.ratings...
Extracting title.ratings...


## Load MovieLens 1M dataset

In [4]:
data_path = 'data/'

if (not os.path.exists(data_path)):
    os.makedirs(data_path)

# Remove ml-1m if it exists in the data folder
if 'ml-1m' in os.listdir(data_path):
    rmtree(data_path + 'ml-1m')
    
dsURL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip";
print(f"Downloading {dsURL[0]}...")
urlretrieve(dsURL[0], dsURL[1])

ZipFile(dsURL[1], "r").extractall(data_path)

# Remove the zip file
os.remove(dsURL[1])

Downloading http://files.grouplens.org/datasets/movielens/ml-1m.zip...


In [5]:
users = pd.read_csv(
    data_path + 'ml-1m/users.dat', 
    sep='::',
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python"
)

ratings = pd.read_csv(
    data_path + "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

movies = pd.read_csv(
    data_path + "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    engine="python",
    encoding="latin-1",
)

print("Size for users:", users.shape)
print("Size for ratings:", ratings.shape)
print("Size for movies:", movies.shape)

Size for users: (6040, 5)
Size for ratings: (1000209, 4)
Size for movies: (3883, 3)


In [41]:
# Copy that we will use throughout the project

ml_users = users.copy()
ml_ratings = ratings.copy()
ml_movies = movies.copy()

## Preprocess MovieLens 1M dataset

In [42]:
ml_users["user_id"] = ml_users["user_id"].apply(lambda x: f"user_{x}")
ml_users["age_group"] = ml_users["age_group"].apply(lambda x: f"group_{x}")
ml_users["occupation"] = ml_users["occupation"].apply(lambda x: f"occupation_{x}")

ml_movies["movie_id"] = ml_movies["movie_id"].apply(lambda x: f"movie_{x}")
ml_movies["date"] = ml_movies["title"].apply(lambda x: x[-5:-1])
ml_movies["title"] = ml_movies["title"].apply(lambda x: x[:-7])
ml_movies["original_title"] = ml_movies["title"].str.extract(r"\((.*)\)") 
ml_movies["title"] = ml_movies["title"].str.replace(r"\(.*\)", "", regex=True).str.strip()

# For all the movies title that have ", The" or ", Les" at the end, we will move it to the beginning without the comma. End remove it from the end.
ml_movies["title"] = ml_movies["title"].apply(lambda x: "The " + x[:-5] if x[-5:] == ", The" else x)
ml_movies["title"] = ml_movies["title"].apply(lambda x: "Les " + x[:-5] if x[-5:] == ", Les" else x)

# Rename movies['title'] to movies['primary_title']
ml_movies.rename(columns={"title": "primary_title"}, inplace=True)

ml_ratings["movie_id"] = ml_ratings["movie_id"].apply(lambda x: f"movie_{x}")
ml_ratings["user_id"] = ml_ratings["user_id"].apply(lambda x: f"user_{x}")
ml_ratings["rating"] = ml_ratings["rating"].apply(lambda x: float(x))

In [43]:
ml_movies["title_processed"] = ml_movies["primary_title"].apply(lambda x: x.replace(" ", "").lower())
ml_movies["title_processed"] = ml_movies["title_processed"].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

In [44]:
genres = []
for genre in ml_movies["genres"].str.split("|"):
    genres.extend(genre)
    
genres = list(set(genres))

for genre in genres:
    ml_movies[genre] = ml_movies["genres"].apply(lambda gs: int(genre in gs.split("|")))
    
ml_movies.drop(columns=["genres"], inplace=True)

In [45]:
ml_movies

Unnamed: 0,movie_id,primary_title,date,original_title,title_processed,Drama,Sci-Fi,Horror,Mystery,Musical,...,Action,Documentary,War,Thriller,Comedy,Romance,Crime,Children's,Adventure,Film-Noir
0,movie_1,Toy Story,1995,,toystory,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,movie_2,Jumanji,1995,,jumanji,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,movie_3,Grumpier Old Men,1995,,grumpieroldmen,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,movie_4,Waiting to Exhale,1995,,waitingtoexhale,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,movie_5,Father of the Bride Part II,1995,,fatherofthebridepartii,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,movie_3948,Meet the Parents,2000,,meettheparents,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3879,movie_3949,Requiem for a Dream,2000,,requiemforadream,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,movie_3950,Tigerland,2000,,tigerland,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,movie_3951,Two Family House,2000,,twofamilyhouse,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocess IMDB dataset

In [81]:
# Copy that we will use throughout the project

imdb_name = name_basics.copy()
imdb_title = title_basics.copy()
imdb_rating = title_ratings.copy()

In [82]:
# Print all different titleType 
print("Different genres:", imdb_title["titleType"].unique())

# Print the number of each titleType
print("Number of each titleType:")
print(imdb_title["titleType"].value_counts())

Different genres: ['short' 'movie' 'tvShort' 'tvMovie' 'tvSeries' 'tvEpisode' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']
Number of each titleType:
titleType
tvEpisode       8338625
short           1002063
movie            684707
video            294010
tvSeries         265733
tvMovie          148198
tvMiniSeries      55151
tvSpecial         48140
videoGame         39162
tvShort           10342
tvPilot               1
Name: count, dtype: int64


In [83]:
size_before = imdb_title.shape[0]

imdb_title = imdb_title[imdb_title["titleType"].isin(["movie", "short", "tvSeries"])]

size_after = imdb_title.shape[0]
print(f"Number of rows removed: {size_before - size_after}")

Number of rows removed: 8933629


In [84]:
# Rename 
imdb_title.rename(columns={"primaryTitle": "primary_title", "originalTitle": "original_title", "startYear": "date"}, inplace=True)

# Drop columns isAdult, endYear, runtimeMinutes and titleType
imdb_title.drop(columns=["isAdult", "endYear", "runtimeMinutes", "titleType"], inplace=True)

In [85]:
genres = []
for genre in imdb_title["genres"].str.split(","):
    genres.extend(genre)
    
genres = list(set(genres))

for genre in genres:
    imdb_title[genre] = imdb_title["genres"].apply(lambda gs: int(genre in gs.split(",") if type(gs) == str else False))
    
imdb_title.drop(columns=["genres"], inplace=True)

Unnamed: 0,tconst,primary_title,original_title,date,Sci-Fi,Western,Mystery,Game-Show,Animation,\N,...,Thriller,Music,Crime,Reality-TV,News,Action,War,Adult,Comedy,Biography
0,tt0000001,Carmencita,Carmencita,1894,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0000002,Le clown et ses chiens,Le clown et ses chiens,1892,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0000003,Pauvre Pierrot,Pauvre Pierrot,1892,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,tt0000004,Un bon bock,Un bon bock,1892,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0000005,Blacksmith Scene,Blacksmith Scene,1893,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10886072,tt9916730,6 Gunn,6 Gunn,2017,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10886082,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,2013,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10886083,tt9916756,Pretty Pretty Black Girl,Pretty Pretty Black Girl,2019,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10886087,tt9916764,38,38,2018,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
