In [2]:
import pandas as pd
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import sys
from shutil import rmtree

# Make the local `utils` directory importable: resolve it to an absolute path
# (relative to the current working directory) and register it on sys.path once.
module_path = os.path.abspath('utils')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from data_loader import load_imdb_dataset

## Load IMDB dataset

In [3]:
# Load the three IMDB tables through the project helper imported from the
# `utils/data_loader` module; the result is unpacked into one variable per table.
# NOTE(review): the helper's internals are not visible in this notebook — the
# download/extract messages and the frame preview in the cell output appear to
# be printed by the helper itself; confirm in utils/data_loader.py.
name_basics, title_basics, title_ratings = load_imdb_dataset()

../data/imdb/name.basics.tsv.gz
../data/imdb/title.basics.tsv.gz
../data/imdb/title.ratings.tsv.gz
Downloading name.basics...
Extracting name.basics...
Downloading title.basics...
Extracting title.basics...
Downloading title.ratings...
Extracting title.ratings...
      nconst      primaryName birthYear deathYear  \
0  nm0000001     Fred Astaire      1899      1987   
1  nm0000002    Lauren Bacall      1924      2014   
2  nm0000003  Brigitte Bardot      1934        \N   
3  nm0000004     John Belushi      1949      1982   
4  nm0000005   Ingmar Bergman      1918      2007   

                    primaryProfession                           knownForTitles  
0        actor,miscellaneous,producer  tt0072308,tt0050419,tt0053137,tt0027125  
1  actress,soundtrack,archive_footage  tt0037382,tt0075213,tt0117057,tt0038355  
2   actress,music_department,producer  tt0057345,tt0049189,tt0056404,tt0054452  
3       actor,writer,music_department  tt0072562,tt0077975,tt0080455,tt0078723  
4           

In [4]:
# Working copies of the IMDB frames, intended for use throughout the project,
# so the frames returned by the loader can be kept as loaded.
imdb_name, imdb_title, imdb_rating = (
    raw_frame.copy() for raw_frame in (name_basics, title_basics, title_ratings)
)

## Load MovieLens 1M dataset

In [5]:
data_path = 'data/'
ml_1m_dir = os.path.join(data_path, 'ml-1m')

# Make sure the data folder exists (os.listdir on a missing folder would raise),
# then drop any previous extraction so we always start from a clean ml-1m folder.
os.makedirs(data_path, exist_ok=True)
if os.path.isdir(ml_1m_dir):
    rmtree(ml_1m_dir)

# (source URL, local file name of the temporary archive)
dsURL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip"
print(f"Downloading {dsURL[0]}...")
urlretrieve(dsURL[0], dsURL[1])

try:
    # The context manager closes the archive handle even if extraction fails.
    with ZipFile(dsURL[1], "r") as archive:
        archive.extractall(data_path)
finally:
    # Remove the zip file, whether or not the extraction succeeded
    os.remove(dsURL[1])

Downloading http://files.grouplens.org/datasets/movielens/ml-1m.zip...


In [6]:
# The three MovieLens 1M files are read without a header row (column names are
# supplied explicitly) and use '::' as the field delimiter, parsed with the
# python engine.
dat_reader_options = {"sep": "::", "engine": "python"}

user_columns = ["user_id", "sex", "age_group", "occupation", "zip_code"]
rating_columns = ["user_id", "movie_id", "rating", "unix_timestamp"]
movie_columns = ["movie_id", "title", "genres"]

users = pd.read_csv(
    data_path + 'ml-1m/users.dat', names=user_columns, **dat_reader_options
)
ratings = pd.read_csv(
    data_path + "ml-1m/ratings.dat", names=rating_columns, **dat_reader_options
)
# movies.dat is the only file decoded as latin-1.
movies = pd.read_csv(
    data_path + "ml-1m/movies.dat",
    names=movie_columns,
    encoding="latin-1",
    **dat_reader_options,
)

# Quick sanity check of what was loaded: (rows, columns) per table.
print("Size for users:", users.shape)
print("Size for ratings:", ratings.shape)
print("Size for movies:", movies.shape)

Size for users: (6040, 5)
Size for ratings: (1000209, 4)
Size for movies: (3883, 3)


In [7]:
# Working copies of the MovieLens frames, intended for use throughout the
# project, so the frames read from disk can be kept as loaded.
ml_users, ml_ratings, ml_movies = (
    raw_frame.copy() for raw_frame in (users, ratings, movies)
)

## Preprocess MovieLens 1M dataset

In [8]:
# Build the preprocessed MovieLens frames from the raw `users`, `movies` and
# `ratings` frames rather than mutating ml_* in place, so re-running this cell
# is idempotent (no stacked "user_"/"movie_" prefixes, no KeyError on the
# already-renamed "title" column).


def _move_trailing_article(title, article):
    """Turn "<name>, <article>" into "<article> <name>"; return other titles unchanged."""
    suffix = f", {article}"
    if title.endswith(suffix):
        return f"{article} {title[:-len(suffix)]}"
    return title


# --- users: turn the numeric codes into prefixed string identifiers ---------
ml_users = users.assign(
    user_id="user_" + users["user_id"].astype(str),
    age_group="group_" + users["age_group"].astype(str),
    occupation="occupation_" + users["occupation"].astype(str),
)

# --- movies ------------------------------------------------------------------
# Raw titles look like "<title> (<optional alternate title>) (<year>)".
raw_titles = movies["title"]
titles_without_year = raw_titles.str[:-7]  # drop the trailing " (YYYY)"

primary_titles = (
    titles_without_year
    # regex=True must be explicit: pandas >= 2.0 defaults to a literal match,
    # which would silently leave the "(alternate title)" part in place.
    .str.replace(r"\(.*\)", "", regex=True)
    .str.strip()
)

# For all the movie titles that end with ", The" or ", Les", move the article to
# the beginning (without the comma) and remove it from the end.
for article in ("The", "Les"):
    primary_titles = primary_titles.map(
        lambda title: _move_trailing_article(title, article)
    )

ml_movies = movies.assign(
    movie_id="movie_" + movies["movie_id"].astype(str),
    title=primary_titles,
    date=raw_titles.str[-5:-1],  # the four digits inside the trailing "(YYYY)"
    # Text between the first "(" and the last ")" once the year is removed;
    # NaN when the title carries no alternate name.
    original_title=titles_without_year.str.extract(r"\((.*)\)", expand=False),
).rename(columns={"title": "primary_title"})  # movies['title'] -> 'primary_title'

# --- ratings: same identifier scheme as above, ratings as floats -------------
ml_ratings = ratings.assign(
    movie_id="movie_" + ratings["movie_id"].astype(str),
    user_id="user_" + ratings["user_id"].astype(str),
    rating=ratings["rating"].astype(float),
)

In [9]:
# Matching key for titles: spaces removed, lower-cased, then reduced to plain
# ASCII (NFKD decomposition splits accented letters so the ASCII encode step
# can drop the accents and any other non-ASCII character).
ml_movies["title_processed"] = (
    ml_movies["primary_title"]
    .apply(lambda title: "".join(title.split(" ")).lower())
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

In [10]:
# Display the preprocessed MovieLens movies frame (bare last expression, so the
# notebook renders it with its rich, truncated table view).
ml_movies

Unnamed: 0,movie_id,primary_title,genres,date,original_title,title_processed
0,movie_1,Toy Story,Animation|Children's|Comedy,1995,,toystory
1,movie_2,Jumanji,Adventure|Children's|Fantasy,1995,,jumanji
2,movie_3,Grumpier Old Men,Comedy|Romance,1995,,grumpieroldmen
3,movie_4,Waiting to Exhale,Comedy|Drama,1995,,waitingtoexhale
4,movie_5,Father of the Bride Part II,Comedy,1995,,fatherofthebridepartii
...,...,...,...,...,...,...
3878,movie_3948,Meet the Parents,Comedy,2000,,meettheparents
3879,movie_3949,Requiem for a Dream,Drama,2000,,requiemforadream
3880,movie_3950,Tigerland,Drama,2000,,tigerland
3881,movie_3951,Two Family House,Drama,2000,,twofamilyhouse


## Preprocess IMDB dataset

In [11]:
# Cast the genres column to string, then change the genre separator from ','
# to '|' (the separator seen in the MovieLens genres column).
title_basics['genres'] = (
    title_basics['genres']
    .astype(str)
    .str.replace(',', '|', regex=False)
)

In [12]:
# Attach the ratings to the titles on their shared 'tconst' key; the inner join
# keeps only the titles that have a matching ratings row.
title_basics_ratings = title_basics.merge(title_ratings, how='inner', on='tconst')

In [13]:
# Spot check: first IMDB entries whose primary title contains "Toy Story"
title_basics_ratings.loc[
    lambda frame: frame['primaryTitle'].str.contains('Toy Story')
].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
86627,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,Adventure|Animation|Comedy,8.3,1074033
91309,tt0120363,movie,Toy Story 2,Toy Story 2,0,1999,\N,92,Adventure|Animation|Comedy,7.9,622587
122115,tt0178952,videoGame,Toy Story: The Video Game,Toy Story,0,1995,\N,\N,Action|Adventure|Family,7.4,753
143131,tt0220070,video,The Story Behind 'Toy Story',The Story Behind 'Toy Story',0,1996,\N,27,Documentary|Short,7.1,153
154448,tt0245255,tvShort,The Making of 'Toy Story',To Infinity and Beyond: The Making of 'Toy Story',0,1995,\N,20,Documentary|Family|Short,7.1,119


In [None]:
# Same spot check on the raw MovieLens titles, for comparison with the IMDB rows
movies.loc[
    lambda frame: frame['title'].str.contains('Toy Story')
].head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
3045,3114,Toy Story 2 (1999),Animation|Children's|Comedy
