# Movie recommendation system using collaborative filtering

## Dataset treatment

In [1]:
import pandas as pd

df = pd.read_csv('data/MovieLens_data/ratings.csv')
df.drop('timestamp', axis=1, inplace=True)

# drop columns with missing userId, movieId or rating
df = df.dropna(subset=['userId', 'movieId', 'rating'])

# verify if there are any missing values left
missing_values = df.isna().sum()
print(missing_values == 0)

# saving the cleaned dataset
df.to_csv('data/final_data/ratings.csv', index=False)

df

userId     True
movieId    True
rating     True
dtype: bool


Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
20000258,138493,68954,4.5
20000259,138493,69526,4.5
20000260,138493,69644,3.0
20000261,138493,70286,5.0


Since the dataset is organized, there is no need to make any changes to it. The only thing that was done was to remove the timestamp column, which is not necessary for the analysis. And at the end of the process, the cleaned dataset was saved in a new file.
The results will come in the form of a Movie id and the estimated rating for that movie by the user. The results (movie id) will be search in the movies.csv file to get the movie title for the estimated score.

## Joining the information from movies.csv to make a complete movies dataset

### Merging the datasets for the movie ids

In [2]:
import pandas as pd

# load the datasets
df = pd.read_csv('data/MovieLens_data/ratings.csv')
df.drop('timestamp', axis=1, inplace=True)
movies = pd.read_csv('data/MovieLens_data/movies.csv')
movies_links = pd.read_csv("data/MovieLens_data/links.csv")

# merge the datasets on 'movieId'
merged_df = df.merge(movies[['movieId', 'title']], on='movieId')

# select only 'movieId' and 'title' columns
final_df = merged_df[['movieId', 'title']]

# merge with the links dataset to get the imdbId and tmdbId
final_df = final_df.merge(movies_links[['movieId', 'imdbId', 'tmdbId']], on='movieId')

# making the tmdbId a integer
final_df['tmdbId'] = final_df['tmdbId'].astype('Int64')

# removing the title from the final_df
final_df.drop('title', axis=1, inplace=True)

# saving the organized dataset
final_df.to_csv('data/final_data/movies_ids.csv', index=False)

final_df

Unnamed: 0,movieId,imdbId,tmdbId
0,2,113497,8844
1,29,112682,902
2,32,114746,63
3,47,114369,807
4,50,114814,629
...,...,...,...
20000258,68954,1049413,14160
20000259,69526,1055369,8373
20000260,69644,1080016,8355
20000261,70286,1136608,17654


### Merging Imdb dataset information for full movies data

In [3]:
# joining the information from movies.csv to make a complete movies dataset
imdb_movies = pd.read_csv('data/Imdb_data/title.basics.tsv', sep='\t', low_memory=False)

# change all \N entries to NaN
imdb_movies.replace(r'\N', pd.NA, inplace=True)
imdb_movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
11136774,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,,,"Action,Drama,Family"
11136775,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
11136776,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
11136777,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [4]:
# see all possible title types
print(imdb_movies['titleType'].unique())

# see all possible genres
print(imdb_movies['genres'].unique())

# join short and tvShort into short
imdb_movies.replace({'titleType': {'tvShort': 'short'}}, inplace=True)

# join movie and tvMovie into movies
imdb_movies.replace({'titleType': {'tvMovie': 'movies'}}, inplace=True)

print(imdb_movies.columns)
imdb_movies

['short' 'movie' 'tvShort' 'tvMovie' 'tvEpisode' 'tvSeries' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']
['Documentary,Short' 'Animation,Short' 'Animation,Comedy,Romance' ...
 'Biography,Crime,Fantasy' 'Mystery,Reality-TV,Thriller'
 'Musical,Reality-TV,Talk-Show']
Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
11136774,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,,,"Action,Drama,Family"
11136775,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
11136776,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
11136777,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [5]:
# pandas settings 
pd.set_option('future.no_silent_downcasting', True)

# replace tconst with imdbId
imdb_movies['tconst'] = imdb_movies['tconst'].str[2:]
imdb_movies.rename(columns={'tconst': 'imdbId'}, inplace=True)

# removing all 0s from start of the imdbId
imdb_movies['imdbId'] = imdb_movies['imdbId'].str.lstrip('0')

# Ensure the isAdult column is of integer type
imdb_movies['isAdult'] = imdb_movies['isAdult'].fillna(0).astype(int)

# Replace 1's and 0's with True and False in the isAdult column
imdb_movies['isAdult'] = imdb_movies['isAdult'].replace({1: True, 0: False})

# replace non-numeric values in runtimeMinutes with NaN
imdb_movies['runtimeMinutes'] = pd.to_numeric(imdb_movies['runtimeMinutes'], errors='coerce')

# convert the runtimeMinutes column to Int64
imdb_movies['runtimeMinutes'] = imdb_movies['runtimeMinutes'].astype('Int64')

# convert the startYear and endYear columns to Int64
imdb_movies['startYear'] = imdb_movies['startYear'].astype('Int64')
imdb_movies['endYear'] = imdb_movies['endYear'].astype('Int64')

# saving the organized dataset
imdb_movies.to_csv('data/final_data/imdb_movies.csv', index=False)

print(imdb_movies.columns)
imdb_movies

Index(['imdbId', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')


Unnamed: 0,imdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1,short,Carmencita,Carmencita,False,1894,,1,"Documentary,Short"
1,2,short,Le clown et ses chiens,Le clown et ses chiens,False,1892,,5,"Animation,Short"
2,3,short,Pauvre Pierrot,Pauvre Pierrot,False,1892,,5,"Animation,Comedy,Romance"
3,4,short,Un bon bock,Un bon bock,False,1892,,12,"Animation,Short"
4,5,short,Blacksmith Scene,Blacksmith Scene,False,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
11136774,9916848,tvEpisode,Episode #3.17,Episode #3.17,False,2009,,,"Action,Drama,Family"
11136775,9916850,tvEpisode,Episode #3.19,Episode #3.19,False,2010,,,"Action,Drama,Family"
11136776,9916852,tvEpisode,Episode #3.20,Episode #3.20,False,2010,,,"Action,Drama,Family"
11136777,9916856,short,The Wind,The Wind,False,2015,,27,Short


#### Getting all ids in a single file

In [6]:
# Correlating the movies rated in the MovieLens dataset with the movies from the imdb dataset
movies = pd.read_csv('data/final_data/imdb_movies.csv', low_memory=False) # contains all info on movies from imdb
links = pd.read_csv('data/final_data/movies_ids.csv', low_memory=False) # contains the movieId, imdbId and tmdbId

# merging by imdbId
movies = movies.merge(links, on='imdbId')

# remove movies with same imdbId
movies = movies.drop_duplicates(subset='imdbId')

# saving the organized dataset
movies.to_csv('data/final_data/movies.csv', index=False)

movies

Unnamed: 0,imdbId,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,movieId,tmdbId
0,5,short,Blacksmith Scene,Blacksmith Scene,False,1893.0,,1.0,"Comedy,Short",95541,16624.0
4,8,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,False,1894.0,,1.0,"Documentary,Short",88674,105158.0
11,10,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,False,1895.0,,1.0,"Documentary,Short",120869,774.0
12,12,short,The Arrival of a Train,L'arrivée d'un train à La Ciotat,False,1896.0,,1.0,"Documentary,Short",98981,160.0
28,14,short,The Waterer Watered,L'arroseur arrosé,False,1895.0,,1.0,"Comedy,Short",113048,82120.0
...,...,...,...,...,...,...,...,...,...,...,...
19984851,4377918,movie,Crazy Beautiful You,Crazy Beautiful You,False,2015.0,,114.0,"Comedy,Drama,Romance",130089,327225.0
19984852,4397346,movie,Bikes vs Cars,Bikes vs Cars,False,2015.0,,90.0,Documentary,129822,324260.0
19984856,4438688,movie,Polskie gówno,Polskie gówno,False,2014.0,,93.0,"Comedy,Musical",128734,
19984857,4475970,short,Power Rangers,Power Rangers,False,2015.0,,14.0,"Action,Sci-Fi,Short",130842,327029.0


In [7]:
# joining the movie.csv file with movie_details.csv by tmdbId
movie_details = pd.read_csv('data/final_data/movie_details.csv')
movies = pd.read_csv('data/final_data/movies.csv')

# rename id columns
movies.rename(columns={'tmdbId': 'tmdb_id'}, inplace=True)
movies.rename(columns={'imdbId': 'imdb_id'}, inplace=True)
movies.rename(columns={'movieId': 'movie_id'}, inplace=True)

# merging the datasets
movies = movies.merge(movie_details, on='tmdb_id')

# removing the title column
movies.drop('title', axis=1, inplace=True)

# putting all ids as starting columns
movies = movies[['movie_id', 'imdb_id', 'tmdb_id', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres', 'backdrop_path', 'poster_path', 'original_language', 'overview', 'video']]

movies

Unnamed: 0,movie_id,imdb_id,tmdb_id,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,backdrop_path,poster_path,original_language,overview,video
0,95541,5,16624.0,Blacksmith Scene,Blacksmith Scene,False,1893.0,,1.0,"Comedy,Short",/mDD99APoTgMuNJrkmAfGicooJHa.jpg,/c76bs0S90EFhB5ww3i6DlYQTVk.jpg,xx,Three men hammer on an anvil and pass a bottle...,False
1,88674,8,105158.0,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,False,1894.0,,1.0,"Documentary,Short",/zaPZItgUO3xicz0QBqrDKNuPSbD.jpg,/s7fhher78hv1I5tKl7NbgnwsKha.jpg,xx,A man (Thomas Edison's assistant) takes a pinc...,False
2,120869,10,774.0,Leaving the Factory,La sortie de l'usine Lumière à Lyon,False,1895.0,,1.0,"Documentary,Short",/wNxy6Fqvjh0FsIuk7cgqNNq5PhP.jpg,/cT2sefAXgEoICJUCEM6UfxXfuDM.jpg,fr,Working men and women leave through the main g...,False
3,98981,12,160.0,The Arrival of a Train,L'arrivée d'un train à La Ciotat,False,1896.0,,1.0,"Documentary,Short",/uusNog5m2aCuL53rrKw8RaBnprb.jpg,/m5HSlaNCzwV95rAriDmT19el5h1.jpg,fr,A group of people are standing along the platf...,False
4,113048,14,82120.0,The Waterer Watered,L'arroseur arrosé,False,1895.0,,1.0,"Comedy,Short",/A0hONP6cxTthAsw6IvPMX4Aowkr.jpg,/rSZghvrFWTGqi4UecyG9jimzpEO.jpg,fr,"A gardener is watering his flowers, when a mis...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26183,128624,4368334,323431.0,Petting Zoo,Petting Zoo,False,2015.0,,93.0,Drama,/m0RDGraOSHznV5q9GBLEmwz98Os.jpg,/11a3Gt9Q4X93hqfAsSItZ01oDwh.jpg,en,"A story of love, sex and teen pregnancy in San...",False
26184,127098,4368814,321594.0,Louis C.K.: Live at the Comedy Store,Louis C.K.: Live at the Comedy Store,False,2015.0,,66.0,Comedy,/b0gVgI3ptWba6xME5rmCuvFVw0.jpg,/9ULqaND2rzNFL4OAxCbSrY7UWem.jpg,en,This material was developed and prepared over ...,False
26185,130089,4377918,327225.0,Crazy Beautiful You,Crazy Beautiful You,False,2015.0,,114.0,"Comedy,Drama,Romance",/d8ubvqYoYKlnWyrTPmJd6A9Gv7m.jpg,/sZAi6bz8I31NrVPUAkza9qtxqZE.jpg,tl,A bad girl and a province boy found love in th...,False
26186,129822,4397346,324260.0,Bikes vs Cars,Bikes vs Cars,False,2015.0,,90.0,Documentary,/sV5Pk6zJR4Fsu8jtVDLN7BBjJUy.jpg,/8ARMq3gIxlzFcxSDQPSVmiE32KL.jpg,en,Bikes vs Cars depicts a global crisis that we ...,False


In [8]:
# dropping the videos column, since it's a column with only a False flag
movies.drop('video', axis=1, inplace=True)

# getting video info from the movie_videos.csv file
movie_videos = pd.read_csv('data/final_data/movie_videos.csv')

# merging the datasets
movies = movies.merge(movie_videos, on='tmdb_id')

# making all xx entries in original_language Nan
movies['original_language'] = movies['original_language'].replace('xx', pd.NA)

# making startYear and runtimeMinutes column
movies['startYear'] = movies['startYear'].astype('Int64')

# made tmdb_id a integer column
movies['tmdb_id'] = movies['tmdb_id'].astype('Int64')

movies

Unnamed: 0,movie_id,imdb_id,tmdb_id,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,backdrop_path,poster_path,original_language,overview,video_key
0,113048,14,82120,The Waterer Watered,L'arroseur arrosé,False,1895,,1.0,"Comedy,Short",/A0hONP6cxTthAsw6IvPMX4Aowkr.jpg,/rSZghvrFWTGqi4UecyG9jimzpEO.jpg,fr,"A gardener is watering his flowers, when a mis...",KXf_Stwe_k4
1,32898,417,775,A Trip to the Moon,Le voyage dans la lune,False,1902,,13.0,"Action,Adventure,Comedy",/jeC1kzwWnZTuXl7xF4E5D70BD8c.jpg,/9o0v5LLFk51nyTBHZSre6OB37n2.jpg,fr,Professor Barbenfouillis and five of his colle...,JEGIyo-dKmA
2,49389,439,5698,The Great Train Robbery,The Great Train Robbery,False,1903,,11.0,"Action,Adventure,Crime",/99uGRUGS2qxNvPARiM8dkuTrUnW.jpg,/vEYr1sJR1dOFGXwXawpBN6hDRGF.jpg,en,After the train station clerk is assaulted and...,In3mRDX0uqk
3,86518,3419,28627,The Student of Prague,Der Student von Prag,False,1913,,85.0,"Drama,Fantasy,Horror",/pSrl9LPOKVGLh9DAvEaAwlpZAYz.jpg,/tO2zgNEY0MqtvEyPioTTfHgmN2Q.jpg,de,"Prague, Bohemia, 1820. Balduin, a penniless st...",LrqUenAgVBU
4,127054,4008,100246,Gertie the Dinosaur,Gertie the Dinosaur,False,1914,,12.0,"Animation,Comedy,Family",/dJQJPvEqs3Yzvam3djqnLJNJ70S.jpg,/fH5c2a3Nti765GqG8jj0uMBLgIK.jpg,en,Although not the first feature-length animated...,32pzHWUTcPc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19819,127152,4257858,318224,Going Clear: Scientology & the Prison of Belief,Going Clear: Scientology and the Prison of Belief,False,2015,,119.0,Documentary,/29VoISagzOfJcuuhFzKQg9ZmEUU.jpg,/zWHQYR9srde9nZjU6Dot8qTjoni.jpg,en,GOING CLEAR intimately profiles eight former m...,ixgd38EZIR0
19820,130466,4300028,328346,I Hate Christian Laettner,I Hate Christian Laettner,False,2015,,90.0,"Biography,Documentary,History",/f4S4QdNVPDJn7TkTvjRoimOVzy0.jpg,/pdzT3ftIWJH7bdQyKI1gQS8SFgu.jpg,en,He made perhaps the most dramatic shot in the ...,HNk19TXqnq0
19821,128612,4358230,319999,Body,Cialo,False,2015,,90.0,"Comedy,Drama",/9Ro9GeYLc6JLHxUygYxpoYgBaZc.jpg,/ob1idQ0vpXxrEkUj03qKTom6hPn.jpg,pl,"A busy attorney, worried that his anorexic dau...",ZZ-3VAxWwnk
19822,127098,4368814,321594,Louis C.K.: Live at the Comedy Store,Louis C.K.: Live at the Comedy Store,False,2015,,66.0,Comedy,/b0gVgI3ptWba6xME5rmCuvFVw0.jpg,/9ULqaND2rzNFL4OAxCbSrY7UWem.jpg,en,This material was developed and prepared over ...,2Lcng7xgOzE


In [9]:
# getting score average for movies

# load the dataset
ratings = pd.read_csv('data/final_data/ratings.csv')

# removing movie entries that are not in the movies dataset
ratings = ratings[ratings['movieId'].isin(movies['movie_id'])]

# get the average rating for each movie
average_rating = ratings.groupby('movieId')['rating'].mean()

# merge the average rating with the movies dataset
movies = movies.merge(average_rating, left_on='movie_id', right_on='movieId')

# saving the organized dataset
movies.to_csv('data/final_data/movies_full.csv', index=False)

# saving the new ratings dataset
ratings.to_csv('data/final_data/ratings_full.csv', index=False)

movies

Unnamed: 0,movie_id,imdb_id,tmdb_id,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,backdrop_path,poster_path,original_language,overview,video_key,rating
0,113048,14,82120,The Waterer Watered,L'arroseur arrosé,False,1895,,1.0,"Comedy,Short",/A0hONP6cxTthAsw6IvPMX4Aowkr.jpg,/rSZghvrFWTGqi4UecyG9jimzpEO.jpg,fr,"A gardener is watering his flowers, when a mis...",KXf_Stwe_k4,2.250000
1,32898,417,775,A Trip to the Moon,Le voyage dans la lune,False,1902,,13.0,"Action,Adventure,Comedy",/jeC1kzwWnZTuXl7xF4E5D70BD8c.jpg,/9o0v5LLFk51nyTBHZSre6OB37n2.jpg,fr,Professor Barbenfouillis and five of his colle...,JEGIyo-dKmA,3.738189
2,49389,439,5698,The Great Train Robbery,The Great Train Robbery,False,1903,,11.0,"Action,Adventure,Crime",/99uGRUGS2qxNvPARiM8dkuTrUnW.jpg,/vEYr1sJR1dOFGXwXawpBN6hDRGF.jpg,en,After the train station clerk is assaulted and...,In3mRDX0uqk,3.340909
3,86518,3419,28627,The Student of Prague,Der Student von Prag,False,1913,,85.0,"Drama,Fantasy,Horror",/pSrl9LPOKVGLh9DAvEaAwlpZAYz.jpg,/tO2zgNEY0MqtvEyPioTTfHgmN2Q.jpg,de,"Prague, Bohemia, 1820. Balduin, a penniless st...",LrqUenAgVBU,3.250000
4,127054,4008,100246,Gertie the Dinosaur,Gertie the Dinosaur,False,1914,,12.0,"Animation,Comedy,Family",/dJQJPvEqs3Yzvam3djqnLJNJ70S.jpg,/fH5c2a3Nti765GqG8jj0uMBLgIK.jpg,en,Although not the first feature-length animated...,32pzHWUTcPc,3.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19819,127152,4257858,318224,Going Clear: Scientology & the Prison of Belief,Going Clear: Scientology and the Prison of Belief,False,2015,,119.0,Documentary,/29VoISagzOfJcuuhFzKQg9ZmEUU.jpg,/zWHQYR9srde9nZjU6Dot8qTjoni.jpg,en,GOING CLEAR intimately profiles eight former m...,ixgd38EZIR0,3.600000
19820,130466,4300028,328346,I Hate Christian Laettner,I Hate Christian Laettner,False,2015,,90.0,"Biography,Documentary,History",/f4S4QdNVPDJn7TkTvjRoimOVzy0.jpg,/pdzT3ftIWJH7bdQyKI1gQS8SFgu.jpg,en,He made perhaps the most dramatic shot in the ...,HNk19TXqnq0,3.000000
19821,128612,4358230,319999,Body,Cialo,False,2015,,90.0,"Comedy,Drama",/9Ro9GeYLc6JLHxUygYxpoYgBaZc.jpg,/ob1idQ0vpXxrEkUj03qKTom6hPn.jpg,pl,"A busy attorney, worried that his anorexic dau...",ZZ-3VAxWwnk,3.250000
19822,127098,4368814,321594,Louis C.K.: Live at the Comedy Store,Louis C.K.: Live at the Comedy Store,False,2015,,66.0,Comedy,/b0gVgI3ptWba6xME5rmCuvFVw0.jpg,/9ULqaND2rzNFL4OAxCbSrY7UWem.jpg,en,This material was developed and prepared over ...,2Lcng7xgOzE,3.800000


#### Adding actors and directors to the movies dataset

In [10]:
people = pd.read_csv('data/Imdb_data/name.basics.tsv', sep='\t', low_memory=False)

# drop all columns except nconst and primaryName
people = people[['nconst', 'primaryName']]

# dropping all rows with NaN values
people = people.dropna()

crew = pd.read_csv('data/Imdb_data/title.crew.tsv', sep='\t', low_memory=False)

# dropping the writers column
crew = crew.drop('writers', axis=1)

# change all \N to NaN
crew.replace(r'\N', pd.NA, inplace=True)

# drop all rows with NaN values
crew = crew.dropna()

# rename directors column to nconst
crew.rename(columns={'directors': 'nconst'}, inplace=True)

# join the crew dataset with the people dataset
crew = crew.merge(people, on='nconst')

# drop the nconst column
crew = crew.drop('nconst', axis=1)

# renaming the primaryName column to directors
crew.rename(columns={'primaryName': 'directors'}, inplace=True)

# renaming the tconst column to imdbId
crew.rename(columns={'tconst': 'imdb_id'}, inplace=True)

# remove all 0s and tt from the start of the imdbId
crew['imdb_id'] = crew['imdb_id'].str[2:]
crew['imdb_id'] = crew['imdb_id'].str.lstrip('0')

# saving the directors dataset
crew.to_csv('data/final_data/directors.csv', index=False)

crew

Unnamed: 0,imdb_id,directors
0,1,William K.L. Dickson
1,2,Émile Reynaud
2,3,Émile Reynaud
3,4,Émile Reynaud
4,5,William K.L. Dickson
...,...,...
5212708,9916848,Hamdi Alkan
5212709,9916850,Hamdi Alkan
5212710,9916852,Hamdi Alkan
5212711,9916856,Johan Planefeldt


In [11]:
# empty list to hold the chunks
chunks = []

# chunk size
chunk_size = 100000

# read the file in chunks
for chunk in pd.read_csv('data/Imdb_data/name.basics.tsv', sep='\t', low_memory=False, chunksize=chunk_size):
    # drop all columns except nconst and primaryName
    chunk = chunk[['nconst', 'primaryName']]
    # drop rows with NaN values
    chunk = chunk.dropna()
    # append the processed chunk to the list
    chunks.append(chunk)

# concatenate all chunks into a single DataFrame
people = pd.concat(chunks, ignore_index=True)

In [12]:
# empty list to hold the chunks
chunks = []

# chunk size
chunk_size = 100000

# read the file in chunks
for chunk in pd.read_csv('data/Imdb_data/title.principals.tsv', sep='\t', low_memory=False, chunksize=chunk_size):
    # drop unnecessary columns
    chunk = chunk.drop(['ordering', 'job', 'characters'], axis=1)
    # filter only actors and actresses
    chunk = chunk[chunk['category'].isin(['actor', 'actress'])]
    # drop the category column
    chunk = chunk.drop('category', axis=1)
    # append the processed chunk to the list
    chunks.append(chunk)

# concatenate all chunks into a single DataFrame
actors = pd.concat(chunks, ignore_index=True)

In [13]:
import pandas as pd

# Load the people DataFrame
people = pd.read_csv('data/Imdb_data/name.basics.tsv', sep='\t', low_memory=False)
people = people[['nconst', 'primaryName']].dropna()

# empty list to hold the chunks
chunks = []

# chunk size
chunk_size = 100000

# read the actors DataFrame in chunks
for chunk in pd.read_csv('data/Imdb_data/title.principals.tsv', sep='\t', low_memory=False, chunksize=chunk_size):
    # drop unnecessary columns
    chunk = chunk.drop(['ordering', 'job', 'characters'], axis=1)
    # filter only actors and actresses
    chunk = chunk[chunk['category'].isin(['actor', 'actress'])]
    # drop the category column
    chunk = chunk.drop('category', axis=1)
    # merge the chunk with the people DataFrame
    chunk = chunk.merge(people, on='nconst')
    # append the processed chunk to the list
    chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
actors = pd.concat(chunks, ignore_index=True)

# Rename columns and clean up the DataFrame
actors.rename(columns={'tconst': 'imdb_id', 'primaryName': 'actors'}, inplace=True)
actors['imdb_id'] = actors['imdb_id'].str[2:].str.lstrip('0')

# Save the actors DataFrame
actors.to_csv('data/final_data/actors.csv', index=False)

actors

Unnamed: 0,imdb_id,nconst,actors
0,5,nm0443482,Charles Kayser
1,5,nm0653042,John Ott
2,7,nm0179163,James J. Corbett
3,7,nm0183947,Peter Courtney
4,8,nm0653028,Fred Ott
...,...,...,...
36914895,9916880,nm2676923,Joanna Ruiz
36914896,9916880,nm2676923,Joanna Ruiz
36914897,9916880,nm2676923,Joanna Ruiz
36914898,9916880,nm1469295,Emma Tate


In [16]:
# merging the actors and directors datasets with the movies dataset

# load the movies dataset
movies = pd.read_csv('data/final_data/movies_full.csv')
# load the directors dataset
directors = pd.read_csv('data/final_data/directors.csv')
# load the actors dataset
actors = pd.read_csv('data/final_data/actors.csv')

# join the actors of same movies
actors = actors.groupby('imdb_id')['actors'].apply(lambda x: ','.join(x)).reset_index()

# join the directors of same movies
directors = directors.groupby('imdb_id')['directors'].apply(lambda x: ','.join(x)).reset_index()

# merge the datasets
movies = movies.merge(directors, on='imdb_id')
movies = movies.merge(actors, on='imdb_id')

# saving the organized dataset
movies.to_csv('data/final_data/movies_full.csv', index=False)

movies

Unnamed: 0,movie_id,imdb_id,tmdb_id,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,backdrop_path,poster_path,original_language,overview,video_key,rating,directors,actors
0,113048,14,82120,The Waterer Watered,L'arroseur arrosé,False,1895,,1.0,"Comedy,Short",/A0hONP6cxTthAsw6IvPMX4Aowkr.jpg,/rSZghvrFWTGqi4UecyG9jimzpEO.jpg,fr,"A gardener is watering his flowers, when a mis...",KXf_Stwe_k4,2.250000,Louis Lumière,"François Clerc,Benoît Duval"
1,32898,417,775,A Trip to the Moon,Le voyage dans la lune,False,1902,,13.0,"Action,Adventure,Comedy",/jeC1kzwWnZTuXl7xF4E5D70BD8c.jpg,/9o0v5LLFk51nyTBHZSre6OB37n2.jpg,fr,Professor Barbenfouillis and five of his colle...,JEGIyo-dKmA,3.738189,Georges Méliès,"Georges Méliès,Georges Méliès,Victor André,Ble..."
2,49389,439,5698,The Great Train Robbery,The Great Train Robbery,False,1903,,11.0,"Action,Adventure,Crime",/99uGRUGS2qxNvPARiM8dkuTrUnW.jpg,/vEYr1sJR1dOFGXwXawpBN6hDRGF.jpg,en,After the train station clerk is assaulted and...,In3mRDX0uqk,3.340909,Edwin S. Porter,"Gilbert M. 'Broncho Billy' Anderson,Gilbert M...."
3,127054,4008,100246,Gertie the Dinosaur,Gertie the Dinosaur,False,1914,,12.0,"Animation,Comedy,Family",/dJQJPvEqs3Yzvam3djqnLJNJ70S.jpg,/fH5c2a3Nti765GqG8jj0uMBLgIK.jpg,en,Although not the first feature-length animated...,32pzHWUTcPc,3.500000,Winsor McCay,"Winsor McCay,George McManus,Roy L. McCardell,T..."
4,93333,4936,53410,The Bank,The Bank,False,1915,,25.0,"Comedy,Short",/zN0gtbECZBHOw4qiMrmDxUwdLm5.jpg,/3L6YgCq5vR9wUgaKy1MzpeqrN9X.jpg,en,A janitor at a bank is in love with a secretar...,A9w36CH5krQ,2.700000,Charles Chaplin,"Charles Chaplin,Edna Purviance,Billy Armstrong..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17472,122147,4186390,315362,Familjen,Familjen,False,2015,,90.0,"Crime,Mystery,Thriller",/xnzsml50TUShiNqDjbxg8qIbSkg.jpg,/fF30DiIwFp14Od9yvpcETI6tXIq.jpg,sv,When a well-known crime boss is murdered by a ...,815mgtrILMY,2.250000,Mårten Klingberg,"Peter Haber,Mikael Persbrandt,Ingvar Hirdwall,..."
17473,129826,4189260,322456,Lego DC Comics Super Heroes: Justice League vs...,Lego DC Comics Super Heroes: Justice League vs...,False,2015,,49.0,"Action,Adventure,Animation",/pbUhiefOhOFaaK06UorRy2rvaGT.jpg,/bp0exZFHBLV0gWNXkBINtymOeom.jpg,en,"Superman’s clone, Bizarro, has become an embar...",V_Ye2lkTY54,2.500000,Brandon Vietti,"Diedrich Bader,Diedrich Bader,Troy Baker,Troy ..."
17474,125918,4189442,305932,Expelled,Expelled,False,2014,,85.0,Comedy,/6L5vre1eJu0dppvk6kGfVuSGNyC.jpg,/9p7VzMhvchlA4WdpgAXBnFg7l2I.jpg,en,Felix is a legendary prankster who gets expell...,h42uyV8al30,2.666667,Alex Goyette,"Cameron Dallas,Matt Shively,Lia Marie Johnson,..."
17475,126482,4191054,302429,Strange Magic,Strange Magic,False,2015,,99.0,"Adventure,Animation,Comedy",/aUbyQhWcPS7S0Su2d9Tgk3NxdPk.jpg,/vjCdrK8gGRFnyuZb1j9BzgN2RaY.jpg,en,A love potion works its devious charms on fair...,E0HD-OAmHw0,3.125000,Gary Rydstrom,"Evan Rachel Wood,Elijah Kelley,Kristin Chenowe..."


### Final changes

In [17]:
# load movies final dataset
movies = pd.read_csv('data/final_data/movies_full.csv')

# get all possible genres
genres = movies['genres'].str.split(',').explode().unique()

print(genres)

['Comedy' 'Short' 'Action' 'Adventure' 'Crime' 'Animation' 'Family'
 'Drama' 'War' 'History' 'Western' 'Romance' 'Horror' 'Mystery' 'Thriller'
 'Fantasy' 'Sci-Fi' 'Documentary' 'Sport' 'Music' 'Musical' 'Biography'
 'Film-Noir' 'Adult' nan 'News']


In [19]:
# find all movies with a specific title
movies = pd.read_csv('data/final_data/movies_full.csv')

# count all movies with the same movie_id
num = movies['movie_id'].value_counts()

# drop all movies with the same movie_id
movies = movies.drop_duplicates(subset='movie_id')

# save the organized dataset
movies.to_csv('data/final_data/movies_full.csv', index=False)

movies

Unnamed: 0,movie_id,imdb_id,tmdb_id,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,backdrop_path,poster_path,original_language,overview,video_key,rating,directors,actors
0,113048,14,82120,The Waterer Watered,L'arroseur arrosé,False,1895,,1.0,"Comedy,Short",/A0hONP6cxTthAsw6IvPMX4Aowkr.jpg,/rSZghvrFWTGqi4UecyG9jimzpEO.jpg,fr,"A gardener is watering his flowers, when a mis...",KXf_Stwe_k4,2.250000,Louis Lumière,"François Clerc,Benoît Duval"
1,32898,417,775,A Trip to the Moon,Le voyage dans la lune,False,1902,,13.0,"Action,Adventure,Comedy",/jeC1kzwWnZTuXl7xF4E5D70BD8c.jpg,/9o0v5LLFk51nyTBHZSre6OB37n2.jpg,fr,Professor Barbenfouillis and five of his colle...,JEGIyo-dKmA,3.738189,Georges Méliès,"Georges Méliès,Georges Méliès,Victor André,Ble..."
2,49389,439,5698,The Great Train Robbery,The Great Train Robbery,False,1903,,11.0,"Action,Adventure,Crime",/99uGRUGS2qxNvPARiM8dkuTrUnW.jpg,/vEYr1sJR1dOFGXwXawpBN6hDRGF.jpg,en,After the train station clerk is assaulted and...,In3mRDX0uqk,3.340909,Edwin S. Porter,"Gilbert M. 'Broncho Billy' Anderson,Gilbert M...."
3,127054,4008,100246,Gertie the Dinosaur,Gertie the Dinosaur,False,1914,,12.0,"Animation,Comedy,Family",/dJQJPvEqs3Yzvam3djqnLJNJ70S.jpg,/fH5c2a3Nti765GqG8jj0uMBLgIK.jpg,en,Although not the first feature-length animated...,32pzHWUTcPc,3.500000,Winsor McCay,"Winsor McCay,George McManus,Roy L. McCardell,T..."
4,93333,4936,53410,The Bank,The Bank,False,1915,,25.0,"Comedy,Short",/zN0gtbECZBHOw4qiMrmDxUwdLm5.jpg,/3L6YgCq5vR9wUgaKy1MzpeqrN9X.jpg,en,A janitor at a bank is in love with a secretar...,A9w36CH5krQ,2.700000,Charles Chaplin,"Charles Chaplin,Edna Purviance,Billy Armstrong..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17451,122147,4186390,315362,Familjen,Familjen,False,2015,,90.0,"Crime,Mystery,Thriller",/xnzsml50TUShiNqDjbxg8qIbSkg.jpg,/fF30DiIwFp14Od9yvpcETI6tXIq.jpg,sv,When a well-known crime boss is murdered by a ...,815mgtrILMY,2.250000,Mårten Klingberg,"Peter Haber,Mikael Persbrandt,Ingvar Hirdwall,..."
17452,129826,4189260,322456,Lego DC Comics Super Heroes: Justice League vs...,Lego DC Comics Super Heroes: Justice League vs...,False,2015,,49.0,"Action,Adventure,Animation",/pbUhiefOhOFaaK06UorRy2rvaGT.jpg,/bp0exZFHBLV0gWNXkBINtymOeom.jpg,en,"Superman’s clone, Bizarro, has become an embar...",V_Ye2lkTY54,2.500000,Brandon Vietti,"Diedrich Bader,Diedrich Bader,Troy Baker,Troy ..."
17453,125918,4189442,305932,Expelled,Expelled,False,2014,,85.0,Comedy,/6L5vre1eJu0dppvk6kGfVuSGNyC.jpg,/9p7VzMhvchlA4WdpgAXBnFg7l2I.jpg,en,Felix is a legendary prankster who gets expell...,h42uyV8al30,2.666667,Alex Goyette,"Cameron Dallas,Matt Shively,Lia Marie Johnson,..."
17454,126482,4191054,302429,Strange Magic,Strange Magic,False,2015,,99.0,"Adventure,Animation,Comedy",/aUbyQhWcPS7S0Su2d9Tgk3NxdPk.jpg,/vjCdrK8gGRFnyuZb1j9BzgN2RaY.jpg,en,A love potion works its devious charms on fair...,E0HD-OAmHw0,3.125000,Gary Rydstrom,"Evan Rachel Wood,Elijah Kelley,Kristin Chenowe..."


In [21]:
# getting the ratings and removing the movies that are not in the movies dataset
ratings = pd.read_csv('data/final_data/ratings_full.csv')
movies = pd.read_csv('data/final_data/movies_full.csv')

# removing movie entries that are not in the movies dataset
ratings = ratings[ratings['movieId'].isin(movies['movie_id'])]

# removing the entries with missing values
ratings = ratings.dropna()

# saving the new ratings dataset
ratings.to_csv('data/final_data/ratings_full.csv', index=False)

ratings

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
5,1,112,3.5
...,...,...,...
19750592,138493,66762,4.5
19750593,138493,68319,4.5
19750595,138493,69526,4.5
19750597,138493,70286,5.0
