This notebook takes a Netflix watch history and builds a "homepage": it finds the genres/categories that best match the history and then, for each of those genres, recommends movies.

In [None]:
# This imports pandas and difflib; difflib is a standard-library module for comparing sequences such as strings

import pandas as pd
import difflib

In [None]:
# Load the two CSV inputs: my Netflix viewing history and the netflix titles data

historyFile = "NetflixViewingHistoryDan.csv"
catalogueFile = "netflix_titles.csv"

dan = pd.read_csv(historyFile)
netflix = pd.read_csv(catalogueFile)

In [None]:
# For every title in the watch history, look up the most similar titles in the netflix data with
# difflib.get_close_matches. With its default arguments that is at most 3 titles per watched item, each with a
# similarity ratio of at least .6. The result is a pandas series whose elements are lists of netflix titles

def closestNetflixTitles(watchedTitle):
    """Return the netflix titles difflib considers close to watchedTitle (possibly an empty list)."""
    return difflib.get_close_matches(watchedTitle, netflix['title'])

bestMatches = dan['Title'].apply(closestNetflixTitles)
bestMatches.head(10)

In [None]:
# This cell collects the genres of every similar movie found in bestMatches. Each match's 'listed_in' string
# is stripped of spaces and split on commas, and the resulting genre names are added to one flat list
# (a genre appears once for every match that carries it)

# Build the title -> 'listed_in' lookup once instead of scanning the whole title column for every match.
# setdefault keeps the first row when a title occurs more than once
genresByTitle = {}
for title, listedIn in zip(netflix['title'], netflix['listed_in']):
    genresByTitle.setdefault(title, listedIn)

genres = []
for matches in bestMatches: # iterate the lists directly so this does not depend on the series' index labels
    for title in matches: # every match is a title taken from the netflix data
        # all spaces are removed (including those inside a genre name) before splitting on commas
        genres += genresByTitle[title].replace(" ", "").split(",")
genres[:10]

In [None]:
# Count how often each genre occurs: wrapping the list in a series gives access to value_counts, which
# orders the genres from most to least common

genreSeries = pd.Series(genres)
genreCounts = genreSeries.value_counts()
genreCounts.head(10)

In [None]:
# The index of the value_counts result holds the genre names, most common first. Pull them out into a
# plain list so the genres can be searched for in the netflix data

listOfGenres = genreCounts.index.tolist()
listOfGenres[:10]

In [None]:
# This cell builds a dictionary that maps every genre to the list of netflix titles listed under it.
# A title listed under several genres is added to each of those genres' lists

genresAndMovies = {}
# Walk the two columns in parallel rather than calling netflix.iloc[i] twice for every row
for listedIn, title in zip(netflix['listed_in'], netflix['title']):
    for gen in listedIn.replace(" ", "").split(","): # strip all spaces, then split the row's genres on commas
        genresAndMovies.setdefault(gen, []).append(title) # create the genre's list on first sight, then append

In [None]:
# This loops through the most common genres (up to 10) and, for each one, collects the movies of that genre
# whose titles are similar to titles in the user's watch history

TOP_GENRE_COUNT = 10 # how many of the most common genres are considered

closeMovies = {}
for key in listOfGenres[:TOP_GENRE_COUNT]: # a slice copes with fewer than 10 genres; indexing by range(10) would raise IndexError
    movies = genresAndMovies.get(key, []) # titles listed under this genre; empty list if the genre is unknown
    # one flat list of every close match, in watch-history order; a title repeats once per watched item it matches
    closeMovies[key] = [
        match
        for watched in dan["Title"]
        for match in difflib.get_close_matches(watched, movies)
    ]


In [None]:
# For every genre, print its name followed by up to 20 of its similar movies. The titles are ordered by
# how many times they were matched (via value_counts), and a blank line separates the genres

for genre, matchedTitles in closeMovies.items():
    print(genre)
    rankedTitles = pd.Series(matchedTitles).value_counts().index
    print(list(rankedTitles[:20]))
    print()