In [17]:
# This file contains code that outputs a ranked list of reccomended shows/movies to watch based on an input of a Netflix
# vieweing history csv that can be downloaded from one's Netflix profile.  To use this code, change the csv file that is
# being fed into the clean_history function.  For the sake of example, my (Jarrett) viewing history file is the default,
# and any other history csv would need to be read into this file.  

# The objective of this algorithm was to use ONLY the information in the Netflix history csv to make a list of
# reccomendations.  That is, it only uses the title of the item watched, the number of episodes watched (if applicable), and
# the date on which particular episodes were watched.  We have access to more useful information, such as genre, rating,
# director, and cast, but the contents of this file intentionally omit them.  This is not intended to be the best algorithm,
# but a test for what is possible using only the contents of the history file.

# Each cell containts a comment explaining its contents.

In [18]:
# All necessary imports

import pandas as pd 
import numpy as np
import nltk
from datetime import date

In [19]:
# Thic cell reads in the database that we are using as the "contents of Netflix" for the sake of this algorithm.

items_df = pd.read_csv("netflix_titles.csv")
items_df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [20]:
# This cell extracts the titles of each item available ot be chosen on Netflix.

all_titles_series = items_df.iloc[:,2]
all_titles_series

0        Dick Johnson Is Dead
1               Blood & Water
2                   Ganglands
3       Jailbirds New Orleans
4                Kota Factory
                ...          
8802                   Zodiac
8803              Zombie Dumb
8804               Zombieland
8805                     Zoom
8806                   Zubaan
Name: title, Length: 8807, dtype: object

In [21]:
# This cell creates a list of titles, where each entry is itself a list of the words in the title of each item.

split_titles_list = []

for i in range(len(all_titles_series)):
    split_title = all_titles_series[i].split()
    split_titles_list.append(split_title)
    
split_titles_list

[['Dick', 'Johnson', 'Is', 'Dead'],
 ['Blood', '&', 'Water'],
 ['Ganglands'],
 ['Jailbirds', 'New', 'Orleans'],
 ['Kota', 'Factory'],
 ['Midnight', 'Mass'],
 ['My', 'Little', 'Pony:', 'A', 'New', 'Generation'],
 ['Sankofa'],
 ['The', 'Great', 'British', 'Baking', 'Show'],
 ['The', 'Starling'],
 ['Vendetta:', 'Truth,', 'Lies', 'and', 'The', 'Mafia'],
 ['Bangkok', 'Breaking'],
 ['Je', 'Suis', 'Karl'],
 ['Confessions', 'of', 'an', 'Invisible', 'Girl'],
 ['Crime', 'Stories:', 'India', 'Detectives'],
 ['Dear', 'White', 'People'],
 ["Europe's", 'Most', 'Dangerous', 'Man:', 'Otto', 'Skorzeny', 'in', 'Spain'],
 ['Falsa', 'identidad'],
 ['Intrusion'],
 ['Jaguar'],
 ['Monsters', 'Inside:', 'The', '24', 'Faces', 'of', 'Billy', 'Milligan'],
 ['Resurrection:', 'Ertugrul'],
 ['Avvai', 'Shanmughi'],
 ['Go!', 'Go!', 'Cory', 'Carson:', 'Chrissy', 'Takes', 'the', 'Wheel'],
 ['Jeans'],
 ['Love', 'on', 'the', 'Spectrum'],
 ['Minsara', 'Kanavu'],
 ['Grown', 'Ups'],
 ['Dark', 'Skies'],
 ['Paranoia'],
 ['Ank

In [22]:
# This cell makes all letters in titles lowercase to make matching words easier later.

lowercase_titles = []
for i in range(len(split_titles_list)):
    current_title = []
    for word in split_titles_list[i]:
        current_title.append(word.lower())
    lowercase_titles.append(current_title)
    
lowercase_titles

[['dick', 'johnson', 'is', 'dead'],
 ['blood', '&', 'water'],
 ['ganglands'],
 ['jailbirds', 'new', 'orleans'],
 ['kota', 'factory'],
 ['midnight', 'mass'],
 ['my', 'little', 'pony:', 'a', 'new', 'generation'],
 ['sankofa'],
 ['the', 'great', 'british', 'baking', 'show'],
 ['the', 'starling'],
 ['vendetta:', 'truth,', 'lies', 'and', 'the', 'mafia'],
 ['bangkok', 'breaking'],
 ['je', 'suis', 'karl'],
 ['confessions', 'of', 'an', 'invisible', 'girl'],
 ['crime', 'stories:', 'india', 'detectives'],
 ['dear', 'white', 'people'],
 ["europe's", 'most', 'dangerous', 'man:', 'otto', 'skorzeny', 'in', 'spain'],
 ['falsa', 'identidad'],
 ['intrusion'],
 ['jaguar'],
 ['monsters', 'inside:', 'the', '24', 'faces', 'of', 'billy', 'milligan'],
 ['resurrection:', 'ertugrul'],
 ['avvai', 'shanmughi'],
 ['go!', 'go!', 'cory', 'carson:', 'chrissy', 'takes', 'the', 'wheel'],
 ['jeans'],
 ['love', 'on', 'the', 'spectrum'],
 ['minsara', 'kanavu'],
 ['grown', 'ups'],
 ['dark', 'skies'],
 ['paranoia'],
 ['ank

In [23]:
# This cell removes colons from titles and any words following a title.  This is necessary for word matching, and the
# removal of subtitles does not make a significant impact on the results.  The probably should remain in an optimal program,
# but this function is copied from a data cleaning function that will appear later that I wrote first, and I didn't feel
# it was worth my time to change the code.

full_titles_list = []
for i in range(len(lowercase_titles)):
    current_item = lowercase_titles[i]
    words = len(current_item)
    j = 0
    while j < words:
        if ":" in current_item[j]:
            current_item[j] = current_item[j].replace(":", "")
            full_titles_list.append(current_item[0:j+1])
            j = words
            # If there is a colon, take only the words before the colon
        else:
            j += 1
            # Move to next word
        if (j == words-1) == True:
            full_titles_list.append(current_item[0:j+1])
            j = words

full_titles_list

[['dick', 'johnson', 'is', 'dead'],
 ['blood', '&', 'water'],
 ['jailbirds', 'new', 'orleans'],
 ['kota', 'factory'],
 ['midnight', 'mass'],
 ['my', 'little', 'pony'],
 ['the', 'great', 'british', 'baking', 'show'],
 ['the', 'starling'],
 ['vendetta'],
 ['bangkok', 'breaking'],
 ['je', 'suis', 'karl'],
 ['confessions', 'of', 'an', 'invisible', 'girl'],
 ['crime', 'stories'],
 ['dear', 'white', 'people'],
 ["europe's", 'most', 'dangerous', 'man'],
 ['falsa', 'identidad'],
 ['monsters', 'inside'],
 ['resurrection'],
 ['avvai', 'shanmughi'],
 ['go!', 'go!', 'cory', 'carson'],
 ['love', 'on', 'the', 'spectrum'],
 ['minsara', 'kanavu'],
 ['grown', 'ups'],
 ['dark', 'skies'],
 ['ankahi', 'kahaniya'],
 ['chicago', 'party', 'aunt'],
 ['sex', 'education'],
 ['squid', 'game'],
 ['tayo', 'and', 'little', 'wizards'],
 ['the', 'father', 'who', 'moves', 'mountains'],
 ['the', 'stronghold'],
 ['angry', 'birds'],
 ['birth', 'of', 'the', 'dragon'],
 ['chhota', 'bheem'],
 ['he-man', 'and', 'the', 'maste

In [24]:
# Creates list of "stop words."  These are common words that don't convey much information and might mess with our ranking
# system that we will develop shortly.

stop_words = nltk.corpus.stopwords.words('english')

In [25]:
# Removes stop words from titles

short_titles_list = full_titles_list
for word in stop_words:
    for i in range(len(short_titles_list)):
        for title_word in short_titles_list[i]:
            if title_word == word:
                short_titles_list[i].remove(word)

short_titles_list

[['dick', 'johnson', 'dead'],
 ['blood', '&', 'water'],
 ['jailbirds', 'new', 'orleans'],
 ['kota', 'factory'],
 ['midnight', 'mass'],
 ['little', 'pony'],
 ['great', 'british', 'baking', 'show'],
 ['starling'],
 ['vendetta'],
 ['bangkok', 'breaking'],
 ['je', 'suis', 'karl'],
 ['confessions', 'invisible', 'girl'],
 ['crime', 'stories'],
 ['dear', 'white', 'people'],
 ["europe's", 'dangerous', 'man'],
 ['falsa', 'identidad'],
 ['monsters', 'inside'],
 ['resurrection'],
 ['avvai', 'shanmughi'],
 ['go!', 'go!', 'cory', 'carson'],
 ['love', 'spectrum'],
 ['minsara', 'kanavu'],
 ['grown', 'ups'],
 ['dark', 'skies'],
 ['ankahi', 'kahaniya'],
 ['chicago', 'party', 'aunt'],
 ['sex', 'education'],
 ['squid', 'game'],
 ['tayo', 'little', 'wizards'],
 ['father', 'moves', 'mountains'],
 ['stronghold'],
 ['angry', 'birds'],
 ['birth', 'dragon'],
 ['chhota', 'bheem'],
 ['he-man', 'masters', 'universe'],
 ['jaws', '2'],
 ['jaws', '3'],
 ['jaws'],
 ['heroes', 'cowboys'],
 ['safe', 'house'],
 ['smart'

In [26]:
# This is the function contained in the data_cleaning_function.ipynb file.  It takes a Netflix veiwing history file as an
# input and returns a list of shows/movies the user has watched, a list of the number of episodes of each item watched (it
# returns 1 for movies), and a list of the average number of days since watching an episode of each item.  This information
# is all we use to make the rankings that this file returns.

def clean_history(viewing_history):
    split_titles_list = []

    for i in range(len(viewing_history)):
        split_title = viewing_history.iloc[i,0].split()
        split_titles_list.append(split_title)
    
    items_watched = len(split_titles_list)
    
    short_titles_list = []
    for i in range(items_watched):
        current_item = split_titles_list[i]
        words = len(current_item)
        j = 0
        while j < words:
            if ":" in current_item[j]:
                current_item[j] = current_item[j].replace(":", "")
                short_titles_list.append(current_item[0:j+1])
                j = words
                # If there is a colon, take only the words before the colon
            else:
                j += 1
                # Move to next word
            if (j == words-1) == True:
                short_titles_list.append(current_item[0:j+1])
                j = words
                # If there is no colon by the end of the word, take the whole title (it's a movie and doesn't need shortening)
                

    unique_titles = []
    for i in range(items_watched):
        if short_titles_list[i] not in unique_titles:
            unique_titles.append(short_titles_list[i])
            
    lowercase_titles = []
    for i in range(len(unique_titles)):
        current_title = []
        for word in unique_titles[i]:
            current_title.append(word.lower())
        lowercase_titles.append(current_title)
    
    stop_words = nltk.corpus.stopwords.words('english')
    for word in stop_words:
        for i in range(len(lowercase_titles)):
            for title_word in lowercase_titles[i]:
                if title_word == word:
                    lowercase_titles[i].remove(word)

    number_episodes = []
    for title in unique_titles:
        title_count = 0
        for title_2 in short_titles_list:
            if title == title_2:
                title_count += 1
        number_episodes.append(title_count)
        
    view_dates = []
    for i in range(len(viewing_history)):
        view_date_list = viewing_history.iloc[i,1].split("/")
        view_dates.append(view_date_list)

    days_since_viewing = []
    today = date.today()
    for j in range(len(view_dates)):
        view_date = date(int(view_dates[j][2])+2000, int(view_dates[j][0]), int(view_dates[j][1]))
        delta = today - view_date
        days_since_viewing.append(delta.days)
        
    average_days_since_viewing = []
    for title in unique_titles:
        duplicate_title_index_list = []
        for i in range(items_watched):
            if short_titles_list[i] == title:
                duplicate_title_index_list.append(i)
        days_since_viewing_all_duplicates = []
        for j in range(len(duplicate_title_index_list)):
            days_since_viewing_all_duplicates.append(days_since_viewing[duplicate_title_index_list[j]])
        avg = np.mean(days_since_viewing_all_duplicates)
        average_days_since_viewing.append(avg)
        
    return lowercase_titles, number_episodes, average_days_since_viewing

In [27]:
# THIS IS THE CELL THAT CHANGES FOR OTHER HISTORY FILES!!!
# This cell reads in the specified Netflix history csv and runs the function that extracts the necessary information
# from it.

viewing_history_jarrett = pd.read_csv("JarrettNetflixViewingHistory.csv")
viewing_history_jarrett

history_titles, history_number_episodes, history_average_days = clean_history(viewing_history_jarrett)

In [28]:
# Creates index list of shows watched and the Netflix library to make looping easier.

history_index_list = []
for i in range(len(history_titles)):
    history_index_list.append(i)

items_index_list = []
for j in range(len(short_titles_list)):
    items_index_list.append(j)

print(history_index_list)
print(items_index_list)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,

In [29]:
# This cell is "the algorithm" of this file.  It creates and populates a dictionary that contains the index corresponding
# to each show/movie a user has watched as its keys.  For its values, it creates another dictionary containing all
# shows/movies in the Netflix library that share a word in the title of the show in the user's watch history as its keys
# and a weight score based on the number of episodes watched and average days since watching of the series that generated
# the corresponding reccomendation. 

# In simplified terms, it searches for similarly titled shows to the user's history in the Netflix library and assigns
# them a score for how much it thinks the user would like it based on how much and how recently the user watched the item
# that generated the reccomendation.

viewed_item_dict = {}

for i in range(len(history_titles)):
    title_values_dict = {}
    null_dict = {"None":"None"}
    for history_word in history_titles[i]:
        for j in range(len(short_titles_list)):
            for item_word in short_titles_list[j]:
                if history_word == item_word:
                    title_values_dict[j] = history_number_episodes[i]/history_average_days[i]
                    viewed_item_dict[i] = title_values_dict
            if i not in viewed_item_dict:
                viewed_item_dict[i] = null_dict

viewed_item_dict

{0: {1483: 0.38667929292929293,
  3098: 0.38667929292929293,
  3437: 0.38667929292929293,
  3689: 0.38667929292929293,
  3814: 0.38667929292929293,
  3947: 0.38667929292929293,
  4353: 0.38667929292929293,
  4610: 0.38667929292929293,
  6021: 0.38667929292929293},
 1: {1483: 0.10440456769983687,
  3098: 0.10440456769983687,
  3437: 0.10440456769983687,
  3689: 0.10440456769983687,
  3814: 0.10440456769983687,
  3947: 0.10440456769983687,
  4353: 0.10440456769983687,
  4610: 0.10440456769983687,
  6021: 0.10440456769983687},
 2: {96: 0.46967866634452765,
  499: 0.46967866634452765,
  2759: 0.46967866634452765,
  3550: 0.46967866634452765,
  3557: 0.46967866634452765,
  3573: 0.46967866634452765,
  3598: 0.46967866634452765,
  3994: 0.46967866634452765,
  4049: 0.46967866634452765,
  4251: 0.46967866634452765,
  4290: 0.46967866634452765,
  4291: 0.46967866634452765,
  4520: 0.46967866634452765,
  4613: 0.46967866634452765,
  4684: 0.46967866634452765,
  5102: 0.46967866634452765,
  6550

In [30]:
# This cell converts the information in the dictionary created above into two lists; one containts the reccomended show
# indecies and the other contains the scores for the corresponding shows.

reccomended_index_list = []
reccomended_score_list = []
for i in range(len(viewed_item_dict)):
    keys = list(viewed_item_dict[i].keys())
    scores = list(viewed_item_dict[i].values())
    for j in range(len(keys)):
        if keys[j] not in reccomended_index_list:
            if type(keys[j]) == int:
                reccomended_index_list.append(keys[j])
                reccomended_score_list.append(scores[j])
            
print(reccomended_index_list)
print(reccomended_score_list)

[1483, 3098, 3437, 3689, 3814, 3947, 4353, 4610, 6021, 96, 499, 2759, 3550, 3557, 3573, 3598, 3994, 4049, 4251, 4290, 4291, 4520, 4613, 4684, 5102, 6550, 6551, 20, 130, 131, 192, 208, 315, 411, 429, 518, 550, 552, 610, 628, 662, 728, 858, 895, 1009, 1104, 1130, 1186, 1237, 1272, 1335, 1390, 1421, 1422, 1449, 1538, 1569, 1637, 1692, 1805, 1931, 2031, 2089, 2158, 2159, 2161, 2163, 2175, 2191, 2234, 2276, 2340, 2384, 2413, 2423, 2430, 2438, 2446, 2459, 2515, 2633, 2734, 2787, 2822, 2831, 2838, 2848, 2852, 2942, 3006, 3024, 3048, 3049, 3264, 3275, 3277, 3313, 3325, 3333, 3338, 3359, 3416, 3435, 3438, 3507, 3518, 3528, 3585, 3602, 3631, 3682, 3823, 3871, 4096, 4097, 4120, 4147, 4381, 4451, 4559, 4567, 4572, 4648, 4677, 4678, 4679, 4680, 4709, 4710, 4713, 4872, 4955, 4956, 4985, 5004, 5136, 5145, 5268, 5391, 5445, 5532, 5534, 5558, 5743, 5746, 5955, 5967, 5968, 5969, 5970, 5971, 5972, 5973, 5974, 5975, 5976, 5977, 6204, 6253, 6405, 6845, 6882, 7006, 7013, 803, 991, 998, 1690, 1816, 1980, 213

In [31]:
# This cell orders the reccomended shows such that those with the highest rankings occur first on the list and those with
# the lowest occur last.

ordered_reccomended_index = []
ordered_reccomended_score = []
for i in range(len(reccomended_score_list)):
    max_score = max(reccomended_score_list)
    top_index = reccomended_score_list.index(max_score)
    ordered_reccomended_index.append(reccomended_index_list[top_index])
    ordered_reccomended_score.append(reccomended_score_list[top_index])
    reccomended_index_list.remove(reccomended_index_list[top_index])
    reccomended_score_list.remove(reccomended_score_list[top_index])
    
print(ordered_reccomended_index)
print(ordered_reccomended_score)

[96, 499, 2759, 3550, 3557, 3573, 3598, 3994, 4049, 4251, 4290, 4291, 4520, 4613, 4684, 5102, 6550, 6551, 1483, 3098, 3437, 3689, 3814, 3947, 4353, 4610, 6021, 1184, 1505, 20, 130, 131, 192, 208, 315, 411, 429, 518, 550, 552, 610, 628, 662, 728, 858, 895, 1009, 1104, 1130, 1186, 1237, 1272, 1335, 1390, 1421, 1422, 1449, 1538, 1569, 1637, 1692, 1805, 1931, 2031, 2089, 2158, 2159, 2161, 2163, 2175, 2191, 2234, 2276, 2340, 2384, 2413, 2423, 2430, 2438, 2446, 2459, 2515, 2633, 2734, 2787, 2822, 2831, 2838, 2848, 2852, 2942, 3006, 3024, 3048, 3049, 3264, 3275, 3277, 3313, 3325, 3333, 3338, 3359, 3416, 3435, 3438, 3507, 3518, 3528, 3585, 3602, 3631, 3682, 3823, 3871, 4096, 4097, 4120, 4147, 4381, 4451, 4559, 4567, 4572, 4648, 4677, 4678, 4679, 4680, 4709, 4710, 4713, 4872, 4955, 4956, 4985, 5004, 5136, 5145, 5268, 5391, 5445, 5532, 5534, 5558, 5743, 5746, 5955, 5967, 5968, 5969, 5970, 5971, 5972, 5973, 5974, 5975, 5976, 5977, 6204, 6253, 6405, 6845, 6882, 7006, 7013, 803, 991, 998, 1690, 181

In [32]:
# Finally, this cell prints out the names of the shows/movies it reccomends to the user in the order that it reccomends
# them based on their scores.  Again, note that this method only used the information contained in the Netflix history
# csv, so it cannot account for things like genre.  Because of this, it makes some good predictions and some asinine
# predictions.  In this case, for example, it correctly predicts that I would like Star Trek and Star Wars, but its
# thinks that the word "star" is what causes me to like them, so it also reccomends nonsense like "Barbie Star Light
# Adventure."  Another thing to note is that this does not rank every item in the Netflix library.  If a show contains
# no common words in the title with anything that I have watched, the it is deemed too different from anything I've watched
# and not given a score.

final_reccomendations = []
for i in range(len(ordered_reccomended_index)):
    final_reccomendations.append(full_titles_list[ordered_reccomended_index[i]])
    
final_reccomendations

[['bright', 'star'],
 ['star', 'trek'],
 ['holly', 'star'],
 ['5', 'star', 'christmas'],
 ['super', 'monsters', 'wish', 'star'],
 ['look', 'star'],
 ['puppy', 'star', 'christmas'],
 ['pup', 'star'],
 ['star', 'trek'],
 ['pup', 'star'],
 ['star', 'trek'],
 ['star', 'trek'],
 ['frat', 'star'],
 ['star', 'trek'],
 ['pup', 'star'],
 ['barbie', 'star', 'light', 'adventure'],
 ['star', 'men'],
 ['star', 'wars'],
 ["marvel's", 'agents', 's.h.i.e.l.d.'],
 ["marvel's", 'jessica', 'jones'],
 ["marvel's", 'punisher'],
 ["marvel's", 'daredevil'],
 ["marvel's", 'iron', 'fist'],
 ["marvel's", 'luke', 'cage'],
 ["marvel's", 'defenders'],
 ["marvel's", 'hulk'],
 ["marvel's", 'iron', 'man', '&', 'hulk'],
 ['creating', "queen's", 'gambit'],
 ["queen's", 'gambit'],
 ['love', 'spectrum'],
 ['love', 'cost', 'thing'],
 ['love', 'puff'],
 ['really', 'love'],
 ['man', 'love'],
 ['resort', 'love'],
 ['lethal', 'love'],
 ['little', 'love', 'mine'],
 ['fools', 'fall', 'love'],
 ['love', 'story'],
 ['bangkok', 'l