In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date

In [2]:
# Load the exported Netflix viewing history into a DataFrame.
# The file is expected to sit next to this notebook.
history_csv_path = "NetflixViewingHistory.csv"
viewing_history = pd.read_csv(history_csv_path)

# Leave the frame as the last expression so the notebook renders it.
viewing_history

Unnamed: 0,Title,Date
0,Marvel's Daredevil: Season 3: Revelations,2/28/22
1,Marvel's Daredevil: Season 3: Upstairs/Downstairs,2/28/22
2,Marvel's Daredevil: Season 3: Aftermath,2/24/22
3,Marvel's Daredevil: Season 3: The Devil You Know,2/24/22
4,Marvel's Daredevil: Season 3: The Perfect Game,2/21/22
...,...,...
160,The Social Dilemma,10/26/20
161,The West Wing: Season 1: Pilot,10/26/20
162,The Universe: Season 2: Alien Planets,10/26/20
163,Jeopardy!: Seth Wilson Collection: Episode #7361,4/22/20


In [3]:
# Break every title into its whitespace-separated words.
# Result: one list of strings per row of the viewing history, in the same order.
# The title is taken from the first column (position 0) of each row.

split_titles_list = [raw_title.split() for raw_title in viewing_history.iloc[:, 0]]

split_titles_list

[["Marvel's", 'Daredevil:', 'Season', '3:', 'Revelations'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'Upstairs/Downstairs'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'Aftermath'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'The', 'Devil', 'You', 'Know'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'The', 'Perfect', 'Game'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'Blindsided'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'No', 'Good', 'Deed'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'Please'],
 ["Marvel's", 'Daredevil:', 'Season', '3:', 'Resurrection'],
 ["Marvel's", 'The', 'Defenders:', 'Limited', 'Series:', 'The', 'Defenders'],
 ["Marvel's",
  'The',
  'Defenders:',
  'Limited',
  'Series:',
  'Fish',
  'in',
  'the',
  'Jailhouse'],
 ["Marvel's", 'The', 'Defenders:', 'Limited', 'Series:', 'Ashes,', 'Ashes'],
 ["Marvel's", 'The', 'Defenders:', 'Limited', 'Series:', 'Take', 'Shelter'],
 ["Marvel's", 'The', 'Defenders:', 'Limited', 'Series:', 'Royal', 'Dragon'],
 ["Marve

In [4]:
# Number of viewed items (one per entry of split_titles_list, i.e. one per title that was split).
# Later cells use this as the upper bound when indexing the per-item lists.

items_watched = len(split_titles_list)
items_watched

165

In [5]:
# Create list of shortened titles (removes season numbers and episode names from series)

def shorten_title(title_words):
    """Return the leading words of a title, up to and including the first word containing ":".

    Series entries look like "Show: Season N: Episode", so everything after the
    first colon-bearing word is dropped and the colon is stripped from that word.
    A title with no colon at all (a movie) is returned whole.

    Parameters
    ----------
    title_words : list of str
        One title, already split on whitespace.

    Returns
    -------
    list of str
        A new list; ``title_words`` itself is never modified, so this cell can be
        re-run without changing its result.
    """
    for position, word in enumerate(title_words):
        if ":" in word:
            # Keep the words before the colon plus the colon-bearing word, minus the colon.
            return title_words[:position] + [word.replace(":", "")]
    # No colon anywhere: nothing to shorten.  This also covers one-word titles.
    return list(title_words)


# Exactly one shortened title per viewed item, in the same order as split_titles_list,
# so positions in this list line up with the other per-item lists.
short_titles_list = [shorten_title(title_words) for title_words in split_titles_list]

short_titles_list

[["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'The', 'Defenders'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Daredevil'],
 ["Marvel's", 'Dared

In [6]:
# Making sure the correct number of entries made it through: there must be exactly one
# shortened title per viewed item, otherwise the per-item lists used later are misaligned.

assert len(short_titles_list) == items_watched, (
    "short_titles_list has a different number of entries than items_watched"
)
print(len(short_titles_list))

165


In [7]:
# Build two parallel lists:
#   unique_titles   - each shortened title once, in order of first appearance
#   number_episodes - how many viewed items carry that shortened title (movies will only have one)
# The episode count could later serve as a weight, so that we are more likely to recommend
# items similar to shows the viewer watched many episodes of, instead of treating them the
# same as a single episode.
# Both lengths are printed as a quick check that the two lists match.
# These could be combined into a dictionary or 2D array if that turns out to be easier.

unique_titles = []
for position in range(items_watched):
    candidate = short_titles_list[position]
    if candidate in unique_titles:
        continue
    unique_titles.append(candidate)

# Count every occurrence of each unique title across all shortened titles.
number_episodes = [short_titles_list.count(title) for title in unique_titles]

print(unique_titles)
print("Length:", len(unique_titles))
print(number_episodes)
print("Length:", len(number_episodes))

[["Marvel's", 'Daredevil'], ["Marvel's", 'The', 'Defenders'], ['Star', 'Trek'], ['For', 'the', 'Love', 'of', 'Spock'], ['Get', 'Me', 'Roger', 'Stone'], ['The', "Queen's", 'Gambit'], ['The', 'Social', 'Dilemma'], ['The', 'West', 'Wing'], ['The', 'Universe'], ['Jeopardy!'], ['Tiger', 'King']]
Length: 11
[35, 8, 108, 1, 1, 7, 1, 1, 1, 1, 1]
Length: 11


In [8]:
# Calculate days since viewing each item; we have the information and might as well make a
# recency bias.  For example, the first viewing in my history is the pilot episode of Tiger
# King, which I watched once a long time ago and haven't since watched anything like it.  It
# should not have the same weight on my recommendations as what I was watching 2 months ago.

# The date is taken from the second column (position 1) and split on "/" into
# [month, day, two-digit year] strings.
view_dates = [raw_date.split("/") for raw_date in viewing_history.iloc[:, 1]]

# The two-digit year is offset by 2000 to get the full year.
today = date.today()
days_since_viewing = [
    (today - date(int(parts[2]) + 2000, int(parts[0]), int(parts[1]))).days
    for parts in view_dates
]

print(days_since_viewing)

[38, 38, 42, 42, 45, 50, 52, 55, 57, 58, 59, 61, 61, 61, 61, 66, 66, 67, 67, 68, 69, 70, 70, 70, 73, 76, 76, 76, 78, 79, 93, 93, 93, 95, 95, 95, 95, 98, 99, 100, 104, 104, 104, 105, 112, 112, 113, 113, 114, 115, 119, 136, 138, 140, 140, 145, 146, 148, 149, 153, 153, 153, 155, 157, 163, 164, 164, 165, 166, 167, 168, 169, 173, 176, 177, 180, 183, 183, 185, 186, 189, 189, 190, 192, 195, 198, 199, 201, 204, 206, 207, 212, 215, 220, 224, 225, 225, 226, 227, 228, 231, 232, 233, 234, 234, 235, 236, 236, 238, 241, 241, 243, 244, 245, 247, 248, 250, 252, 253, 253, 255, 256, 258, 258, 259, 259, 262, 262, 262, 263, 264, 266, 268, 270, 273, 274, 275, 276, 277, 277, 279, 280, 280, 283, 283, 283, 285, 286, 289, 289, 375, 380, 381, 517, 520, 523, 524, 524, 524, 525, 528, 528, 528, 715, 715]


In [16]:
# For every unique title, average the days-since-viewing of all viewed items that share
# that shortened title.  The result is parallel to unique_titles.
average_days_since_viewing = []
for title in unique_titles:
    matching_days = [
        days_since_viewing[position]
        for position in range(items_watched)
        if short_titles_list[position] == title
    ]
    average_days_since_viewing.append(np.mean(matching_days))

print(len(average_days_since_viewing))
average_days_since_viewing

11


[75.51428571428572,
 61.625,
 214.94444444444446,
 259.0,
 380.0,
 522.4285714285714,
 528.0,
 528.0,
 528.0,
 715.0,
 715.0]