# Modeling & Data Storage

In [2]:
#imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [5]:
#reading in data
df = pd.read_csv('data/all_platforms_clean_for_modeling.csv',index_col=0)

In [6]:
df.head()

Unnamed: 0,movie,year,age_rating,imdb_fan_score,rt_critic_score,genres,country,language,movie_len_mins,rt_audience_score,actors,desc,sub_genre,top_genre,directors_ordinal
0,Inception,2010,3.0,0.88,0.87,"Action,Adventure,Sci-Fi,Thriller",United States,English,148.0,0.91,"['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...",,Adventure,Action,4.0
1,The Matrix,1999,5.0,0.87,0.87,"Action,Sci-Fi",United States,English,136.0,0.85,"['Keanu Reeves', 'Laurence Fishburne', 'Carrie...",,Sci-Fi,Action,5.0
2,Avengers: Infinity War,2018,3.0,0.85,0.84,"Action,Adventure,Sci-Fi",United States,English,149.0,0.91,"['Robert Downey Jr.', 'Chris Hemsworth', 'Mark...",,Adventure,Action,5.0
3,Back to the Future,1985,2.0,0.85,0.96,"Adventure,Comedy,Sci-Fi",United States,English,116.0,0.94,"['Michael J. Fox', 'Christopher Lloyd', 'Lea T...",,Comedy,Adventure,8.0
4,"The Good, the Bad and the Ugly",1966,5.0,0.88,0.97,Western,Italy,Italian,161.0,0.97,"['Eli Wallach', 'Clint Eastwood', 'Lee Van Cle...",,,Western,2.0


## Cleaning and Preparing for Modeling

In [7]:
#cleaning up formating of actors feature
df['actors'] = df['actors'].str.replace('[','')

In [8]:
df['actors'] = df['actors'].str.replace(']','')

In [9]:
df['actors'] = df['actors'].str.replace("'","")

In [11]:
#function that returns either the top actor or NaN for each movie
def lead_actor(actors):
    try:
        return actors.split(',')[0]
    except:
        return np.nan

In [12]:
#applying function to `actors` to create new feature `lead_actor`
df['lead_actor'] = df['actors'].apply(lead_actor)

## Modeling

In [70]:
# creating mask for all cvec objects
mask = (text['lead_actor'].isna() == False) & (text['lead_actor']!= 'no actor data') & (text['directors_ordinal'].isna() == False) & (text['top_genre'].isna() == False)

In [73]:
# fitting cvec model on description data
text = df[['desc','movie','lead_actor','directors_ordinal','top_genre']][df['desc'].isna() == False].set_index('movie')
X = text['desc'][mask]
cvec = CountVectorizer()
X = cvec.fit_transform(X)

In [74]:
#creating cvec df with description data
cvec_df = pd.DataFrame(X.todense(),columns=cvec.get_feature_names())

In [75]:
cvec_df.head()

Unnamed: 0,000,000th,10,100,1000,100th,101,102,108,10th,...,zulfi,zuniga,zurich,zus,zz,âeurâ,çanakkale,çinar,çok,özay
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
#creating second cvec object with lead_actor data
X2 = text['lead_actor'][mask]
X2 = cvec.fit_transform(X2)

In [77]:
#creating cvec df for second cvec object to combine with description data
cvec_df2 = pd.DataFrame(X2.todense(),columns=cvec.get_feature_names())

In [78]:
cvec_df2.head()

Unnamed: 0,2na,2pac,3000,50,aadhi,aadi,aakash,aames,aamir,aangrish,...,émilie,éric,ólafsson,ólafur,óscar,ôtani,öykü,özyurtlu,ümit,þóra
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
#combining both description and actor cvec df
combined_df = pd.concat([cvec_df,cvec_df2],axis=1)

In [79]:
#creating df for directors that align with cvec df
direct = text['directors_ordinal'][mask]

In [82]:
#setting index so the data matches up
direct.index = range(0,8322)

In [83]:
#combining director data with description and actor data
combined_df = pd.concat([combined_df,direct],axis=1)

In [84]:
#creating cvec object for genre data
X3 = text['top_genre'][mask]
X3 = cvec.fit_transform(X3)

In [85]:
#creating cvec df for genre data
cvec_df3 = pd.DataFrame(X3.todense(),columns=cvec.get_feature_names())

In [86]:
cvec_df3.head()

Unnamed: 0,action,adventure,animation,biography,comedy,crime,documentary,drama,family,fantasy,...,romance,sci,short,show,sport,talk,thriller,tv,war,western
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
#combining all data df with genre data df
combined_df = pd.concat([combined_df,cvec_df3],axis=1)

In [88]:
#creating cosine similarity table and recommendation df for dataset
table = sparse.csr_matrix(combined_df)
similarity_table = cosine_similarity(table)
recs = pd.DataFrame(similarity_table,columns=text[mask].index,
                    index=text[mask].index)
recs.head()

movie,Jim Gaffigan: Mr. Universe,God's Not Dead,Don't Go Breaking My Heart,Zero,24 Hours to Live,Dream Boat,A Family Man,Into the Inferno,The Nutcracker and the Four Realms,Frank & Lola,...,Flesh Eating Mothers,Armaan,Flight to Hong Kong,Cut!,Making Marines,"Krakatoa, East of Java",Boppin' at the Glue Factory,Caged,In the Blood,Kilometre Zero
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jim Gaffigan: Mr. Universe,1.0,0.351615,0.619898,0.203316,0.244451,0.202285,0.321025,0.916554,0.745992,0.353161,...,0.255031,0.188598,0.212566,0.406714,0.158299,0.448219,0.275086,0.447175,0.765705,0.195362
God's Not Dead,0.351615,1.0,0.324967,0.176304,0.095893,0.112594,0.154266,0.348481,0.3045,0.226278,...,0.087538,0.177559,0.051503,0.145671,0.065202,0.260025,0.103005,0.243778,0.399491,0.185695
Don't Go Breaking My Heart,0.619898,0.324967,1.0,0.207687,0.161374,0.212219,0.222522,0.599479,0.503115,0.261116,...,0.206239,0.209165,0.173344,0.318689,0.131671,0.29756,0.208013,0.401129,0.566139,0.166667
Zero,0.203316,0.176304,0.207687,1.0,0.091928,0.331012,0.380282,0.185596,0.300755,0.247911,...,0.111891,0.354619,0.263323,0.209472,0.208353,0.348984,0.098746,0.34622,0.134377,0.237356
24 Hours to Live,0.244451,0.095893,0.161374,0.091928,1.0,0.031311,0.091928,0.269191,0.23094,0.13484,...,0.121716,0.061721,0.035806,0.101274,0.06044,0.14462,0.143223,0.150649,0.233882,0.043033


In [306]:
#checking cosine similarity scores for test case
recs['Jim Gaffigan: Mr. Universe'].sort_values(ascending=False)[1:11]

movie
Todd Barry: Spicy Honey             0.946077
Chris Porter: Ugly and Angry        0.945417
Bob Saget: Zero to Sixty            0.945386
Tom Segura: Completely Normal       0.944648
Kathleen Madigan: Madigan Again     0.944275
Brad Paisley's Comedy Rodeo         0.943055
D.L. Hughley: Clear                 0.942870
Chinatown Kid                       0.942827
Jim Breuer: And Laughter for All    0.942700
Lucas Brothers: On Drugs            0.942528
Name: Jim Gaffigan: Mr. Universe, dtype: float64

In [42]:
#func that returns top ten values of a series and their index
def recommend(movie,rec_df):
    print('Here are some movies you might like:')
    top_ten = rec_df[movie].sort_values(ascending=False)[1:11].index.to_list()
    for i in range(10):
        print(f'{i+1}. {top_ten[i]}')

In [69]:
#examining recs for test case to see if they make sense
recommend("Jim Gaffigan: Mr. Universe",recs)

Here are some movies you might like:
1. Todd Barry: Spicy Honey
2. Chris Porter: Ugly and Angry
3. Bob Saget: Zero to Sixty
4. Kathleen Madigan: Madigan Again
5. Tom Segura: Completely Normal
6. Brad Paisley's Comedy Rodeo
7. Chinatown Kid
8. D.L. Hughley: Clear
9. Lucas Brothers: On Drugs
10. Todd Glass Stand-Up Special


# Data Storage in JSON files

## Creating JSON containing 10 recs for Each Movie in Dataset
Also contains links to each of the platforms the title is on.

In [140]:
#function that creates json containing movie title and 10 ten recommendations
def get_recs_metadata(df):
    json = {}
    data = []
    all_movies = df.columns.to_list()
    for movie in all_movies:
        movie_meta = {}
        top_ten = df[movie].sort_values(ascending=False)[1:11].index.to_list()
        movie_meta['name'] = movie
        movie_meta['recs'] = top_ten
        data.append(movie_meta)
    json['data'] = data
    return json

In [141]:
#creating metadata dictionary
metadata = get_recs_metadata(recs)

## Collecting Streaming Links -- DID NOT INCLUDE ON WEBSITE

In [160]:
#creating list of movies included in recommendations df
movies = recs.columns.to_list()

In [101]:
#loading in data with streaming data info
streaming_data = pd.read_csv('data/total_webscraped.csv',index_col=0)

In [106]:
#filtering data to only include movies in recommendations df
streaming_data = streaming_data[['movie','netflix','hulu','prime_video','disney_plus']][df['movie'].isin(movies)]

In [107]:
streaming_data.head()

Unnamed: 0,movie,netflix,hulu,prime_video,disney_plus
0,Jim Gaffigan: Mr. Universe,1,0,0,0
1,God's Not Dead,1,0,1,0
3,Don't Go Breaking My Heart,1,0,0,0
4,Zero,1,0,1,0
5,24 Hours to Live,1,0,0,0


In [114]:
#function that searches for movie links on each streaming platform for every movie in the
#recommendations df
def platform_link(df):
    from googlesearch import search
    
    streaming_data = []
    for index, row in df.iterrows():
        movie = {}
        if row['netflix'] == 1:
            for link in search(f'{row["movie"]} netflix',stop=1):
                movie['netflix'] = link
        if row['hulu'] == 1:
            for link in search(f'{row["movie"]} hulu',stop=1):
                movie['hulu'] = link
        if row['prime_video'] == 1:
            for link in search(f'{row["movie"]} prime video',stop=1):
                movie['prime_video'] = link
        if row['disney_plus'] == 1:
            for link in search(f'{row["movie"]} disney plus',stop=1):
                movie['disney_plus'] = link
        streaming_data.append({row['movie']:movie})
    return streaming_data

In [120]:
#timing sample from dataset to estimate how long it would take for the entire dataset
%%time
test = platform_link(streaming_data.sample(10,random_state=21))

CPU times: user 625 ms, sys: 57.8 ms, total: 683 ms
Wall time: 36.5 s


In [145]:
#creating list of unique movie titles to check how long it would take to search all links
tops = metadata['data']
unique_movies = []
for entry in tops:
    movies = entry['recs']
    for title in movies:
        if title not in unique_movies:
            unique_movies.append(title)

In [148]:
#computed time in days
len(unique_movies)*36.5/60/60/24

2.282939814814815

## Checking Which Streaming Platforms Each Movie is On

In [245]:
#reseting index of streaming data to avoid overwriting in dictionary with index as key
streaming_data.reset_index(drop=True,inplace=True)

In [246]:
#creating list of indexes in the df
movie_idxs = streaming_data.index.to_list()

In [247]:
#turning the df into a dictionary to more easily extract metadata
streaming_data_dict = streaming_data.to_dict()

In [249]:
#creating dictionary that contains movie title as key and list of streaming platforms as value
movies_with_streaming_data = {}
for movie in movie_idxs:
    movie_platforms = []
    if streaming_data_dict['netflix'][movie] == 1:
        movie_platforms.append('netflix')
    if streaming_data_dict['hulu'][movie] == 1:
        movie_platforms.append('hulu')
    if streaming_data_dict['prime_video'][movie] == 1:
        movie_platforms.append('prime_video')
    if streaming_data_dict['disney_plus'][movie] == 1:
        movie_platforms.append('disney_plus')
    movies_with_streaming_data[streaming_data_dict['movie'][movie]] = movie_platforms

## Combining Metadata

In [149]:
import json

In [255]:
#What I Need
# 1. movies_with_streaming_data -- > movie: platforms
# 2. metadata --> movie: recs
# 3. new_dict --> movie: genre
# 4. new_dict_2 --> movie: description

In [263]:
#adding platforms dict data to metadata dictionary
for movie in metadata['data']:
    movie['platforms'] = movies_with_streaming_data[movie['name']]

In [268]:
#reseting index of above df to avoid dictionary overwriting issues
popular.reset_index(drop=True,inplace=True)

In [271]:
#creating dictionary for genre metadata that takes movie as key and genre as value
genre_dict_meta = {}
vals = popular[['movie','top_genre']]
indexes = vals.index.to_list()
vals_dict = vals.to_dict()
for idx in indexes:
    genre_dict_meta[vals_dict['movie'][idx]] =  vals_dict['top_genre'][idx]

In [274]:
#adding genre metadata to main metadata dictionary
for movie in metadata['data']:
    movie['genre'] = genre_dict_meta[movie['name']]

In [292]:
#creating df that contains subgenre and description to add to metadata dict
desc_plus_sub = genre[['movie','desc','sub_genre']][genre['movie'].isin(movies)]

In [294]:
#reseting index to avoid overwrite issue
desc_plus_sub.reset_index(drop=True,inplace=True)

In [296]:
#creating dictionary for description metadata with movie as key and description as the value
desc_dict_meta = {}
indexes = desc_plus_sub.index.to_list()
vals_dict = desc_plus_sub.to_dict()
for idx in indexes:
    desc_dict_meta[vals_dict['movie'][idx]] = vals_dict['desc'][idx]

In [298]:
#adding description metadata to main metadata dictionary
for movie in metadata['data']:
    movie['desc'] = desc_dict_meta[movie['name']]

In [300]:
#creating dictionary for sub genre metadata with movie as key and sub genre as value
sub_dict_meta = {}
indexes = desc_plus_sub.index.to_list()
vals_dict = desc_plus_sub.to_dict()
for idx in indexes:
    sub_dict_meta[vals_dict['movie'][idx]] = vals_dict['sub_genre'][idx]

In [302]:
#adding sub genre metadata to main metadata dictionary
for movie in metadata['data']:
    movie['sub_genre'] = sub_dict_meta[movie['name']]

In [303]:
#checking first entry to ensure data is correct
metadata['data'][0]

{'name': 'Jim Gaffigan: Mr. Universe',
 'recs': ['Todd Barry: Spicy Honey',
  'Chris Porter: Ugly and Angry',
  'Bob Saget: Zero to Sixty',
  'Tom Segura: Completely Normal',
  'Kathleen Madigan: Madigan Again',
  "Brad Paisley's Comedy Rodeo",
  'D.L. Hughley: Clear',
  'Chinatown Kid',
  'Jim Breuer: And Laughter for All',
  'Lucas Brothers: On Drugs'],
 'platforms': ['netflix'],
 'genre': 'Documentary',
 'desc': 'Jim Gaffigan does it again with his new stand-up Mr. Universe. Gaffigan jumps from topic to topic joking around with a varied number of topics. Some of the topics included are McDonalds, family, Disney World, and much more.',
 'sub_genre': 'Comedy'}

In [304]:
#exporting the dictionary to a json file for Flask app
with open('metadata.json', 'w') as outfile:
    json.dump(metadata, outfile)