# Data Modeling & Recommender

In [2]:
import ast

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# load the prepared modeling data; the path is relative to the notebook's working directory
df = pd.read_csv('data/model_data.csv')

In [4]:
# drop the 'Unnamed: 0' column (presumably an index that was saved with the CSV).
# Rebinding with errors='ignore' keeps this cell safe to re-run: the in-place
# version raised KeyError on a second execution because the column was already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
df.head()

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list,primary_cat,clean_desc,all_text
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,English,Technology,Practical Ecommerce,15,['Technology'],Technology,listen in as the practical ecommerce editorial...,"Ecommerce Conversations, by Practical Ecommerc..."
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,Telerik,59,"['Tech News', 'Technology']",Tech News,on the show we ll be talking to passionate peo...,Eat Sleep Code Podcast Telerik on the show we ...
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,Randy Andrews,11,"['Podcasting', 'Technology']",Podcasting,a podcast about soundtracks and movies from my...,SoundtrackAlley Randy Andrews a podcast about ...
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,Timothy Goddard,59,"['Business News', 'Technology', 'Tech News', '...",Business News,the tech m a podcast pulls from the best of th...,The Tech M&A Podcast Timothy Goddard the tech ...
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,The Tech Informist,17,"['Gadgets', 'Tech News', 'Technology']",Gadgets,the tech news show with two guys shooting the ...,"The Tech Informist - For fans of Apple, Google..."


## Build elements of recommender

### Data filtering and prep 

In [5]:
# prompt for the category to filter on; strip surrounding whitespace so that an
# accidental leading/trailing space does not make the category match nothing
cat_input = input('Enter category: ').strip()

Enter category:  Comedy


In [6]:
# create a list of row positions of podcasts that fall under the user input category.
# 'cat_list' comes back from the CSV as the *text* of a Python list, so testing
# `cat_input in <that text>` is a substring search ('Arts' would also hit
# 'Performing Arts'). Parse each cell into a real list and compare whole names.
cat_lists = df['cat_list'].apply(
    lambda cats: ast.literal_eval(cats) if isinstance(cats, str) else cats
)
idx = [pos for pos, cats in enumerate(cat_lists) if cat_input in cats]

In [7]:
# keep only the rows selected for the chosen category and renumber them from 0,
# done as one chained expression so the cell produces df1 in a single step
df1 = df.iloc[idx].reset_index(drop=True)

In [8]:
df1

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list,primary_cat,clean_desc,all_text
0,Fire in the Hole Podcast,"What's wrong with the world? We don't know, bu...",English,Comedy | Podcasting | Technology,Richard and Jason,143,"['Comedy', 'Podcasting', 'Technology']",Comedy,what' s wrong with the world we don' t know bu...,Fire in the Hole Podcast Richard and Jason wha...
1,Yvette and Glen’s Anytime Podcast Show,It's Yvette Fielding (http://twitter.com/Yfiel...,English,TV & Film | Society & Culture | Comedy,Audioboom,54,"['TV & Film', 'Society & Culture', 'Comedy']",TV & Film,it' s yvette fielding http twitter com yfieldi...,Yvette and Glen’s Anytime Podcast Show Audiobo...
2,Junkfood Cinema,Junkfood Cinema is a shame-free celebration of...,English,TV & Film | Society & Culture | Comedy,Junkfood Cinema,21,"['TV & Film', 'Society & Culture', 'Comedy']",TV & Film,junkfood cinema is a shame free celebration of...,Junkfood Cinema Junkfood Cinema junkfood cinem...
3,Mike & Carla Morning Show,"For almost 30 years, Las Vegas has grown up wi...",English,Music | Comedy,96.3 KKLZ,62,"['Music', 'Comedy']",Music,for almost 30 years las vegas has grown up wit...,Mike & Carla Morning Show 96.3 KKLZ for almost...
4,90s Baby Show,Podcast & Radio show. Live On @RadarRadioLDN E...,English,Comedy,90s Baby Show,15,['Comedy'],Comedy,podcast radio show live on radarradioldn every...,90s Baby Show 90s Baby Show podcast radio show...
...,...,...,...,...,...,...,...,...,...,...
9650,BDBM podcast,Unfiltered discussion between two very deferri...,English,Comedy,BDBM,21,['Comedy'],Comedy,unfiltered discussion between two very deferri...,BDBM podcast BDBM unfiltered discussion betwee...
9651,This Epic Disaster,"A weekly podcast that examines the screw ups, ...",English,Society & Culture | Comedy | Arts,This Epic Disaster,57,"['Society & Culture', 'Comedy', 'Arts']",Society & Culture,a weekly podcast that examines the screw ups t...,This Epic Disaster This Epic Disaster a weekly...
9652,Big Hollow Radio,Big Hollow Radio is a podcast that presents it...,English,Comedy,Big Hollow Radio,64,['Comedy'],Comedy,big hollow radio is a podcast that presents it...,Big Hollow Radio Big Hollow Radio big hollow r...
9653,Blood T**s and Gore,An Austin-based horror podcast with a mad love...,English,TV & Film | Comedy | Arts,BloodT**sAndGore,25,"['TV & Film', 'Comedy', 'Arts']",TV & Film,an austin based horror podcast with a mad love...,Blood T**s and Gore BloodT**sAndGore an austin...


In [10]:
# prompt the user for a free-text description (the "blurb") of what they want to hear
user_text = input('Describe what you are in the mood to listen to: ')

Describe what you are in the mood to listen to:  I want a true crime comedy podcast that is casual


In [11]:
# wrap the user's blurb in a one-row frame that has the same 'all_text' column
# as the podcast data (approach credited to Patrick Cudo)
df2 = pd.DataFrame({'all_text': [user_text]})
df2

Unnamed: 0,all_text
0,I want a true crime comedy podcast that is casual


In [12]:
# add the user's blurb as the last row of the filtered frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# NOTE: this rebinds df1 to itself plus one row, so re-running the cell adds
# another copy of the blurb; re-run the filtering cell first if that happens.
df1 = pd.concat([df1, df2], ignore_index=True)
df1.tail()

Unnamed: 0,title,description,language,categories,author,desc_length,cat_list,primary_cat,clean_desc,all_text
9651,This Epic Disaster,"A weekly podcast that examines the screw ups, ...",English,Society & Culture | Comedy | Arts,This Epic Disaster,57.0,"['Society & Culture', 'Comedy', 'Arts']",Society & Culture,a weekly podcast that examines the screw ups t...,This Epic Disaster This Epic Disaster a weekly...
9652,Big Hollow Radio,Big Hollow Radio is a podcast that presents it...,English,Comedy,Big Hollow Radio,64.0,['Comedy'],Comedy,big hollow radio is a podcast that presents it...,Big Hollow Radio Big Hollow Radio big hollow r...
9653,Blood T**s and Gore,An Austin-based horror podcast with a mad love...,English,TV & Film | Comedy | Arts,BloodT**sAndGore,25.0,"['TV & Film', 'Comedy', 'Arts']",TV & Film,an austin based horror podcast with a mad love...,Blood T**s and Gore BloodT**sAndGore an austin...
9654,Eileen and Steph: Everything's Fine,"In a world where everything is falling apart, ...",English,Comedy,Eileen and Steph,18.0,['Comedy'],Comedy,in a world where everything is falling apart o...,Eileen and Steph: Everything's Fine Eileen and...
9655,,,,,,,,,,I want a true crime comedy podcast that is casual


### Modeling

In [13]:
# extend scikit-learn's built-in English stop words with podcast-specific terms
# (approach credited to Aaron Hume)
custom_stop_words = ['Podcast', 'podcast', 'Podcasts', 'podcasts']
english_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = list(english_stop_words) + custom_stop_words

In [14]:
# build a TF-IDF model over word bigrams and trigrams (single words are not
# features), then fit it on and vectorize every row's 'all_text', blurb included
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(2, 3),
    stop_words=stop_words,
)
tfdf = tf.fit_transform(df1['all_text'])
tfdf

<9656x459910 sparse matrix of type '<class 'numpy.float64'>'
	with 519700 stored elements in Compressed Sparse Row format>

In [17]:
# dot product between every pair of TF-IDF rows (the user's blurb is one of them).
# The result is a dense square array with one row and one column per row of tfdf;
# later cells read only the blurb's row from it.
similarity = linear_kernel(tfdf, tfdf)
similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [19]:
df1.index[-1]

9655

In [20]:
# https://www.kaggle.com/switkowski/building-a-podcast-recommendation-engine

# the user's blurb is the last row of df1
x = df1.index[-1]
scores = similarity[x]
# Rank every row by similarity (ascending) and remove the blurb's own row by
# position. Slicing [-4:-1] only skipped the blurb when it happened to sort last,
# which is not the case when its vector is all zeros (no bigram survives the
# stop-word filter) or when another row ties with it.
order = scores.argsort()
order = order[order != x]
similar_idx = order[-3:]
for i in similar_idx:
    print(scores[i], '-', df1['title'][i], '-', df1['description'][i], '\n')
print('Original - ' + df1['all_text'][x])

0.06555191901275173 - Kim Knows Nothing: A True Crime Podcast - Every episode of this true crime podcast, Stacy researches a case and masterfully crafts a story for Kim, who knows nothing about true crime and and mainly responds with pop culture references, sound effects, and jokes.  

0.10280949490413133 - Give Me Murder Or Give Me Death - A true crime comedy podcast with Lena Berry and Alex Stypula.  Each week they dive into a horrific crime and try to wring out a bit of humor from some of the darkest aspects of humanity.  Sometimes Lena hosts, sometimes Alex does, either way you'll experience a twisted journey through some of the darkest humor this side of Ed Gein's house.  Alex and Lena are longtime friends who only recently discovered their shared love of true crime and had to do a podcast about it. We make murder funny. 

0.15823105916656957 - Married to Murder - A true crime comedy podcast that covers the stories of couples who kill together or each other. 'Til Death Do Us Part!

## Function for recommender

In [25]:
# Patrick inspo
def test(df, n_recs=3):
    """Interactively recommend podcasts from one category and print them.

    Prompts for a category and a free-text blurb, keeps the rows of ``df``
    whose ``cat_list`` contains exactly that category, appends the blurb as
    the last row, vectorizes ``all_text`` with TF-IDF (word bigrams and
    trigrams) and prints the ``n_recs`` rows most similar to the blurb,
    least similar first, followed by the blurb itself.

    NOTE(review): a second ``test`` is defined further down this notebook and
    replaces this one in the kernel once that cell has been run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``cat_list``, ``all_text``, ``title`` and
        ``description``. ``cat_list`` may hold real lists or their string
        representation (as produced by a CSV round trip).
    n_recs : int, default 3
        Number of recommendations to print.

    Returns
    -------
    None
        Results are written to stdout.
    """
    # get category to filter data
    cat_input = input('Enter category: ').strip()

    # filter data by category, create new df, reset index.
    # When cat_list is the text of a list, `in` on the raw value is a substring
    # search ('Arts' would also match 'Performing Arts'), so parse it first.
    cat_lists = df['cat_list'].apply(
        lambda cats: ast.literal_eval(cats) if isinstance(cats, str) else cats
    )
    idx = [pos for pos, cats in enumerate(cat_lists) if cat_input in cats]
    df1 = df.iloc[idx].reset_index(drop=True)

    # get input string from user
    user_text = input('Describe what you are in the mood to listen to: ')

    # add user input as the last row (DataFrame.append was removed in pandas 2.0)
    df2 = pd.DataFrame({'all_text': [user_text]})
    df1 = pd.concat([df1, df2], ignore_index=True)

    # instantiate TfidfVectorizer
    stop_words = list(TfidfVectorizer(stop_words='english').get_stop_words()) + ['Podcast','podcast','Podcasts','podcasts']
    tf = TfidfVectorizer(analyzer = 'word', ngram_range = (2, 3), stop_words = stop_words)
    # fit and transform on filtered df
    tfdf = tf.fit_transform(df1['all_text'])

    # https://www.kaggle.com/switkowski/building-a-podcast-recommendation-engine - adjusted
    x = df1.index[-1]
    print (x)
    # Only the blurb's similarities are used, so compute that single row instead
    # of the full square matrix (which is dense and grows with rows squared).
    scores = linear_kernel(tfdf[x], tfdf).ravel()
    # Rank ascending and drop the blurb's own row by position; relying on it
    # sorting last breaks when its vector is all zeros or another row ties.
    order = scores.argsort()
    order = order[order != x]
    similar_idx = order[max(len(order) - n_recs, 0):]
    for i in similar_idx:
        print(scores[i], '-', df1['title'][i], '-', df1['description'][i], '\n')
    print('Original - ' + df1['all_text'][x])

In [32]:
test(df)

Enter category:  Comedy
Describe what you are in the mood to listen to:  true crime


9655
0.23243017200524976 - It's About Damn Crime - Tired of hearing the same old true crime stories? Then you've  come to the right place! Welcome to It's About Damn Crime. A true crime podcast where co-hosts Brittney and Justine discuss true crime cases featuring people of color. So please give it a listen. Promise some of these cases will be brand new to you!  

0.23938354981552307 - S'laughter: True Crime Podcast - Listening to Slaughter doesn't make you a psycho, killing people does. 

0.2521001168871202 - Kim Knows Nothing: A True Crime Podcast - Every episode of this true crime podcast, Stacy researches a case and masterfully crafts a story for Kim, who knows nothing about true crime and and mainly responds with pop culture references, sound effects, and jokes.  

Original - true crime


In [29]:
df['primary_cat'].unique()

array(['Technology', 'Tech News', 'Podcasting', 'Business News',
       'Gadgets', 'Management & Marketing', 'Music', 'Comedy', 'Arts',
       'Hobbies', 'Business', 'Performing Arts', 'Literature',
       'TV & Film', 'Health', 'Games & Hobbies', 'Society & Culture',
       'Video Games', 'Other Games', 'Sports & Recreation',
       'News & Politics', 'Education', 'Religion & Spirituality',
       'Christianity', 'Other', 'Careers', 'Judaism', 'Spirituality',
       'Non-Profit', 'Philosophy', 'Kids & Family', 'Hinduism',
       'Self-Help', 'Visual Arts', 'Food', 'Government & Organizations',
       'Science & Medicine', 'Alternative Health', 'Personal Journals',
       'Natural Sciences', 'Medicine', 'Places & Travel',
       'Social Sciences', 'History', 'Training', 'Sexuality',
       'Fitness & Nutrition', 'National', 'Investing',
       'College & High School', 'Professional', 'Language Courses',
       'Higher Education', 'K-12', 'Outdoor', 'Automotive', 'Amateur',
       'Desi

In [33]:
# True crime is missing as a category; some results are bad (e.g., bad parenting)

## Script for App

In [19]:
## create script to take data already filtered by category + user input blurb in json, 
## run model/recommender, 
## return recommendations as a df in json

import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def api_test(json_data, n_recs=3):
    """Recommend podcasts for a user blurb from already-filtered JSON data.

    The payload is turned into a DataFrame, ``all_text`` is vectorized with
    TF-IDF (word bigrams and trigrams) and the rows most similar to the LAST
    row are returned. Category filtering and appending the blurb are not done
    here.

    NOTE(review): assumes the caller has already filtered by category and put
    the user's blurb in the last row of ``json_data`` -- confirm against the
    app code that builds the payload.

    Parameters
    ----------
    json_data : any input accepted by ``pandas.DataFrame(...)``
        Must yield the columns ``all_text``, ``title`` and ``description``.
    n_recs : int, default 3
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``similarity_score``, ``podcast`` and ``description``, ordered
        from least to most similar. This is a DataFrame, not JSON; any
        serialization is left to the caller.
    """
    # convert json to pandas dataframe
    df1 = pd.DataFrame(json_data)

    # instantiate TfidfVectorizer
    stop_words = list(TfidfVectorizer(stop_words='english').get_stop_words()) + ['Podcast','podcast','Podcasts','podcasts']
    tf = TfidfVectorizer(analyzer = 'word', ngram_range = (2, 3), stop_words = stop_words)

    # fit and transform on all_text column
    tfdf = tf.fit_transform(df1['all_text'])

    # Locate the blurb by position, not by index label: a frame built from JSON
    # can carry string labels ('0', '1', ...), which cannot index a numpy array.
    query_pos = tfdf.shape[0] - 1

    # Only the blurb's similarities are used, so compute that single row instead
    # of the full square matrix (which is dense and grows with rows squared).
    scores = linear_kernel(tfdf[query_pos], tfdf).ravel()

    # https://www.kaggle.com/switkowski/building-a-podcast-recommendation-engine - adjusted
    # Rank ascending and drop the blurb's own row explicitly; relying on it
    # sorting last breaks when its vector is all zeros or another row ties.
    order = scores.argsort()
    order = order[order != query_pos]
    similar_idx = order[max(len(order) - n_recs, 0):]

    # create df of recommendations (positional .iloc matches the numpy positions)
    recommendations = pd.DataFrame(
    {'similarity_score': scores[similar_idx].tolist(),
     'podcast': df1['title'].iloc[similar_idx].tolist(),
     'description': df1['description'].iloc[similar_idx].tolist()})

    return recommendations

UsageError: Line magic function `%%writefile` not found.


In [16]:
## test

def test(df, n_recs=5):
    """Interactively recommend podcasts from one category and return them.

    Prompts for a category and a free-text blurb, keeps the rows of ``df``
    whose ``cat_list`` contains exactly that category, appends the blurb as
    the last row, vectorizes ``all_text`` with TF-IDF (word bigrams and
    trigrams) and returns the ``n_recs`` rows most similar to the blurb.

    NOTE(review): this redefines the ``test`` function declared earlier in the
    notebook; whichever definition cell ran last is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``cat_list``, ``all_text``, ``title`` and
        ``description``. ``cat_list`` may hold real lists or their string
        representation (as produced by a CSV round trip).
    n_recs : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``similarity_score``, ``podcast`` and ``description``, ordered
        from least to most similar.
    """
    # get category to filter data
    cat_input = input('Enter category: ').strip()

    # filter data by category, create new df, reset index.
    # When cat_list is the text of a list, `in` on the raw value is a substring
    # search ('Arts' would also match 'Performing Arts'), so parse it first.
    cat_lists = df['cat_list'].apply(
        lambda cats: ast.literal_eval(cats) if isinstance(cats, str) else cats
    )
    idx = [pos for pos, cats in enumerate(cat_lists) if cat_input in cats]
    df1 = df.iloc[idx].reset_index(drop=True)

    # get input string from user
    user_text = input('Describe what you are in the mood to listen to: ')

    # add user input as the last row (DataFrame.append was removed in pandas 2.0)
    df2 = pd.DataFrame({'all_text': [user_text]})
    df1 = pd.concat([df1, df2], ignore_index=True)

    # instantiate TfidfVectorizer
    stop_words = list(TfidfVectorizer(stop_words='english').get_stop_words()) + ['Podcast','podcast','Podcasts','podcasts']
    tf = TfidfVectorizer(analyzer = 'word', ngram_range = (2, 3), stop_words = stop_words)
    # fit and transform on filtered df
    tfdf = tf.fit_transform(df1['all_text'])

    # https://www.kaggle.com/switkowski/building-a-podcast-recommendation-engine - adjusted
    x = df1.index[-1]
    # Only the blurb's similarities are used, so compute that single row instead
    # of the full square matrix (which is dense and grows with rows squared).
    scores = linear_kernel(tfdf[x], tfdf).ravel()
    # Rank ascending and drop the blurb's own row by position; relying on it
    # sorting last breaks when its vector is all zeros or another row ties.
    order = scores.argsort()
    order = order[order != x]
    similar_idx = order[max(len(order) - n_recs, 0):]

    recommendations = pd.DataFrame(
    {'similarity_score': scores[similar_idx].tolist(),
     'podcast': df1['title'].iloc[similar_idx].tolist(),
     'description': df1['description'].iloc[similar_idx].tolist()})

    return recommendations

In [17]:
test(df)

Enter category:  Comedy
Describe what you are in the mood to listen to:  true crime


Unnamed: 0,similarity_score,podcast,description
0,0.217346,Dark Topic: A True Crime Podcast,Dark Topic is a True Crime Podcast created to ...
1,0.219555,Two of a Kind and True Crime,Mother and daughter True Crime podcast. Come f...
2,0.23243,It's About Damn Crime,Tired of hearing the same old true crime stori...
3,0.239384,S'laughter: True Crime Podcast,Listening to Slaughter doesn't make you a psyc...
4,0.2521,Kim Knows Nothing: A True Crime Podcast,"Every episode of this true crime podcast, Stac..."
