In [1]:
import numpy as np
import pandas as pd 
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
podcasts = pd.read_csv('podcasts.csv')
podcasts.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457373
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556393
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158188937
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160025
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080498


In [4]:
podcasts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121175 entries, 0 to 121174
Data columns (total 9 columns):
uuid           121175 non-null object
title          121173 non-null object
image          121175 non-null object
description    119832 non-null object
language       121175 non-null object
categories     121175 non-null object
website        120005 non-null object
author         118678 non-null object
itunes_id      121175 non-null int64
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


In [5]:
(podcasts.language.value_counts().to_frame().head(10))

Unnamed: 0,language
English,99316
German,4316
French,3874
Spanish,3637
Portuguese,1827
Swedish,1698
Chinese,1329
Japanese,1097
Italian,818
Russian,602


In [6]:
podcasts = podcasts[podcasts.language == 'English']

In [7]:
podcasts = podcasts.dropna(subset=['description'])
podcasts = podcasts.drop_duplicates('itunes_id')
sum(podcasts.description.isnull())

0

In [8]:
podcasts['description_length'] = [len(x.description.split()) for _, x in podcasts.iterrows()]
podcasts['description_length'].describe()

count    98173.000000
mean        39.168753
std        107.098547
min          0.000000
25%         11.000000
50%         26.000000
75%         51.000000
max      30157.000000
Name: description_length, dtype: float64

In [9]:
podcasts = podcasts[podcasts.description_length >= 20]

In [10]:
favorite_podcasts = ['The MFCEO Project', 'Up and Vanished', 'Lore']
favorites = podcasts[podcasts.title.isin(favorite_podcasts)]
favorites

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id,description_length
8793,5e36f857ce1e46b59285a348c7a650d5,Up and Vanished,http://is3.mzstatic.com/image/thumb/Music127/v...,Up and Vanished is an investigative podcast th...,English,Personal Journals | Society & Culture | News &...,https://art19.com/shows/up-and-vanished,audioBoom,1140596919,62
41353,f44c20c54eeb482ca6927d4b28a387aa,The MFCEO Project,http://is4.mzstatic.com/image/thumb/Music118/v...,Do you want to fulfill your own true potential...,English,Business | Self-Help | Health | Management & M...,http://themfceo.com/podcast,Vaughn Kohler,1012570406,117
69436,eb67f7b49f4c45bd8b69f5c13bebded2,Lore,http://is3.mzstatic.com/image/thumb/Music62/v4...,Lore is a bi-weekly podcast (and upcoming TV s...,English,Arts | Literature | History | Society & Culture,http://www.lorepodcast.com,Aaron Mahnke,978052928,41


In [11]:
podcasts = podcasts[~podcasts.isin(favorites)].sample(15000)
data = pd.concat([podcasts, favorites], sort = True).reset_index(drop = True)

In [12]:
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 3), min_df = 0, stop_words = "english")
tf_idf = tf.fit_transform(data['description'])
tf_idf

<15003x851518 sparse matrix of type '<class 'numpy.float64'>'
	with 1368255 stored elements in Compressed Sparse Row format>

In [13]:
similarity = linear_kernel(tf_idf, tf_idf)
similarity

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 7.35426392e-04, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        4.37185227e-04, 3.75531884e-04, 6.26389884e-04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 4.37185227e-04, ...,
        1.00000000e+00, 2.90885192e-04, 4.33390047e-03],
       [7.35426392e-04, 0.00000000e+00, 3.75531884e-04, ...,
        2.90885192e-04, 1.00000000e+00, 4.16774241e-04],
       [0.00000000e+00, 0.00000000e+00, 6.26389884e-04, ...,
        4.33390047e-03, 4.16774241e-04, 1.00000000e+00]])

In [14]:
x = data[data.title == 'Up and Vanished'].index[0]
similar_idx = similarity[x].argsort(axis = 0)[-4:-1]
for i in similar_idx:
    print(similarity[x][i], '-', data.title[i], '-', data.description[i], '\n')
print('Original - ' + data.description[x])

0.04492998758407775 - Visions of Education - Visions of Education is an education podcast where education professor Dan Krutka and high school teacher Michael Milton have conversations with educators to discuss their big ideas in education. 

0.047082987801297535 - The Vanished Podcast - 
      The Vanished is a true crime podcast that explores the stories of those who have gone missing.  Join host, Marissa Jones, as she investigates each case, often interviewing the loved ones who are still searching for answers.  
     

0.18360166779553216 - VANISHED: The Tara Calico Investigation - An investigative podcast that explores the unsolved disappearance of the University of New Mexico student, Tara Calico, a 29 year-old mystery that is hailed by Investigative Discovery Magazine as one of their top ten unsolved cases. Come along with host Melinda Esquibel, a former classmate to Tara Calico &amp; a filmmaker turned amateur investigator, as we examine the old case files from the Sheriff's of

In [15]:
x = data[data.title == 'Lore'].index[0]
similar_idx = similarity[x].argsort(axis = 0)[-4:-1]
for i in similar_idx:
    print(similarity[x][i], '-', data.title[i], '-', data.description[i], '\n')
print('Original - ' + data.description[x])

0.04046759143867778 - From & Inspired BY - A bi-weekly podcast that discusses the seminal soundtracks in movie history. We speak with the individuals behind the tunes -- musicians, music supervisors, actors, and more. 

0.04184147227347315 - The SciFi Podcast - Welcome to The Sci-Fi Podcast, a bi-weekly podcast focused on in-depth discussion of all things Science Fiction. The Sci-Fi Podcast features three hosts — Mattroid, Solo, and Station! — and frequent guests. Each episode we look at a science fiction theme or franchise and take the discussion where no show has gone before. 

0.04382610613892137 - The Talk-O-Tuesday Podcast - A multi subject bi-weekly podcast covering everything from entertainment to current social events. We will take to the streets and introduce a new Los Angeles Taco Tuesday spot every episode 

Original - Lore is a bi-weekly podcast (and upcoming TV show) about the dark historical tales that fuel our modern superstitions. Each episode explores the world of myst