In [1]:
import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

# Dataframe

In [2]:
df = pd.read_pickle('../data/pickle/podcasts_clean.pkl')

In [3]:
df['text'] = df[['title', 'producer', 'genre', 'description', 'episodes', 'reviews']].agg(' '.join, axis=1)

In [4]:
df.drop(columns=['producer', 'rating', 'num_ratings', 'num_episodes', 'description',
                 'link', 'episodes', 'reviews'], inplace=True)

In [5]:
df['subs_len'] = df.apply(lambda row: len(row.subs), axis=1)

In [6]:
df.head(2)

Unnamed: 0,title,genre,subs,text,subs_len
0,Green Eggs and Dan,Arts,"[Point of Origin, Cal's Week in Review]",Green Eggs and Dan The Podglomerate Arts Takin...,2
1,Audio Poem of the Day,Arts,"[The New Yorker: Poetry, The New Yorker: The W...",Audio Poem of the Day Poetry Foundation Arts A...,5


In [7]:
df.shape

(4460, 5)

In [8]:
titleswsubs = list(df[df.subs_len != 0].title)

In [9]:
subsset = set()
for i in df.subs:
    for j in i:
        subsset.add(j)

# Preprocessing

In [10]:
stopwords = set(nltk.corpus.stopwords.words('english'))
add_stops = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
             'january', 'february', 'march', 'april', 'may', 'june', 'im', 'ive',
             'july', 'august', 'september', 'october', 'november', 'december',
             'nan', 'podcast', 'podcasts', 'every', 'new', 'weekly', 'week', 
             'stories', 'story', 'episode', 'episodes', 'listen', 'us', "'s", 'host', 'hosted', 'join']
for i in add_stops:
    stopwords.add(i)

In [11]:
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\s\w]+', '', text)
    text = re.sub(r"\S+\.org\S+", "", text)
    text = re.sub(r"\S+\.net\S+", "", text)
    text = re.sub(r"\S+\.edu\S+", "", text)
    text = re.sub(r"\S+\.gov\S+", "", text)
    tokenized_text = nltk.word_tokenize(text)
    new_tokenized = []
    for i in tokenized_text:
        if i not in stopwords and len(i) != 1:
            new_tokenized.append(lemmatizer.lemmatize(i))
    return(' '.join(new_tokenized))

In [12]:
df.text = df.text.map(preprocess_text)

In [13]:
df.head(2)

Unnamed: 0,title,genre,subs,text,subs_len
0,Green Eggs and Dan,Arts,"[Point of Origin, Cal's Week in Review]",green egg dan podglomerate art taking look eat...,2
1,Audio Poem of the Day,Arts,"[The New Yorker: Poetry, The New Yorker: The W...",audio poem day poetry foundation art audio rec...,5


# Modelling

In [32]:
tests = list(df[df.subs_len >= 5].sample(3).title)

In [21]:
def get_recommendations(matrix):
    for i in tests:
        print('\033[1m' + "Given:" + '\033[0m', i)
        index = df.loc[df.title == i].index[0]
        print('\033[1m' + "Given genre:" + '\033[0m', df.iloc[index]['genre'])
        array = list(enumerate(matrix[index]))
        sorted_array = sorted(array, key=lambda x:x[1], reverse=True)
        recs = []
        genres = []
        for j in sorted_array:
            rec_title = df.iloc[j[0]]['title']
            rec_genre = df.iloc[j[0]]['genre']
            if rec_title == i or rec_title not in subsset:
                continue
            recs.append(rec_title)
            genres.append(rec_genre)
            if len(recs) == 5:
                break
        print('\033[1m' + "Top 5 recommendations:" + '\033[0m')
        print(recs)
        print('\033[1m' + "Top 5 recommendations' genre:" + '\033[0m')
        print(genres)
        print('\033[1m' + "Subscribers also subscribes to according to Apple Podcasts:" + '\033[0m')
        for k in df.loc[df.title == i].subs:
            substo = k
        print(substo)
        correct  = 0
        for l in recs:
            correct = correct + 1 if l in substo else correct
        print('\033[1m', correct , "out of 5 are accurate" + '\033[0m'+ "\n")

In [16]:
def accuracy(matrix):
    num_titles = len(titleswsubs)
    acc = 0
    for i in titleswsubs:
        index = df.loc[df.title == i].index[0]
        array = list(enumerate(matrix[index]))
        sorted_array = sorted(array, key=lambda x:x[1], reverse=True)
        recs = []
        for j in sorted_array:
            rec_title = df.iloc[j[0]]['title']
            if rec_title == i or rec_title not in subsset:
                continue
            recs.append(rec_title)
            if len(recs) == 5:
                break
        for k in df.loc[df.title == i].subs:
            substo = k
        correct = 0
        for l in recs:
            correct = correct + 1 if l in substo else correct
        if correct >= len(substo)//2 or correct == 5:
            acc += 1
    return round(acc/num_titles,5)

## CountVectorizer (Bag-of-words) + Cosine Similarity

In [17]:
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
def CountVectorizer(corpus):
    bigset = set()
    for sent in corpus:
        for word in sent.split():
            bigset.add(word)
    vocab = {}
    for index, word in enumerate(sorted(list(bigset))):
        vocab[word] = index
    row, col, val = [],[],[]
    for idx, sentence in enumerate(corpus):
        count_word = dict(Counter(sentence.split()))
        for word, count in count_word.items():
            col_index = vocab.get(word)
            if col_index >= 0:
                row.append(idx)
                col.append(col_index)
                val.append(count)
    return csr_matrix((val, (row, col)), shape=(len(corpus), len(vocab)))

In [19]:
bow_matrix = CountVectorizer(df.text)
bow_cos_sim = cosine_similarity(bow_matrix)

In [33]:
get_recommendations(bow_cos_sim)

[1mGiven:[0m Less Is Morgue
[1mGiven genre:[0m Fiction
[1mTop 5 recommendations:[0m
['Archive 81', 'Morbid: A True Crime Podcast', 'hanging out with audiophiles', 'The Popcast With Knox and Jamie', 'The Friend Zone']
[1mTop 5 recommendations' genre:[0m
['Fiction', 'Comedy', 'Music', 'TV & Film', 'Comedy']
[1mSubscribers also subscribes to according to Apple Podcasts:[0m
['Thirteen', 'The Call of the Void', 'A Voice From Darkness', 'Crypto-Z', 'The Storage Papers', 'Cryptids', 'Shadows at the Door: The Podcast', 'The Grey Rooms', 'How i Died']
[1m 0 out of 5 are accurate[0m

[1mGiven:[0m What If World - Stories for Kids
[1mGiven genre:[0m Kids & Family
[1mTop 5 recommendations:[0m
['Brains On! Science podcast for kids', 'Imaginary Worlds', 'But Why: A Podcast for Curious Kids', 'A Bit of Optimism', 'The Imagine Neighborhood']
[1mTop 5 recommendations' genre:[0m
['Kids & Family', 'Arts', 'Kids & Family', 'Society & Culture', 'Kids & Family']
[1mSubscribers also subsc

In [25]:
accuracy(bow_cos_sim)

0.25653

## TF-IDF + Cosine Similarity

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tf_idf_matrix = tf.fit_transform(df["text"])
tf_idf_cos_sim = cosine_similarity(tf_idf_matrix)

In [36]:
get_recommendations(tf_idf_cos_sim)

[1mGiven:[0m Less Is Morgue
[1mGiven genre:[0m Fiction
[1mTop 5 recommendations:[0m
['The Phenomenon', 'Archive 81', 'This Is Actually Happening', 'Shadows at the Door: The Podcast', 'CARAVAN']
[1mTop 5 recommendations' genre:[0m
['Fiction', 'Fiction', 'True Crime', 'Fiction', 'Fiction']
[1mSubscribers also subscribes to according to Apple Podcasts:[0m
['Thirteen', 'The Call of the Void', 'A Voice From Darkness', 'Crypto-Z', 'The Storage Papers', 'Cryptids', 'Shadows at the Door: The Podcast', 'The Grey Rooms', 'How i Died']
[1m 1 out of 5 are accurate[0m

[1mGiven:[0m What If World - Stories for Kids
[1mGiven genre:[0m Kids & Family
[1mTop 5 recommendations:[0m
['Tea Time UNFILTERED With Lovelyti', 'The Eric Metaxas Show', 'Monsters Among Us Podcast', 'Bizarre Albums', 'Bobbycast']
[1mTop 5 recommendations' genre:[0m
['News', 'News', 'Society & Culture', 'Music', 'Music']
[1mSubscribers also subscribes to according to Apple Podcasts:[0m
['The Alien Adventures of 

In [37]:
accuracy(tf_idf_cos_sim)

0.29094