In [1]:
import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer 

# Dataframe

In [2]:
df = pd.read_pickle('../data/pickle/podcasts_clean.pkl')

In [3]:
df['text'] = df[['title', 'producer', 'genre', 'description', 'episodes', 'reviews']].agg(' '.join, axis=1)

In [4]:
df.drop(columns=['producer', 'rating', 'num_ratings', 'num_episodes', 'description',
                 'link', 'episodes', 'reviews'], inplace=True)

In [None]:
df['subs_len'] = df.apply(lambda row: len(row.subs), axis=1)

In [5]:
df.head(2)

Unnamed: 0,title,genre,subs,text
0,Green Eggs and Dan,Arts,"[Point of Origin, Cal's Week in Review]",Green Eggs and Dan The Podglomerate Arts Takin...
1,Audio Poem of the Day,Arts,"[The New Yorker: Poetry, The New Yorker: The W...",Audio Poem of the Day Poetry Foundation Arts A...


# Preprocessing

In [6]:
stopwords = set(nltk.corpus.stopwords.words('english'))
add_stops = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
             'january', 'february', 'march', 'april', 'may', 'june', 'im', 'ive',
             'july', 'august', 'september', 'october', 'november', 'december',
             'nan', 'podcast', 'podcasts', 'every', 'new', 'weekly', 'week', 
             'stories', 'story', 'episode', 'episodes', 'listen', 'us', "'s", 'host', 'hosted', 'join']
for i in add_stops:
    stopwords.add(i)

In [7]:
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\s\w]+', '', text)
    text = re.sub(r"\S+\.org\S+", "", text)
    text = re.sub(r"\S+\.net\S+", "", text)
    text = re.sub(r"\S+\.edu\S+", "", text)
    text = re.sub(r"\S+\.gov\S+", "", text)
    tokenized_text = nltk.word_tokenize(text)
    new_tokenized = []
    for i in tokenized_text:
        if i not in stopwords and len(i) != 1:
            new_tokenized.append(lemmatizer.lemmatize(i))
    return(' '.join(new_tokenized))

In [8]:
df.text = df.text.map(preprocess_text)

In [9]:
df.head(2)

Unnamed: 0,title,genre,subs,text
0,Green Eggs and Dan,Arts,"[Point of Origin, Cal's Week in Review]",green egg dan podglomerate art taking look eat...
1,Audio Poem of the Day,Arts,"[The New Yorker: Poetry, The New Yorker: The W...",audio poem day poetry foundation art audio rec...


# Modelling

In [36]:
tests = list(df[df.subs_len != 0].sample(5).title)

In [17]:
def get_recommendations(matrix):
    for i in tests:
        print('\033[1m' + "Given:" + '\033[0m', i)
        index = df.loc[df.title == i].index[0]
        print('\033[1m' + "Given genre:" + '\033[0m', df.iloc[index]['genre'])
        array = list(enumerate(matrix[index]))
        sorted_array = sorted(array, key=lambda x:x[1], reverse=True)
        recs = []
        genres = []
        for j in range(6):
            if j == 0:
                continue
            rec_title = df.iloc[sorted_array[j][0]]['title']
            recs.append(rec_title)
            rec_genre = df.iloc[sorted_array[j][0]]['genre']
            genres.append(rec_genre)
        print('\033[1m' + "Top 5 recommendations:" + '\033[0m')
        print(recs)
        print('\033[1m' + "Top 5 recommendations' genre:" + '\033[0m')
        print(genres)
        print('\033[1m' + "Subscribers also subscribes to according to Apple Podcasts:" + '\033[0m')
        for k in df.loc[df.title == i].subs:
            substo = k
        print(substo)
        correct  = 0
        for l in recs:
            correct = correct + 1 if l in substo else correct
        print('\033[1m', correct , "out of 5 are accurate" + '\033[0m'+ "\n")

## CountVectorizer (Bag-of-words) + Cosine Similarity

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
cv = CountVectorizer()
bow_matrix = cv.fit_transform(df.text)
bow_cos_sim = cosine_similarity(bow_matrix)

In [37]:
get_recommendations(bow_cos_sim)

[1mGiven:[0m Red Ball
[1mGiven genre:[0m True Crime
[1mTop 5 recommendations:[0m
['CounterClock', 'Up and Vanished', 'Crime Junkie', 'O.C. Swingers', 'True Crime Chronicles']
[1mTop 5 recommendations' genre:[0m
['True Crime', 'True Crime', 'True Crime', 'True Crime', 'True Crime']
[1mSubscribers also subscribes to according to Apple Podcasts:[0m
['CounterClock', 'Full Body Chills', 'Murder in Oregon', 'Radio Rental', 'Your Own Backyard', 'Detective Trapp', 'Supernatural with Ashley Flowers', 'Urge to Kill', 'Dateline NBC', 'Morbid: A True Crime Podcast', 'Blood Ties', 'The Thing About Pam', 'Bad Batch']
[1m 1 out of 5 are accurate[0m

[1mGiven:[0m MCAT Basics (from MedSchoolCoach‪)‬
[1mGiven genre:[0m Science
[1mTop 5 recommendations:[0m
['The MCAT Podcast', 'OT Exam Prepper', "It's Been a Minute with Sam Sanders", 'Physician Assistant Exam Review', 'Becoming Something with Jonathan Pokluda']
[1mTop 5 recommendations' genre:[0m
['Science', 'Arts', 'Society & Culture