In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel
from operator import itemgetter

In [17]:
# Data set from https://www.kaggle.com/datasets/sentinel3734/skeptoid-podcast-transcripts
df = pd.read_csv(filepath_or_buffer='skeptoid_transcripts.csv')

In [18]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,title,episode_number,publication_date,quote,by,categories,citation,text,url
0,Religion as a Moral Center,1,"October 3, 2006",Religion is not necessary for a good moral cen...,Brian Dunning,Religion,"Dunning, B. ""Religion as a Moral Center."" Skep...",Today we pull open the drawer in the motel bur...,https://skeptoid.com/episodes/4001
1,New Age Energy,2,"October 11, 2006","An examination of energy, as new agers use the...",Brian Dunning,Fads; General Science; Paranormal,"Dunning, B. ""New Age Energy."" Skeptoid Podcast...","I'm feeling a little low today, so let's tap i...",https://skeptoid.com/episodes/4002
2,Ethics of Peddling the Paranormal,3,"October 19, 2006",A proposal that it might be ethical for non-be...,Brian Dunning,Alternative Medicine; Paranormal,"Dunning, B. ""Ethics of Peddling the Paranormal...",Today's we're going to re-examine a popular ma...,https://skeptoid.com/episodes/4003
3,Rods: Flying Absurdities,4,"October 24, 2006",There is neither evidence nor plausible hypoth...,Brian Dunning,Aliens & UFOs; Cryptozoology,"Dunning, B. ""Rods: Flying Absurdities."" Skepto...","From the cryptozoology files, we're going to l...",https://skeptoid.com/episodes/4004
4,Sustainable Sustainability,5,"November 1, 2006",Focus on the year's undisputed overused buzzwo...,Brian Dunning,Environment; Fads,"Dunning, B. ""Sustainable Sustainability."" Skep...",I bet you didn't know that Skeptoid is a susta...,https://skeptoid.com/episodes/4005


In [19]:
#Preprocess the text data
tfidf_vectorizer = TfidfVectorizer(stop_words='english') #common english words should be ignored
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'].fillna(''))

In [20]:
# Report the dimensions of the TF-IDF matrix: one row per transcript passed to
# fit_transform; the column count is presumably the vocabulary size.
print(f'shape of data: {tfidf_matrix.shape}')

shape of data: (908, 43311)


In [21]:
#Calculate cosine similarity between episodes
normalized_tfidf_matrix = normalize(tfidf_matrix, axis=1, norm='l2')
cosine_sim = linear_kernel(normalized_tfidf_matrix, normalized_tfidf_matrix) #Normalized dot product

In [22]:
def recommend_episodes(title, cosine_sim, df, top_n=10, verbose=True):
    """Return the titles of the episodes most similar to the given episode.

    Parameters
    ----------
    title : str
        Exact title of the reference episode, matched against ``df['title']``.
        When several rows share the title, the first one (by position) is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Row/column ``i`` must describe the episode
        stored at *position* ``i`` of ``df`` (not at index label ``i``).
    df : pandas.DataFrame
        Episode table containing a ``'title'`` column.
    top_n : int, default 10
        Maximum number of recommendations to return.
    verbose : bool, default True
        When true, print the first 12 raw and sorted (position, score) pairs,
        as the function has always done. Pass ``False`` to silence them.

    Returns
    -------
    pandas.Series
        Titles of the recommended episodes, most similar first. The Series
        keeps the index labels those rows have in ``df``.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Look the episode up by *position*: cosine_sim is positional, whereas
    # df.index may hold arbitrary labels (e.g. after filtering or re-indexing).
    matches = [pos for pos, candidate in enumerate(df['title']) if candidate == title]
    if not matches:
        raise IndexError(f"No episode titled {title!r} found in df['title']")
    idx = matches[0]

    # (position, similarity) pairs for every episode, in table order.
    sim_scores = list(enumerate(cosine_sim[idx]))
    if verbose:
        print("Raw Similarity Scores:", sim_scores[0:12])

    # Most similar first; sorted() is stable, so ties keep table order.
    sim_scores = sorted(sim_scores, key=itemgetter(1), reverse=True)
    if verbose:
        print("Sorted Similarity Scores:", sim_scores[0:12])

    # Drop the query episode explicitly instead of assuming it sorts first:
    # a duplicate transcript (or rounding) can tie with or outrank the
    # self-similarity score, which would otherwise leave the query in the list.
    top_matches = [pair for pair in sim_scores if pair[0] != idx][:top_n]
    episode_indices = [pos for pos, _ in top_matches]
    return df['title'].iloc[episode_indices]

In [23]:
#Get recommendations for a specific episode
episode_title = df.iloc[0]['title']
recommendations = recommend_episodes(episode_title,cosine_sim,df)
print(f"\nRecommendations for '{episode_title}':")
print(recommendations)

Raw Similarity Scores: [(0, 1.0), (1, 0.02040849185821071), (2, 0.06522178763637153), (3, 0.017375450584572127), (4, 0.013027017702398606), (5, 0.02049384690815734), (6, 0.015623976376362366), (7, 0.02340332450368706), (8, 0.09624949476382677), (9, 0.028166981414867075), (10, 0.024273179993640648), (11, 0.17999599147262105)]
Sorted Similarity Scores: [(0, 1.0), (75, 0.21830617564996674), (11, 0.17999599147262105), (598, 0.12291754751095055), (422, 0.10290216062289181), (8, 0.09624949476382677), (529, 0.09410462155905543), (159, 0.09330777281048173), (264, 0.0912038519082113), (109, 0.09008115967274141), (124, 0.07762522630486192), (34, 0.07634216502654965)]

Recommendations for 'Religion as a Moral Center':
75                  Who Kills More, Religion or Atheism?
11           Killing Faith: Deconstructionist Christians
598    Listener Feedback: Creationism and More Dead Paul
422                                     12 Step Programs
8                               Sin: What's It Good For