In [1]:
import pandas as pd
def get_qrels(filename):
  """Load TREC relevance judgements and collapse them to episode-level binary qrels.

  Every non-blank line of the file is split on tabs after the
  ``spotify:episode:`` prefix is removed and the ``_`` between episode ID and
  time offset is turned into a tab, giving five fields: query id, an unused
  column, episode ID, segment offset and graded relevance (0-4).

  Parameters
  ----------
  filename : str
      Path to the qrels text file.

  Returns
  -------
  list of [int, str, int]
      ``[query_id, episode, binary]`` rows, ordered by episode and then by
      query id (compared as text). ``binary`` is 1 when at least one segment
      of the episode has relevance above 0 for that query, otherwise 0.
  """
  rows = []
  with open(filename, encoding='utf-8') as f:
    for line in f:
      # skip blank lines (typically the trailing newline at end of file);
      # they would otherwise become a row of missing values and make the
      # integer conversion below fail
      if not line.strip():
        continue
      # shorten the episode ID and split the time segment into a separate field
      cleaned = line.rstrip('\n').replace('spotify:episode:', '').replace('_', '\t')
      rows.append(cleaned.split('\t'))

  # the second column carries no information, so it is dropped straight away
  judgements = pd.DataFrame(
      rows, columns=['query_id', 'useless', 'episode', 'segment', 'relevance']
  ).drop(columns='useless')
  judgements['relevance'] = judgements['relevance'].astype(int)
  # graded scores (0-4) are reduced to relevant / not relevant
  judgements['binary'] = (judgements['relevance'] > 0).astype(int)

  # an episode counts as relevant if any of its segments is relevant
  episode_level = (
      judgements.groupby(['episode', 'query_id'])['binary'].max().reset_index()
  )
  episode_level['query_id'] = episode_level['query_id'].astype(int)
  # final shape: (query_id, document_id, judgement)
  return episode_level[['query_id', 'episode', 'binary']].values.tolist()
# Load the 2020 training relevance judgements and preview the first five
# [query_id, episode, binary] rows.
qrels = get_qrels('/workspaces/Spotify_Information_Retrieval/Files/2020_train_qrels.list.txt')
qrels[:5]

[[7, '02cvhI2v4wORCGTvWQxF7P', 0],
 [8, '03G8qxp2IYaaX1RqYFwPs9', 0],
 [7, '04536ZKxCGV88Yj0TT0oYM', 0],
 [7, '047y46T88lyQEHIEEVWxgy', 0],
 [6, '04P7on6BaQikQsMmTVlMNB', 1]]

In [4]:

def get_queries(filename):
  """Read the TREC topics file and return the topic descriptions keyed by number.

  The text of every ``<description>`` element is collected in document order
  and numbered from 1. The collected list is also printed.

  Parameters
  ----------
  filename : str
      Path to the topics XML file.

  Returns
  -------
  dict of int -> str
      Position-based query id (starting at 1) mapped to the description text.
      NOTE(review): this assumes topics appear in the file in the same order
      as their official numbers -- confirm against the topics file.
  """
  # bs4 is only needed here, so the import stays local to the function
  from bs4 import BeautifulSoup

  with open(filename, encoding='utf-8') as f:
    contents = f.read()
  # name the parser explicitly: without it BeautifulSoup warns and picks
  # whichever parser happens to be installed, which can vary between machines
  soup = BeautifulSoup(contents, 'html.parser')
  query_list = [query.text for query in soup.find_all('description')]
  print(query_list)
  # put the queries into a dictionary, numbering from 1
  queries = {number: text for number, text in enumerate(query_list, start=1)}
  return queries
# Load the 2020 training topics; get_queries also prints the raw description list.
queries = get_queries('/workspaces/Spotify_Information_Retrieval/Files/podcasts_2020_topics_train.xml')

['What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?', 'What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?', 'In May 2019 astronomers released the first-ever picture of a black hole. I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.', 'I remember hearing a podcast that had a story about a kid riding some kind of bird. I want to find it again.', 'Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of Spotify. I would like to find the show and episode that contains that interview. Other interviews with Ek are relevant as well.', 'Former First Lady Michelle Obama’s memoir Becoming was published in early 2019. What were people saying about it?', 'Anna Sorokina moved to New York City in 2013 and p

In [3]:
# Display the query-number -> description mapping.
queries

{1: 'What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?',
 2: 'What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?',
 3: 'In May 2019 astronomers released the first-ever picture of a black hole. I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.',
 4: 'I remember hearing a podcast that had a story about a kid riding some kind of bird. I want to find it again.',
 5: 'Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of Spotify. I would like to find the show and episode that contains that interview. Other interviews with Ek are relevant as well.',
 6: 'Former First Lady Michelle Obama’s memoir Becoming was published in early 2019. What were people saying about it?',
 7: 'Anna Sorokina moved to 

In [26]:
import glob
import json
# Collect the path of every file in the Documents folder (one JSON file per episode).
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to keep the corpus row order identical from one run to the next.
path = '/workspaces/Spotify_Information_Retrieval/Documents/*'
files = sorted(glob.glob(path))

def get_transcripts(inc_desc=False, file_paths=None):
    """Read the episode JSON files and return their IDs, text, titles and durations.

    Parameters
    ----------
    inc_desc : bool, default False
        When True, the episode name and episode description are appended to
        the transcript text (separated by single spaces so that neighbouring
        words are not fused into one token).
    file_paths : iterable of str, optional
        JSON files to read. Defaults to the module-level ``files`` list.

    Returns
    -------
    tuple of four lists, aligned by position
        ``ep_IDs`` (the ``showID`` field), ``transcripts`` (document text),
        ``titles`` (``"<show_name> - <episode_name>"``) and ``durations``
        (the ``duration`` field, returned unchanged).
    """
    if file_paths is None:
        file_paths = files

    transcripts = []
    ep_IDs = []
    titles = []
    durations = []
    # loop through each of the files extracting data
    for file_path in file_paths:
        with open(file_path, encoding='utf-8') as f:
            contents = json.load(f)
        # "showID" is a slightly misleading field name, so it is exposed as ep_ID
        ep_IDs.append(contents["showID"])
        # the transcript is stored as a list of strings, so join it into one
        transcript = ''.join(contents["transcript"])
        if inc_desc:
            # add the episode name and description to the document text
            ep_info = contents["episode_name"] + " " + contents["episode_description"]
            transcript = transcript + " " + ep_info
        transcripts.append(transcript)
        titles.append(contents["show_name"] + " - " + contents["episode_name"])
        durations.append(contents["duration"])
    return ep_IDs, transcripts, titles, durations

In [28]:
# Build the corpus; inc_desc=True asks for the episode name and description
# to be included in each document's text.
ep_IDs, corpus, titles, durations = get_transcripts(inc_desc=True)
# NOTE(review): this displays every document in full; a slice such as
# corpus[:1] would keep the notebook output readable.
corpus

 "See something, say something. It’s a mantra many live by. If you see something strange, call it in or make someone aware, even if it seems innocuous. Jennifer San Marco had strange behaviors. It was clear to many that the woman suffered from mental illness that was being untreated. But, many wrote it off. And, on January 30th 2006 her strange behaviors bubbled over and the Goleta Postal Facility shootings began. Jennifer San Marco Kills (2006) Become a supporter of this podcast on Patreon: https://www.patreon.com/morningcupofmurder Follow Morning Cup of Murder on Twitter: https://twitter.com/cupofmurder @cupofmurder Follow MCOM on Instagram: @morningcupofmurder Have a Murder or strange true crime story you want to share, email the show here: morningcupofmurder@gmail.com Morning Cup of Murder is researched, written and performed by Korina Biemesderfer. Follow Korina on Instagram: @kbiemesderfer  ---   Send in a voice message: https://anchor.fm/morning-cup-of-murder/message Support thi

In [14]:
# List every episode ID next to its "show - episode" title
for episode_id, episode_title in zip(ep_IDs, titles):
  print(f'{episode_id}: {episode_title}')

00BlbbH2PvBJ8M3CApZ7ou: Mythology - Best of 2019: The Amazons Pt. 2
000HP8n3hNIfglT2wSI2cA: Morning Cup Of Murder - The Goleta Postal Facility shootings- January 30 2020 - Today in True Crime History
00h18NlParejEuFrdCf5dC: Only Looking Up - 5 Fashion Essentials To Invest In – Adulting 101
00f13q3KxUWM67GjNv12ij: X-Pac 12360 - A Wrestling Podcast - Sean Waltman Talks G1 Climax & RAW Highlights, Paco Alonso Passes Away | X-Pac 1 2 360 #146
00gQxUFKCvFhYQfZNFofo6: Adulting With Friends  - Adulting with Simone Gannon
00fg6LJN0BZF7tWiQN0Ywo: Narcissism Recovery Podcast  - Narcissism and Mental Illness 
00CoLqpC5J4vqYzGa7SDkS: unsigned podcast - #8: John Vincent Salcedo - Director of Digital Marketing, Columbia Records
00d5EDsEf5qmUhoF8m5IvJ: The Conscious Fooodie - Ep #1 Food we feed ourselves 
00iMyJK2hN1jhcpyK2xkHO: Today in True Crime  - October 13, 2019: Arlis Kay Perry
00bQYCFm5Xfzn6RSUEBS9m: Political Scandals  - Scandal 53: “The D.C. Madam”
00HGJXXGgvppuVCvdyEO4B: I am. I have - Int

In [11]:
# Map each corpus position (row number) to the episode ID stored at that position
index = {position: episode_id for position, episode_id in enumerate(ep_IDs)}
index

{0: '0a0HuaT4Vm7FoYvccyRRQj',
 1: '0a5DJZA0NWLIGWTMasTNYq',
 2: '0A0SL2WQhWgGV6phvwJgU7',
 3: '0a4HRdmYYB4jQrc0bGGPkS',
 4: '0A1lKioyXNKIBRGAzYMUQi',
 5: '0A1iNmwn0VjegiQ8kBXc4u',
 6: '0a0iyqjSgKKZ49eOKZYpY9',
 7: '0A0SrnP0qm15L5Hv27sMAI',
 8: '0a0jLPxKIjaDYUaZPbhsWO',
 9: '0A0rUcBRvpL436mIflNoVg',
 10: '0a2rz6SLuoQagFLypkidtg',
 11: '0a1edJv75fmg5bnYDgeZ0v',
 12: '0A1ejb0nplbJ7Z3qDF31D9',
 13: '0A4rSk465szkl19Zb6UXA2',
 14: '0A1bZfQ1C2FOUMfVJ7ugpa',
 15: '0a1OhqAgMKmW5JwKWG7LbF',
 16: '0A2B0uRupGRFkIszMIoni5',
 17: '0A0MxX8L2YZEZgJGApE7w4',
 18: '0a1xawFR0oGJVP672q5ZuD',
 19: '0a2BQTox8cxHkTZQzkIpf4',
 20: '0A2xwMoDNIlwkJGsaFq68I',
 21: '0a1jXZ8LRPJRYWA9mHLW7w',
 22: '0A2jf2xBqee3KXv2AUfsp7',
 23: '0a1YNZdeCkdMTl61UQgs9M',
 24: '0A0f1WM7IttoJ61xzWCPK5',
 25: '0A5BLuSdVS2uQCC8uiuO2N',
 26: '0A0TSLljzX8akWoSYqF6Hm',
 27: '0a5cpbh8YLko5O9ZzhtaBa',
 28: '0a0Ikpt3GH8xSKaMZm4RYw',
 29: '0A4HFH7rgoBjz44K0ZABXl',
 30: '0a0C2tbL45RMmL9EmEVC2R',
 31: '0A1GTvNCft6B1bVD2Guioo',
 32: '0a0C9jWzl6eU

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Vectorize the corpus into term counts, dropping English stop words
vectorizer = CountVectorizer(stop_words='english')
# document-term count matrix: one row per corpus document, one column per term
documents_vectorized = vectorizer.fit_transform(corpus)
# term for each column of the matrix, in column order
vocabulary = vectorizer.get_feature_names_out()

In [34]:
# Dense document-term count table: rows are labelled with episode IDs,
# columns with vocabulary terms.
df = pd.DataFrame(
    documents_vectorized.toarray(),
    index=list(ep_IDs),
    columns=vocabulary,
)
df

Unnamed: 0,00,000,10,100,1000,100th,102,1099,10d,10th,...,zip,ziploc,zipping,zodiac,zombie,zombies,zone,zones,zoom,ëif
0a0HuaT4Vm7FoYvccyRRQj,0,0,0,0,0,0,2,0,0,0,...,0,0,0,5,0,0,0,0,0,0
0a5DJZA0NWLIGWTMasTNYq,1,2,2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0A0SL2WQhWgGV6phvwJgU7,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
0a4HRdmYYB4jQrc0bGGPkS,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1lKioyXNKIBRGAzYMUQi,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1iNmwn0VjegiQ8kBXc4u,3,2,10,0,5,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
0a0iyqjSgKKZ49eOKZYpY9,1,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,1
0A0SrnP0qm15L5Hv27sMAI,1,0,4,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0a0jLPxKIjaDYUaZPbhsWO,0,3,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
0A0rUcBRvpL436mIflNoVg,0,0,6,1,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0


In [13]:
def BM25_IDF_df(df, k_1=1.2, b=0.8):
  """
  Turn a document-term count table into a table of BM25 term weights.

  Each cell is replaced by
  ``idf * tf * (k_1 + 1) / (tf + k_1 * ((1 - b) + b * dl / avgdl))``
  where ``tf`` is the raw count in the cell, ``dl`` is the row sum (document
  length), ``avgdl`` is the mean row sum, and ``idf = -log(n_t / N)`` with
  ``n_t`` the number of rows in which the term occurs and ``N`` the number
  of rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw counts, one row per document and one column per term.
  k_1 : float, default 1.2
      Term-frequency saturation parameter.
  b : float, default 0.8
      Strength of the document-length normalisation.

  Returns
  -------
  pandas.DataFrame
      BM25 weights with the same shape, index and column labels as ``df``.
      Terms that occur in no document get weight 0 instead of NaN.
  """
  counts = df.to_numpy(dtype=float)
  n_docs = counts.shape[0]
  if n_docs == 0:
    # nothing to weight; avoid dividing by a zero document count
    return pd.DataFrame(counts, index=df.index, columns=df.columns)

  # document frequency: number of documents containing each term
  doc_freqs = (counts > 0).sum(axis=0)
  # leave idf at 0 for unseen terms; log(0) would give inf and 0 * inf = NaN
  idfs = np.zeros(counts.shape[1])
  seen = doc_freqs > 0
  idfs[seen] = -np.log(doc_freqs[seen] / n_docs)

  doc_lengths = counts.sum(axis=1)
  avgdl = doc_lengths.mean()
  # if every document is empty there is no length to normalise by
  length_ratio = doc_lengths / avgdl if avgdl > 0 else np.ones(n_docs)
  # column vector so that it broadcasts across the terms of each document
  length_norm = (k_1 * ((1 - b) + b * length_ratio)).reshape(n_docs, 1)

  bm25_tf = (k_1 + 1) * counts / (length_norm + counts)

  # labels come from the input itself rather than from a notebook-level global
  return pd.DataFrame(bm25_tf * idfs, index=df.index, columns=df.columns)
# Pre-compute the BM25 weights for the whole document-term table
bm25_df = BM25_IDF_df(df)
# label the rows with episode IDs so that rankings report episodes
bm25_df.index = list(ep_IDs)

In [14]:
def retrieve_ranking(query, bm25_df):
  """Rank every document for a free-text query using pre-computed BM25 weights.

  The query is lower-cased and split into runs of word characters. Terms that
  are not columns of ``bm25_df`` (stop words, unseen words, punctuation) are
  ignored instead of raising ``KeyError``. A term repeated in the query is
  counted once per occurrence.

  Parameters
  ----------
  query : str
      Free-text query.
  bm25_df : pandas.DataFrame
      BM25 weights, one row per document and one column per term. Column
      names are assumed to be lower-case terms -- TODO confirm if the
      vectorizer settings change.

  Returns
  -------
  list of (document_id, score) tuples
      All documents, sorted by descending score. Documents with equal scores
      keep their row order. If no query term is known, every score is 0.
  """
  # local import keeps this cell runnable on its own
  import re

  q_terms = [term for term in re.findall(r'\w+', query.lower())
             if term in bm25_df.columns]
  # document score = sum of the weights of the query terms it contains
  score_q_d = bm25_df[q_terms].sum(axis=1)
  return sorted(zip(bm25_df.index.values, score_q_d.values),
                key=lambda tup: tup[1],
                reverse=True)

In [18]:
# Small hand-written query set for smoke-testing the ranking function
dummy_queries = {1: 'people', 2: 'hello world'}
# keys start at 1, the same numbering get_queries produces for the real topics
print(dummy_queries)

{1: 'people', 2: 'hello world'}


In [12]:
# NOTE: this rebinds `queries` to the small smoke-test set, replacing the TREC
# topics loaded earlier; rerun the get_queries cell to restore them.
# `dummy_queries` must already exist, so run the cell that defines it first
# (running this cell before it raises NameError).
queries = dummy_queries
for query_id, query in queries.items():
  # label each ranking with the query's own ID (the dictionary key, which
  # starts at 1) rather than a zero-based loop counter
  print(f'Query {query_id}: {query}')
  print('')
  print(retrieve_ranking(query, bm25_df))
  print('')

# example retrieve
# retrieve_ranking('people', bm25_df)

NameError: name 'dummy_queries' is not defined