In [1]:
import pandas as pd
def get_qrels(filename):
  """Load TREC relevance judgements and reduce them to episode-level binary labels.

  Every non-blank line of the file must contain four tab-separated fields:
  query id, an unused field, ``spotify:episode:<episode>_<segment>`` and a
  graded relevance score. Blank lines (for example the one produced by a
  trailing newline) are ignored.

  Args:
    filename: path to the qrels file.

  Returns:
    A list of ``[query_id, episode, binary]`` lists, where ``query_id`` is an
    int and ``binary`` is 1 if any segment of the episode was judged with a
    relevance greater than 0 for that query, else 0. Rows are ordered by
    episode id. An input without any judgement lines yields ``[]``.
  """
  columns = ['query_id', 'useless', 'episode', 'segment', 'relevance']
  # read the file from TREC that contains the relevance scores
  with open(filename) as f:
    lines = f.read().split('\n')
  # shorten the episode ID and split the time segment into a separate field;
  # blank lines are skipped because they would become rows of missing values
  # and break the integer conversion below
  rows = [
      line.replace('spotify:episode:', '').replace('_', '\t').split('\t')
      for line in lines
      if line.strip()
  ]
  if not rows:
    return []
  # the second column is not used, so drop it straight away
  df = pd.DataFrame(rows, columns=columns).drop('useless', axis=1)
  # the relevance scores are graded; only keep whether a segment is relevant at all
  df['binary'] = (df['relevance'].astype(int) > 0).astype(int)
  # if an episode is relevant at 'some' point, the whole episode counts as relevant
  df2 = df.groupby(['episode', 'query_id'])['binary'].max().reset_index()
  df2['query_id'] = df2['query_id'].astype(int)
  # list of [query_id, document_id, judgement]
  return df2[['query_id', 'episode', 'binary']].values.tolist()
# Build the [query_id, episode, judgement] triples from the 2020 training qrels file.
qrels = get_qrels('/workspaces/Spotify_Information_Retrieval/Files/2020_train_qrels.list.txt')
# Preview the first five triples.
qrels[:5]

[[7, '02cvhI2v4wORCGTvWQxF7P', 0],
 [8, '03G8qxp2IYaaX1RqYFwPs9', 0],
 [7, '04536ZKxCGV88Yj0TT0oYM', 0],
 [7, '047y46T88lyQEHIEEVWxgy', 0],
 [6, '04P7on6BaQikQsMmTVlMNB', 1]]

In [4]:

def get_queries(filename):
  """Read the TREC topics file and return the query descriptions keyed by number.

  The text of every ``<description>`` element is collected in document order
  and numbered consecutively starting at 1. The collected list is also
  printed.

  Args:
    filename: path to the topics XML file.

  Returns:
    A dict mapping 1-based position to the description text.
  """
  from bs4 import BeautifulSoup

  # read the file from TREC that contains the query descriptions
  with open(filename) as f:
    contents = f.read()
  # Name the parser explicitly: without it Beautiful Soup picks whichever
  # parser happens to be installed and emits a GuessedAtParserWarning.
  soup = BeautifulSoup(contents, 'html.parser')
  query_list = [query.text for query in soup.find_all('description')]
  print(query_list)
  # put the queries into a dictionary, numbering from 1
  queries = dict(enumerate(query_list, start=1))
  return queries
# Load the 2020 training topics as {query number: description text}.
queries = get_queries('/workspaces/Spotify_Information_Retrieval/Files/podcasts_2020_topics_train.xml')

['What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?', 'What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?', 'In May 2019 astronomers released the first-ever picture of a black hole. I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.', 'I remember hearing a podcast that had a story about a kid riding some kind of bird. I want to find it again.', 'Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of Spotify. I would like to find the show and episode that contains that interview. Other interviews with Ek are relevant as well.', 'Former First Lady Michelle Obama’s memoir Becoming was published in early 2019. What were people saying about it?', 'Anna Sorokina moved to New York City in 2013 and p

In [3]:
# Display the query dictionary.
queries

{1: 'What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?',
 2: 'What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?',
 3: 'In May 2019 astronomers released the first-ever picture of a black hole. I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.',
 4: 'I remember hearing a podcast that had a story about a kid riding some kind of bird. I want to find it again.',
 5: 'Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of Spotify. I would like to find the show and episode that contains that interview. Other interviews with Ek are relevant as well.',
 6: 'Former First Lady Michelle Obama’s memoir Becoming was published in early 2019. What were people saying about it?',
 7: 'Anna Sorokina moved to 

In [8]:
import glob
import json
# Collect all the JSON files in the folder called Documents.
# glob returns matches in arbitrary order, so sort them to get the same
# document order (and therefore the same positional index) on every run.
path = '/workspaces/Spotify_Information_Retrieval/Documents/*'
files = sorted(glob.glob(path))

def get_transcripts(paths=None):
    """Load episode identifiers, transcripts and titles from JSON files.

    Each file must be a JSON object with the keys ``showID``, ``transcript``
    (a list of strings), ``show_name`` and ``episode_name``.

    Args:
        paths: iterable of JSON file paths. Defaults to the module-level
            ``files`` list when omitted.

    Returns:
        A tuple ``(ep_IDs, transcripts, titles)`` of three parallel lists,
        in the order the paths were given.
    """
    if paths is None:
        paths = files
    transcripts = []
    ep_IDs = []
    titles = []
    # loop through each of the files extracting data
    for file in paths:
        # read as UTF-8 explicitly rather than relying on the platform default
        with open(file, encoding='utf-8') as f:
            contents = json.load(f)
        # the "showID" field is used as the episode identifier here
        # NOTE(review): the field name suggests a show rather than an episode - confirm
        ep_IDs.append(contents["showID"])
        # the transcript is a list of strings, so concatenate it into one string
        transcripts.append(''.join(contents["transcript"]))
        titles.append(contents["show_name"] + " - " + contents["episode_name"])
    return ep_IDs, transcripts, titles
# Parallel lists: episode identifiers, transcript strings (the corpus) and display titles.
ep_IDs, corpus, titles = get_transcripts()

In [9]:
# index: print every episode identifier next to its title
for i in range(len(ep_IDs)):
  print(f'{ep_IDs[i]}: {titles[i]}')

0a0HuaT4Vm7FoYvccyRRQj: Back 2 HER  - Back 2 HER 
0a5DJZA0NWLIGWTMasTNYq: Tucker & Maura - Tucker & Maura
0A0SL2WQhWgGV6phvwJgU7: Variant Podcast - Variant Podcast
0a4HRdmYYB4jQrc0bGGPkS: AAA Pass: A Nation of Billions Podcast - AAA Pass: A Nation of Billions Podcast
0A1lKioyXNKIBRGAzYMUQi: Recruiter Startup - Dualta Doherty Rec2rec. - Recruiter Startup - Dualta Doherty Rec2rec.
0A1iNmwn0VjegiQ8kBXc4u: Cephalon Squared: A Warframe Podcast - Cephalon Squared: A Warframe Podcast
0a0iyqjSgKKZ49eOKZYpY9: The All Things Mavs Podcast - The All Things Mavs Podcast
0A0SrnP0qm15L5Hv27sMAI: West Didsbury & Chorlton AFC - West Didsbury & Chorlton AFC
0a0jLPxKIjaDYUaZPbhsWO: The Hawg Talk Podcast - The Hawg Talk Podcast
0A0rUcBRvpL436mIflNoVg: AFL Deep Dive - AFL Deep Dive
0a2rz6SLuoQagFLypkidtg: The Modern Warrior Podcast  - The Modern Warrior Podcast 
0a1edJv75fmg5bnYDgeZ0v: Light and Love - Light and Love
0A1ejb0nplbJ7Z3qDF31D9: The Language Learning Show - The Language Learning Show
0A4rSk465s

In [11]:
# Map each 0-based position in ep_IDs to its episode identifier.
index = dict(enumerate(ep_IDs))
index

{0: '0a0HuaT4Vm7FoYvccyRRQj',
 1: '0a5DJZA0NWLIGWTMasTNYq',
 2: '0A0SL2WQhWgGV6phvwJgU7',
 3: '0a4HRdmYYB4jQrc0bGGPkS',
 4: '0A1lKioyXNKIBRGAzYMUQi',
 5: '0A1iNmwn0VjegiQ8kBXc4u',
 6: '0a0iyqjSgKKZ49eOKZYpY9',
 7: '0A0SrnP0qm15L5Hv27sMAI',
 8: '0a0jLPxKIjaDYUaZPbhsWO',
 9: '0A0rUcBRvpL436mIflNoVg',
 10: '0a2rz6SLuoQagFLypkidtg',
 11: '0a1edJv75fmg5bnYDgeZ0v',
 12: '0A1ejb0nplbJ7Z3qDF31D9',
 13: '0A4rSk465szkl19Zb6UXA2',
 14: '0A1bZfQ1C2FOUMfVJ7ugpa',
 15: '0a1OhqAgMKmW5JwKWG7LbF',
 16: '0A2B0uRupGRFkIszMIoni5',
 17: '0A0MxX8L2YZEZgJGApE7w4',
 18: '0a1xawFR0oGJVP672q5ZuD',
 19: '0a2BQTox8cxHkTZQzkIpf4',
 20: '0A2xwMoDNIlwkJGsaFq68I',
 21: '0a1jXZ8LRPJRYWA9mHLW7w',
 22: '0A2jf2xBqee3KXv2AUfsp7',
 23: '0a1YNZdeCkdMTl61UQgs9M',
 24: '0A0f1WM7IttoJ61xzWCPK5',
 25: '0A5BLuSdVS2uQCC8uiuO2N',
 26: '0A0TSLljzX8akWoSYqF6Hm',
 27: '0a5cpbh8YLko5O9ZzhtaBa',
 28: '0a0Ikpt3GH8xSKaMZm4RYw',
 29: '0A4HFH7rgoBjz44K0ZABXl',
 30: '0a0C2tbL45RMmL9EmEVC2R',
 31: '0A1GTvNCft6B1bVD2Guioo',
 32: '0a0C9jWzl6eU

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# vectorize and get vocabulary
# CountVectorizer counts term occurrences per document; English stop words are removed.
vectorizer = CountVectorizer(stop_words='english')
# document-term count matrix: one row per transcript in `corpus`
documents_vectorized = vectorizer.fit_transform(corpus)
# the terms, aligned with the columns of the matrix
vocabulary = vectorizer.get_feature_names_out()

In [34]:
# Dense term-count table: one row per episode identifier, one column per vocabulary term.
df = pd.DataFrame(
    data=documents_vectorized.toarray(),
    index=list(ep_IDs),
    columns=vocabulary,
)
# Save the table, row labels included, to test.csv.
df.to_csv('test.csv', index=True)
df

Unnamed: 0,00,000,10,100,1000,100th,102,1099,10d,10th,...,zip,ziploc,zipping,zodiac,zombie,zombies,zone,zones,zoom,ëif
0a0HuaT4Vm7FoYvccyRRQj,0,0,0,0,0,0,2,0,0,0,...,0,0,0,5,0,0,0,0,0,0
0a5DJZA0NWLIGWTMasTNYq,1,2,2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0A0SL2WQhWgGV6phvwJgU7,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
0a4HRdmYYB4jQrc0bGGPkS,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1lKioyXNKIBRGAzYMUQi,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1iNmwn0VjegiQ8kBXc4u,3,2,10,0,5,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
0a0iyqjSgKKZ49eOKZYpY9,1,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,1
0A0SrnP0qm15L5Hv27sMAI,1,0,4,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0a0jLPxKIjaDYUaZPbhsWO,0,3,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
0A0rUcBRvpL436mIflNoVg,0,0,6,1,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0


In [35]:
# Reload the saved count table, using the first CSV column as the row index.
test = pd.read_csv('test.csv', index_col=0)
test

Unnamed: 0,00,000,10,100,1000,100th,102,1099,10d,10th,...,zip,ziploc,zipping,zodiac,zombie,zombies,zone,zones,zoom,ëif
0a0HuaT4Vm7FoYvccyRRQj,0,0,0,0,0,0,2,0,0,0,...,0,0,0,5,0,0,0,0,0,0
0a5DJZA0NWLIGWTMasTNYq,1,2,2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0A0SL2WQhWgGV6phvwJgU7,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
0a4HRdmYYB4jQrc0bGGPkS,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1lKioyXNKIBRGAzYMUQi,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1iNmwn0VjegiQ8kBXc4u,3,2,10,0,5,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
0a0iyqjSgKKZ49eOKZYpY9,1,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,1
0A0SrnP0qm15L5Hv27sMAI,1,0,4,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0a0jLPxKIjaDYUaZPbhsWO,0,3,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
0A0rUcBRvpL436mIflNoVg,0,0,6,1,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0


In [13]:
def BM25_IDF_df(df, k_1=1.2, b=0.8):
  """Pre-compute BM25 term weights for every (document, term) pair.

  For a term count ``tf`` in a document of length ``dl`` the weight is

      idf * (k_1 + 1) * tf / (k_1 * ((1 - b) + b * dl / avgdl) + tf)

  with ``idf = -log(df_t / N)``, where ``df_t`` is the number of documents
  containing the term, ``N`` the number of documents, ``dl`` the sum of the
  document's counts and ``avgdl`` the mean of ``dl`` over all documents.

  Args:
    df: document-term count table (rows are documents, columns are terms).
    k_1: term-frequency saturation parameter.
    b: document-length normalisation parameter.

  Returns:
    A DataFrame of BM25 weights carrying the same row index and column labels
    as ``df``. Terms that occur in no document get a weight of 0.
  """
  counts = df.to_numpy(dtype=float)
  N = counts.shape[0]

  # document frequency per term and the resulting idf
  dfs = (counts > 0).sum(axis=0)
  with np.errstate(divide='ignore'):
    idfs = -np.log(dfs / N)
  # a term seen in no document would give 0 * inf = NaN below; weight it 0 instead
  idfs[dfs == 0] = 0.0

  # document lengths and their average
  dls = counts.sum(axis=1)
  avgdl = np.mean(dls)

  # per-document length normalisation, shaped as a column so it broadcasts over terms
  length_norm = (k_1 * ((1 - b) + b * (dls / avgdl))).reshape(N, 1)
  BM25_tf = (k_1 + 1) * counts / (length_norm + counts)

  BM25_score = BM25_tf * idfs
  # label the result from the input frame rather than from notebook globals
  return pd.DataFrame(BM25_score, columns=df.columns, index=df.index)
# Pre-compute the BM25 weights for the whole collection.
bm25_df = BM25_IDF_df(df)
# Label the rows with the episode identifiers.
bm25_df.index = list(ep_IDs)

In [14]:
def retrieve_ranking(query, bm25_df):
  """Rank all documents for a free-text query using pre-computed BM25 weights.

  The query is lower-cased and split into tokens of two or more word
  characters, which is how CountVectorizer tokenises by default. Tokens that
  are not columns of ``bm25_df`` (stop words, unseen words) are ignored
  instead of raising ``KeyError``. A token that appears several times in the
  query contributes once per occurrence.

  Args:
    query: the query text.
    bm25_df: DataFrame of BM25 weights, documents as rows and terms as columns.

  Returns:
    A list of ``(document_id, score)`` tuples sorted by descending score. If
    no query token is known, every document gets a score of 0.0.
  """
  import re

  q_terms = [
      term
      for term in re.findall(r'(?u)\b\w\w+\b', query.lower())
      if term in bm25_df.columns
  ]
  if q_terms:
    score_q_d = bm25_df[q_terms].sum(axis=1)
  else:
    score_q_d = pd.Series(0.0, index=bm25_df.index)
  return sorted(zip(bm25_df.index.values, score_q_d.values),
                key=lambda tup: tup[1],
                reverse=True)

In [18]:
# Two small hand-written queries, keyed by 1-based query id like the real ones.
dummy_queries = {1: 'people', 2: 'hello world'}
print(dummy_queries)

{1: 'people', 2: 'hello world'}


In [22]:
# NOTE: this replaces the queries loaded from the topics file with the dummy
# ones; later cells that read `queries` will see the dummy queries.
queries = dummy_queries
# Iterate over the dict items so the printed number is the real query id
# (ids start at 1) rather than a 0-based loop counter.
for query_id, query in queries.items():
  print(f'Query {query_id}: {query}')
  print('')
  print(retrieve_ranking(query, bm25_df))
  print('')

Query 0: people

[('0A4rSk465szkl19Zb6UXA2', 0.4731324049359577), ('0a5DJZA0NWLIGWTMasTNYq', 0.47109886038189513), ('0a0C9jWzl6eUUhM6mxTwbn', 0.47094869404631795), ('0A0SL2WQhWgGV6phvwJgU7', 0.46786118578720803), ('0a1edJv75fmg5bnYDgeZ0v', 0.46589413105935606), ('0a0HuaT4Vm7FoYvccyRRQj', 0.4633682283846159), ('0a5cpbh8YLko5O9ZzhtaBa', 0.46252535758159874), ('0A0SrnP0qm15L5Hv27sMAI', 0.45977832615467756), ('0a0C2tbL45RMmL9EmEVC2R', 0.4589673582570983), ('0a0Ikpt3GH8xSKaMZm4RYw', 0.4584589742556114), ('0a4HRdmYYB4jQrc0bGGPkS', 0.45655152906766344), ('0A4ztiz48hDGPJyf0HECoH', 0.456030891044336), ('0A1lKioyXNKIBRGAzYMUQi', 0.45402781152438515), ('0a2BQTox8cxHkTZQzkIpf4', 0.4512574559199662), ('0a0jLPxKIjaDYUaZPbhsWO', 0.4507364639662578), ('0A2B0uRupGRFkIszMIoni5', 0.4500095045285661), ('0A1ejb0nplbJ7Z3qDF31D9', 0.4479978389630656), ('0a1jXZ8LRPJRYWA9mHLW7w', 0.44690753936068783), ('0A0rUcBRvpL436mIflNoVg', 0.4450037898419007), ('0a2rz6SLuoQagFLypkidtg', 0.43891998882658473), ('0A1iNmwn0Vj

In [25]:
def precision_at_k(query_id, k=5):
  """Compute precision over the k top-ranked documents for one query.

  Reads the notebook-level ``queries``, ``bm25_df`` and ``qrels``. A retrieved
  document without a judgement for this query counts as not relevant.

  Args:
    query_id: key of the query in ``queries`` (and query id in ``qrels``).
    k: number of top-ranked documents to consider; must be at least 1.

  Returns:
    A tuple ``(TP, FP, precision)`` where ``TP`` is the number of relevant
    documents among the top k, ``FP`` is ``k - TP`` and ``precision`` is
    ``TP / k``.

  Raises:
    ValueError: if ``k`` is smaller than 1.
  """
  if k < 1:
    raise ValueError(f'k must be at least 1, got {k}')

  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)

  # take only the document id, rather than score
  retrieved = [doc[0] for doc in doc_ranking[:k]]

  # judgements for this query only, so each retrieved document is one lookup
  # instead of a scan over the complete qrels list
  judgements = {doc_id: judgement
                for q, doc_id, judgement in qrels
                if q == query_id}

  TP = sum(judgements.get(doc_id, 0) for doc_id in retrieved)  # true positives
  FP = k - TP  # false positives

  precision = TP / k

  return TP, FP, precision


# Let's see what we get when we consider the top 5 ranking documents:
def print_precision_for_all_queries(k=5):
  """Print precision@k together with the TP and FP counts for every query in ``queries``."""
  for qid in queries:
    hits, misses, score = precision_at_k(qid, k=k)
    text = queries[qid]
    print(f'retrieved query "{text}" with precision @ {k}: {score} (TP: {hits}, FP: {misses})')
# Report precision at the default cut-off (k=5) for every query currently in `queries`.
print_precision_for_all_queries()

retrieved query "people" with precision @ 5: 0.0 (TP: 0, FP: 5)
retrieved query "hello world" with precision @ 5: 0.0 (TP: 0, FP: 5)
