In [12]:
import pandas as pd
def get_qrels(filename):
  # read the file from TrEC that contains the relevance scores
  with open(filename) as f:
      contents = f.read()
  # shorten the episode ID and split the time segment into seperate field
  lines = contents.replace('spotify:episode:','').replace('_','\t').split("\n")
  data = [line.split('\t') for line in lines]
  # create dataframe and remove the second column which seems to have no value
  df = pd.DataFrame(data,columns = ['query_id', 'useless','episode','segment','relevance'])
  df = df.drop('useless', axis=1)
  df['relevance'] = df['relevance'].astype(int)
  # the relevance scores are on scale 0-4, instead consider if it is relevant or not
  df['binary'] = df['relevance'] > 0
  df['binary'] = df['binary'].astype(int)
  
  #this is the change to get the IDs starting from zero, not 1 like TREC had defined
  df['query_id'] = df['query_id'].astype(int)
  df['query_id'] = df['query_id']-1
  
  # if an episode has relevance at 'some' point then consider the whole episode to be relevant
  df2 = df.groupby(['episode','query_id'])['binary'].max()
  # adjusting the dataframe into a list with (query_id, document_id, judgement)
  cols = ['query_id', 'episode', 'binary']
  df2 = df2.reset_index()
  df2['query_id'] = df2['query_id'].astype(int)
  qrels = df2[cols].values.tolist()
  return qrels

qrels = get_qrels("Files/2020_train_qrels.list.txt")
qrels[:5]

[[6, '02cvhI2v4wORCGTvWQxF7P', 0],
 [7, '03G8qxp2IYaaX1RqYFwPs9', 0],
 [6, '04536ZKxCGV88Yj0TT0oYM', 0],
 [6, '047y46T88lyQEHIEEVWxgy', 0],
 [5, '04P7on6BaQikQsMmTVlMNB', 1]]

In [8]:
def get_test_qrels(filename):
  # read the file from TrEC that contains the relevance scores
    with open(filename) as f:
        contents = f.read()
    # shorten the episode ID and split the time segment into seperate field
    lines = contents.replace('spotify:episode:','').replace('_','\t').replace(' ','\t').split("\n")
    data = [line.split('\t') for line in lines]
    # create dataframe and remove the second column which seems to have no value
    df = pd.DataFrame(data,columns = ['query_id', 'useless','episode','segment','relevance'])
    df = df.drop('useless', axis=1)
    # to resolve that final row has 'None' entries
    df = df.drop(index=9426, axis=0)
    df['relevance'] = df['relevance'].astype(int)
    # the relevance scores are on scale 0-4, instead consider if it is relevant or not
    df['binary'] = df['relevance'] > 0
    df['binary'] = df['binary'].astype(int)
    # adjust query_id to start from 0
    df['query_id'] = df['query_id'].astype(int)
    df['query_id'] = df['query_id']-9
    # if an episode has relevance at 'some' point then consider the whole episode to be relevant
    df2 = df.groupby(['episode','query_id'])['binary'].max()
    # adjusting the datagrame into a list with (query_id, document_id, judgement)
    cols = ['query_id', 'episode', 'binary']
    df2 = df2.reset_index()
    df2['query_id'] = df2['query_id'].astype(int)
    qrels = df2[cols].values.tolist()
    return qrels
test_qrels = get_test_qrels("Files/2020_test_qrels.list.txt")
test_qrels[:5]

[[47, '003egucoR0umViUsMV0BaT', 1],
 [25, '00Enzfnt56rdTlK3MVYlmc', 0],
 [31, '00Le5AuqCKplHPI8FQMNPF', 1],
 [44, '00XqPiHMgFqiqpNwM76Fwm', 1],
 [8, '00YjLV45iqUYvAh3fJeISO', 0]]

In [7]:
# this works for both train and test sets
# amended to now include query with description
def get_queries(filename):
  # read the file from TrEC that contains the query titles
  with open(filename) as f:
    contents = f.read()
  lines = contents.replace('<query>','\t').replace('</query>','\t').replace('</description>','\t').replace('<description>','\t').split("\t")
  data = [line.split('\t') for line in lines]
  y = data[1::2]
  short = y[0::2]
  longer = y[1::2]
  full_list = []
  for i in range(len(short)):
    temp_list = ' '.join(short[i]) + ' ' + ' '.join(longer[i])
    full_list.append(temp_list)
  # put the queries into a dictionary but need to start numbering at 1
  queries = {i: val for i, val in enumerate(full_list)}
  return queries
queries = get_queries('Files/train_topics.txt')
queries

{0: 'coronavirus spread What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?',
 1: 'greta thunberg cross atlantic What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?',
 2: 'black hole image In May 2019 astronomers released the first-ever picture of a black hole. I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.',
 3: 'story about riding a bird I remember hearing a podcast that had a story about a kid riding some kind of bird. I want to find it again.',
 4: 'daniel ek interview Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of Spotify. I would like to find the show and episode that contains that interview. Other interviews with Ek are relevant as well.',
 5: 'michelle obama becoming Former F

In [5]:
import glob
import json
# read all the json files in the folder called Documents
path = 'Documents/*'
files = glob.glob(path)

def get_transcripts(inc_desc=False):
    transcripts = []
    ep_IDs = []
    titles = []
    durations = []
    # loop through each of the files extracting data
    for file in files:
        with open(file) as f:
          contents = json.load(f)
        # show_ID is slightly misleading name as would not be unique so renamed ep_ID
          ep_ID = contents["showID"]
          ep_IDs.append(ep_ID)
        # the transcript is a list so change to string
          transcript = ''.join(contents["transcript"])
        # parameter set to true then will include episode information (name and description) in the corpus
          if inc_desc:
              ep_info = contents["episode_name"] + contents["episode_description"]
              transcript = transcript + ep_info
          transcripts.append(transcript)
          title = contents["show_name"] + " - " + contents["episode_name"]
          titles.append(title)
        # episode
          duration = contents["duration"]
          durations.append(duration)
    return ep_IDs, transcripts, titles, durations

In [9]:
ep_IDs, corpus, titles, durations = get_transcripts(inc_desc=True)
corpus[:2]

 "There were two more murders 15 miles away arrived. They found the tell me how the investigator as reminiscent of see something say something. It's a mantra many live by if you see something strange call it in or make someone aware even if it seems innocuous. Jennifer San Marco had Strange Behaviors. It was clear to many that the woman suffer from mental illness that was being untreated but many wrote it off and on January 30th 2006 her strange behavior is bubbled over and the Goleta postal facility shootings began. So if you like your coffee hot, but your bones chilled sabacc and start your day with a morning cup of murder. On January 30th 2006 a woman entered the US Postal Service processing plant in Goleta California with a plan. She tailgated the car ahead of her and passed through the gate while in the parking lot shot and killed see Fairchild then turn to Malika Higgins and shot her at point-blank range. Next was Nikola Grant upon hearing the shots many of the employees ran to t

In [26]:
# index
for i in range(len(ep_IDs)):
  print(f'{ep_IDs[i]}: {titles[i]}')

0a0C2tbL45RMmL9EmEVC2R: Do I Have A Story For You! - Do I Have A Story For You!
0a0C9jWzl6eUUhM6mxTwbn: Stay Classic: A Warcraft Podcast - Stay Classic: A Warcraft Podcast
0A0f1WM7IttoJ61xzWCPK5: Vhite Rabbit Podcast - Vhite Rabbit Podcast
0a0HuaT4Vm7FoYvccyRRQj: Back 2 HER  - Back 2 HER 
0a0Ikpt3GH8xSKaMZm4RYw: Pack-A-Day: Your Daily Packers Podcast - Pack-A-Day: Your Daily Packers Podcast
0a0iyqjSgKKZ49eOKZYpY9: The All Things Mavs Podcast - The All Things Mavs Podcast
0a0jLPxKIjaDYUaZPbhsWO: The Hawg Talk Podcast - The Hawg Talk Podcast
0A0MxX8L2YZEZgJGApE7w4: Thai Endzone Podcast - Thai Endzone Podcast
0A0rUcBRvpL436mIflNoVg: AFL Deep Dive - AFL Deep Dive
0A0SL2WQhWgGV6phvwJgU7: Variant Podcast - Variant Podcast
0A0SrnP0qm15L5Hv27sMAI: West Didsbury & Chorlton AFC - West Didsbury & Chorlton AFC
0A0TSLljzX8akWoSYqF6Hm: Unsolved Murders: True Crime Stories - Unsolved Murders: True Crime Stories
0A1bZfQ1C2FOUMfVJ7ugpa: Chompers - Chompers
0a1edJv75fmg5bnYDgeZ0v: Light and Love - Light

In [12]:
index = dict(enumerate(ep_IDs))
index

{0: '00BlbbH2PvBJ8M3CApZ7ou',
 1: '000HP8n3hNIfglT2wSI2cA',
 2: '00h18NlParejEuFrdCf5dC',
 3: '00f13q3KxUWM67GjNv12ij',
 4: '00gQxUFKCvFhYQfZNFofo6',
 5: '00fg6LJN0BZF7tWiQN0Ywo',
 6: '00CoLqpC5J4vqYzGa7SDkS',
 7: '00d5EDsEf5qmUhoF8m5IvJ',
 8: '00iMyJK2hN1jhcpyK2xkHO',
 9: '00bQYCFm5Xfzn6RSUEBS9m',
 10: '00HGJXXGgvppuVCvdyEO4B',
 11: '00iI06WDE5LoQk0a0Z3pfj',
 12: '00bUBQRAVOMU42g1gXlzlX',
 13: '00btWOTB484ro3OgZKDle6',
 14: '00e3nyxpqhhreydZjZag7H',
 15: '000A9sRBYdVh66csG2qEdj',
 16: '00i4t4ifo0QAz95oZlnUiV',
 17: '00ENEumWcPZcXVgmJdMYMw',
 18: '00cpOS24Y5536ZbzDirP3K',
 19: '00G2HnDiIPFHXlclQTI03y',
 20: '00jdHz1eigCLYp4dLBrBae',
 21: '00APTUqFgGqIgcdRz0G6Gg',
 22: '00b1ZswBvK0ZH3XMbmTfqi',
 23: '00irXoP4wc6yOoC67PyKg5',
 24: '00gQsEcqYYZNDhnsFlPSfi',
 25: '00dGTzs1TgUbVtZnWL6qT8',
 26: '00i0DCBy8w68h8NStQbi2j',
 27: '00eo6cCvQbPvesSzZ18Vnm',
 28: '00HHtj8tNmu8rquP1R4u9F',
 29: '00A08geZvrEL1woiaAFT9x'}

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# vectorize and get vocabulary
vectorizer = CountVectorizer(stop_words='english')
documents_vectorized = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()

In [19]:
df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
df.index = list(ep_IDs)
df[:5]

Unnamed: 0,00,000,02,07,10,100,101i,103,104,105,...,zebras,zelda,zemeckis,zero,zeus,zidane,zinc,zinedine,zkulptor,zoram
00BlbbH2PvBJ8M3CApZ7ou,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
000HP8n3hNIfglT2wSI2cA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00h18NlParejEuFrdCf5dC,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00f13q3KxUWM67GjNv12ij,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00gQxUFKCvFhYQfZNFofo6,8,0,0,0,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
def BM25_IDF_df(df):
  """
  This definition calculates BM25-IDF weights before hand as done last week
  """

  dfs = (df > 0).sum(axis=0)
  N = df.shape[0]
  idfs = -np.log(dfs / N)
  
  k_1 = 1.2
  b = 0.8
  dls = df.sum(axis=1) 
  avgdl = np.mean(dls)

  numerator = np.array((k_1 + 1) * df)
  denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) \
                         + np.array(df)

  BM25_tf = numerator / denominator

  idfs = np.array(idfs)

  BM25_score = BM25_tf * idfs
  return pd.DataFrame(BM25_score, columns=vocabulary)
bm25_df = BM25_IDF_df(df)
bm25_df.index = list(ep_IDs)

In [21]:
def retrieve_ranking(query, bm25_df):
  q_terms = query.split(' ')
  q_terms_only = bm25_df[q_terms]
  score_q_d = q_terms_only.sum(axis=1)
  return sorted(zip(bm25_df.index.values, score_q_d.values),
                key = lambda tup:tup[1],
                reverse=True)

In [37]:
dummy_queries = {1: 'people', 2: 'hello world'}
print(dummy_queries)
queries = dummy_queries

{1: 'people', 2: 'hello world'}


In [23]:
for count, query in enumerate(queries.values()):
  print(f'Query {count}: {query}')
  print('')
  print(retrieve_ranking(query, bm25_df))
  print('')

# example retrieve
# retrieve_ranking('people', bm25_df)

Query 0: people

[('00irXoP4wc6yOoC67PyKg5', 0.14762428278751952), ('00i4t4ifo0QAz95oZlnUiV', 0.14743213680058057), ('00bUBQRAVOMU42g1gXlzlX', 0.14731189130670203), ('00btWOTB484ro3OgZKDle6', 0.1468988247685183), ('00d5EDsEf5qmUhoF8m5IvJ', 0.14628263933962635), ('00CoLqpC5J4vqYzGa7SDkS', 0.14627131890129733), ('00e3nyxpqhhreydZjZag7H', 0.14462299996682218), ('00f13q3KxUWM67GjNv12ij', 0.14293746010670766), ('000A9sRBYdVh66csG2qEdj', 0.14237745198122587), ('00cpOS24Y5536ZbzDirP3K', 0.13931504778382156), ('00BlbbH2PvBJ8M3CApZ7ou', 0.1387181170818666), ('00gQxUFKCvFhYQfZNFofo6', 0.13837078072364575), ('00eo6cCvQbPvesSzZ18Vnm', 0.13822928847340865), ('00ENEumWcPZcXVgmJdMYMw', 0.1380457344072328), ('00h18NlParejEuFrdCf5dC', 0.13760759928093613), ('00iI06WDE5LoQk0a0Z3pfj', 0.13745348686704156), ('00dGTzs1TgUbVtZnWL6qT8', 0.13657065895974047), ('00APTUqFgGqIgcdRz0G6Gg', 0.13411150890377357), ('00HGJXXGgvppuVCvdyEO4B', 0.13381735383551965), ('00gQsEcqYYZNDhnsFlPSfi', 0.1332758686102517), ('00HH

In [39]:
dummy_qrels = [[1, '00irXoP4wc6yOoC67PyKg5', 1],
 [1, '00i4t4ifo0QAz95oZlnUiV', 0], [1, '000HP8n3hNIfglT2wSI2cA', 1],
 [2, '00d5EDsEf5qmUhoF8m5IvJ', 1], [2, '000A9sRBYdVh66csG2qEdj', 1],
 [2, '00A08geZvrEL1woiaAFT9x', 0]]
qrels = dummy_qrels

In [40]:
k=3
def precision_at_k(query_id, k):
  """This function considers the k top ranking documents."""
  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)

  # take only the document id, rather than the score
  retrieved = [doc[0] for doc in doc_ranking[:k]]
  print(retrieved)
  retrieved_relevant = [ep_ID for ep_ID in retrieved if [query_id, ep_ID, 1] in qrels]
  
  print(retrieved_relevant)
  TP = len(retrieved_relevant)  # number of true positives
  FP = k - TP  # number of false positives
  precision = TP / k

  return TP, FP, precision

def print_precision_for_all_queries(k):
  for query_id, query in queries.items():
    TP, FP, precision = precision_at_k(query_id, k=k) 
    print(f'retrieved query "{query}" with precision @ {k}: {precision} (TP: {TP}, FP: {FP})')

print_precision_for_all_queries(k)

['00irXoP4wc6yOoC67PyKg5', '00i4t4ifo0QAz95oZlnUiV', '00bUBQRAVOMU42g1gXlzlX']
['00irXoP4wc6yOoC67PyKg5']
retrieved query "people" with precision @ 3: 0.3333333333333333 (TP: 1, FP: 2)
['00jdHz1eigCLYp4dLBrBae', '00d5EDsEf5qmUhoF8m5IvJ', '000A9sRBYdVh66csG2qEdj']
['00d5EDsEf5qmUhoF8m5IvJ', '000A9sRBYdVh66csG2qEdj']
retrieved query "hello world" with precision @ 3: 0.6666666666666666 (TP: 2, FP: 1)


# Unit testing

In [18]:
# Import libraries
import unittest

class test_Tom(unittest.TestCase):

    def test_get_qrels(self):
        qrels_file = 'qrels_test_file.txt'
        qrels_result = [[7, '1xxxx', 1],
                        [8, '2xxxx', 2],
                        [9, '3xxxx', 0]]
        self.assertCountEqual(get_qrels(qrels_file),
                              qrels_result)

    def test_get_queries(self):
        queries_file = 'queries_test_file.xml'
        queries_result = {1 : 'How do I get fit?',
                          2 : 'What is Barack Obamas middle name?'}
        self.assertDictEqual(get_queries(queries_file),
                             queries_result)

    def test_get_transcripts(self):
        path = '/Transcripts/*'
        files = glob.glob(path) 
        ep_IDs_result = ["1a", "2b"]
        corpus_result = [
            "Hi and welcome to this podcast about podcasts.Today, we will be talking about podcasts.",
            "It was probably misleading to call this a football podcast. Episode 1 will be about cheese, and I'm not promising it will ever actually come round to football."
        ]
        titles_result = [
            "The podcast show - The first episode"
            "Football or something - Let's not bother starting with football."
            ]
        ep_IDs, corpus, titles = get_transcripts()
        self.assertCountEqual(ep_IDs, ep_IDs_result)
        self.assertCountEqual(corpus, corpus_result)
        self.assertCountEqual(titles, titles_result)


    def test_BM25_IDF_df(self):
        import pandas.testing as pd_testing

        doc_index_dict = {
            'obama' : [0, 0, 1, 0, 1],
            'middle' : [1, 0, 0, 0, 0],
            'spotify' : [0, 0, 0, 1, 0]
        }
        doc_index_df = pd.DataFrame(doc_index_dict)
        doc_index_episode_ids = ['0xxxx','1xxxx', '2xxxx', '3xxxx', '4xxxx']
        doc_index_df.index = doc_index_episode_ids

        doc_index_result_dict = {
            'obama' : [0, 0, 0.9881566716289908, 0, 0.9881566716289908],
            'middle' : [1.7356683369387358, 0, 0, 0, 0],
            'spotify' : [0, 1.0499164636058027, 0, 1, 0.9881566716289908]                    
        }
        doc_index_result_df = pd.DataFrame(doc_index_result_dict)

        pd_testing.assert_frame_equal(Tom.BM25_IDF_df(doc_index_df),
                                      doc_index_result_df)