In [19]:
import pandas as pd
def get_qrels(filename):
  # read the file from TrEC that contains the relevance scores
  with open(filename) as f:
      contents = f.read()
  # shorten the episode ID and split the time segment into seperate field
  lines = contents.replace('spotify:episode:','').replace('_','\t').split("\n")
  data = [line.split('\t') for line in lines]
  # create dataframe and remove the second column which seems to have no value
  df = pd.DataFrame(data,columns = ['query_id', 'useless','episode','segment','relevance'])
  df = df.drop('useless', axis=1)
  df['relevance'] = df['relevance'].astype(int)
  # the relevance scores are on scale 0-4, instead consider if it is relevant or not
  df['binary'] = df['relevance'] > 0
  df['binary'] = df['binary'].astype(int)
  # if an episode has relevance at 'some' point then consider the whole episode to be relevant
  df2 = df.groupby(['episode','query_id'])['binary'].max()
  # adjusting the dataframe into a list with (query_id, document_id, judgement)
  cols = ['query_id', 'episode', 'binary']
  df2 = df2.reset_index()
  df2['query_id'] = df2['query_id'].astype(int)
  qrels = df2[cols].values.tolist()
  return qrels

qrels = get_qrels("Files\/2020_train_qrels.list.txt")
qrels[:5]

[[7, '02cvhI2v4wORCGTvWQxF7P', 0],
 [8, '03G8qxp2IYaaX1RqYFwPs9', 0],
 [7, '04536ZKxCGV88Yj0TT0oYM', 0],
 [7, '047y46T88lyQEHIEEVWxgy', 0],
 [6, '04P7on6BaQikQsMmTVlMNB', 1]]

In [22]:
def get_queries(filename):
  # read the file from TrEC that contains the query titles
  with open(filename) as f:
    contents = f.read()
  from bs4 import BeautifulSoup
  soup = BeautifulSoup(contents)
  query_list = [query.text for query in soup.find_all('description')]
  # put the queries into a dictionary but need to start numbering at 1
  queries = {i+1: val for i, val in enumerate(query_list)}
  return queries
queries = get_queries('Files\podcasts_2020_topics_train.xml')

In [None]:
queries

{1: 'What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?',
 2: 'What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?',
 3: 'In May 2019 astronomers released the first-ever picture of a black hole. I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.',
 4: 'I remember hearing a podcast that had a story about a kid riding some kind of bird. I want to find it again.',
 5: 'Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of Spotify. I would like to find the show and episode that contains that interview. Other interviews with Ek are relevant as well.',
 6: 'Former First Lady Michelle Obama’s memoir Becoming was published in early 2019. What were people saying about it?',
 7: 'Anna Sorokina moved to 

In [25]:
import glob
import json
# read all the json files in the folder called Documents
path = 'Documents/*'
files = glob.glob(path)

def get_transcripts():
    transcripts = []
    ep_IDs = []
    titles = []
    # loop through each of the files extracting data
    for file in files:
        with open(file) as f:
          contents = json.load(f)
        #   show_ID is slightly misleading name as would not be unique so renamed ep_ID
          ep_ID = contents["showID"]
          ep_IDs.append(ep_ID)
        # the transcript is a list so change to string
        # will create a transcript+ later that also includes episode_description
          transcript = ''.join(contents["transcript"])
          transcripts.append(transcript)
          title = contents["show_name"] + " - " + contents["episode_name"]
          titles.append(title)
    return ep_IDs, transcripts, titles
ep_IDs, corpus, titles = get_transcripts()

In [26]:
# index
for i in range(len(ep_IDs)):
  print(f'{ep_IDs[i]}: {titles[i]}')

0a0C2tbL45RMmL9EmEVC2R: Do I Have A Story For You! - Do I Have A Story For You!
0a0C9jWzl6eUUhM6mxTwbn: Stay Classic: A Warcraft Podcast - Stay Classic: A Warcraft Podcast
0A0f1WM7IttoJ61xzWCPK5: Vhite Rabbit Podcast - Vhite Rabbit Podcast
0a0HuaT4Vm7FoYvccyRRQj: Back 2 HER  - Back 2 HER 
0a0Ikpt3GH8xSKaMZm4RYw: Pack-A-Day: Your Daily Packers Podcast - Pack-A-Day: Your Daily Packers Podcast
0a0iyqjSgKKZ49eOKZYpY9: The All Things Mavs Podcast - The All Things Mavs Podcast
0a0jLPxKIjaDYUaZPbhsWO: The Hawg Talk Podcast - The Hawg Talk Podcast
0A0MxX8L2YZEZgJGApE7w4: Thai Endzone Podcast - Thai Endzone Podcast
0A0rUcBRvpL436mIflNoVg: AFL Deep Dive - AFL Deep Dive
0A0SL2WQhWgGV6phvwJgU7: Variant Podcast - Variant Podcast
0A0SrnP0qm15L5Hv27sMAI: West Didsbury & Chorlton AFC - West Didsbury & Chorlton AFC
0A0TSLljzX8akWoSYqF6Hm: Unsolved Murders: True Crime Stories - Unsolved Murders: True Crime Stories
0A1bZfQ1C2FOUMfVJ7ugpa: Chompers - Chompers
0a1edJv75fmg5bnYDgeZ0v: Light and Love - Light

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# vectorize and get vocabulary
vectorizer = CountVectorizer(stop_words='english')
documents_vectorized = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()

ModuleNotFoundError: No module named 'sklearn'

In [None]:
index = dict(enumerate(ep_IDs))
index

{0: '0a0HuaT4Vm7FoYvccyRRQj',
 1: '0a5DJZA0NWLIGWTMasTNYq',
 2: '0A0SL2WQhWgGV6phvwJgU7',
 3: '0a4HRdmYYB4jQrc0bGGPkS',
 4: '0A1lKioyXNKIBRGAzYMUQi',
 5: '0A1iNmwn0VjegiQ8kBXc4u',
 6: '0a0iyqjSgKKZ49eOKZYpY9',
 7: '0A0SrnP0qm15L5Hv27sMAI',
 8: '0a0jLPxKIjaDYUaZPbhsWO',
 9: '0A0rUcBRvpL436mIflNoVg',
 10: '0a2rz6SLuoQagFLypkidtg',
 11: '0a1edJv75fmg5bnYDgeZ0v',
 12: '0A1ejb0nplbJ7Z3qDF31D9',
 13: '0A4rSk465szkl19Zb6UXA2',
 14: '0A1bZfQ1C2FOUMfVJ7ugpa',
 15: '0a1OhqAgMKmW5JwKWG7LbF',
 16: '0A2B0uRupGRFkIszMIoni5',
 17: '0A0MxX8L2YZEZgJGApE7w4',
 18: '0a1xawFR0oGJVP672q5ZuD',
 19: '0a2BQTox8cxHkTZQzkIpf4',
 20: '0A2xwMoDNIlwkJGsaFq68I',
 21: '0a1jXZ8LRPJRYWA9mHLW7w',
 22: '0A2jf2xBqee3KXv2AUfsp7',
 23: '0a1YNZdeCkdMTl61UQgs9M',
 24: '0A0f1WM7IttoJ61xzWCPK5',
 25: '0A5BLuSdVS2uQCC8uiuO2N',
 26: '0A0TSLljzX8akWoSYqF6Hm',
 27: '0a5cpbh8YLko5O9ZzhtaBa',
 28: '0a0Ikpt3GH8xSKaMZm4RYw',
 29: '0A4HFH7rgoBjz44K0ZABXl',
 30: '0a0C2tbL45RMmL9EmEVC2R',
 31: '0A1GTvNCft6B1bVD2Guioo',
 32: '0a0C9jWzl6eU

In [None]:
df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
df.index = list(ep_IDs)
df[:5]

Unnamed: 0,00,000,10,100,1000,100th,102,1099,10d,10th,...,zip,ziploc,zipping,zodiac,zombie,zombies,zone,zones,zoom,ëif
0a0HuaT4Vm7FoYvccyRRQj,0,0,0,0,0,0,2,0,0,0,...,0,0,0,5,0,0,0,0,0,0
0a5DJZA0NWLIGWTMasTNYq,1,2,2,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0A0SL2WQhWgGV6phvwJgU7,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
0a4HRdmYYB4jQrc0bGGPkS,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0A1lKioyXNKIBRGAzYMUQi,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def BM25_IDF_df(df):
  """
  This definition calculates BM25-IDF weights before hand as done last week
  """

  dfs = (df > 0).sum(axis=0)
  N = df.shape[0]
  idfs = -np.log(dfs / N)
  
  k_1 = 1.2
  b = 0.8
  dls = df.sum(axis=1) 
  avgdl = np.mean(dls)

  numerator = np.array((k_1 + 1) * df)
  denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) \
                         + np.array(df)

  BM25_tf = numerator / denominator

  idfs = np.array(idfs)

  BM25_score = BM25_tf * idfs
  return pd.DataFrame(BM25_score, columns=vocabulary)
bm25_df = BM25_IDF_df(df)
bm25_df.index = list(ep_IDs)

In [None]:
def retrieve_ranking(query, bm25_df):
  q_terms = query.split(' ')
  q_terms_only = bm25_df[q_terms]
  score_q_d = q_terms_only.sum(axis=1)
  return sorted(zip(bm25_df.index.values, score_q_d.values),
                key = lambda tup:tup[1],
                reverse=True)

In [None]:
# this produces error
# for count, query in enumerate(queries.values()):
#   print(f'Query {count}: {query}')
#   print('')
#   print(retrieve_ranking(query, bm25_df))
#   print('')

# retrieve for a known word
retrieve_ranking('people', bm25_df)

[('0A4rSk465szkl19Zb6UXA2', 0.4731324049359577),
 ('0a5DJZA0NWLIGWTMasTNYq', 0.47109886038189513),
 ('0a0C9jWzl6eUUhM6mxTwbn', 0.47094869404631795),
 ('0A0SL2WQhWgGV6phvwJgU7', 0.46786118578720803),
 ('0a1edJv75fmg5bnYDgeZ0v', 0.46589413105935606),
 ('0a0HuaT4Vm7FoYvccyRRQj', 0.4633682283846159),
 ('0a5cpbh8YLko5O9ZzhtaBa', 0.46252535758159874),
 ('0A0SrnP0qm15L5Hv27sMAI', 0.45977832615467756),
 ('0a0C2tbL45RMmL9EmEVC2R', 0.4589673582570983),
 ('0a0Ikpt3GH8xSKaMZm4RYw', 0.4584589742556114),
 ('0a4HRdmYYB4jQrc0bGGPkS', 0.45655152906766344),
 ('0A4ztiz48hDGPJyf0HECoH', 0.456030891044336),
 ('0A1lKioyXNKIBRGAzYMUQi', 0.45402781152438515),
 ('0a2BQTox8cxHkTZQzkIpf4', 0.4512574559199662),
 ('0a0jLPxKIjaDYUaZPbhsWO', 0.4507364639662578),
 ('0A2B0uRupGRFkIszMIoni5', 0.4500095045285661),
 ('0A1ejb0nplbJ7Z3qDF31D9', 0.4479978389630656),
 ('0a1jXZ8LRPJRYWA9mHLW7w', 0.44690753936068783),
 ('0A0rUcBRvpL436mIflNoVg', 0.4450037898419007),
 ('0a2rz6SLuoQagFLypkidtg', 0.43891998882658473),
 ('0A1iNmwn

# Unit testing

In [None]:
# Import libraries
import unittest

class test_Tom(unittest.TestCase):

    def test_get_qrels(self):
        qrels_file = 'qrels_test_file.txt'
        qrels_result = [[7, '1xxxx', 1],
                        [8, '2xxxx', 2],
                        [9, '3xxxx', 0]]
        self.assertCountEqual(get_qrels(qrels_file),
                              qrels_result)

    def test_get_queries(self):
        queries_file = 'queries_test_file.xml'
        queries_result = {1 : 'How do I get fit?',
                          2 : 'What is Barack Obamas middle name?'}
        self.assertDictEqual(get_queries(queries_file),
                             queries_result)

    def test_get_transcripts(self):
        path = '/Transcripts/*'
        files = glob.glob(path) 
        ep_IDs_result = ["1a", "2b"]
        corpus_result = [
            "Hi and welcome to this podcast about podcasts.Today, we will be talking about podcasts.",
            "It was probably misleading to call this a football podcast. Episode 1 will be about cheese, and I'm not promising it will ever actually come round to football."
        ]
        titles_result = [
            "The podcast show - The first episode"
            "Football or something - Let's not bother starting with football."
            ]
        ep_IDs, corpus, titles = get_transcripts()
        self.assertCountEqual(ep_IDs, ep_IDs_result)
        self.assertCountEqual(corpus, corpus_result)
        self.assertCountEqual(titles, titles_result)


    def test_BM25_IDF_df(self):
        import pandas.testing as pd_testing

        doc_index_dict = {
            'obama' : [0, 0, 1, 0, 1],
            'middle' : [1, 0, 0, 0, 0],
            'spotify' : [0, 0, 0, 1, 0]
        }
        doc_index_df = pd.DataFrame(doc_index_dict)
        doc_index_episode_ids = ['0xxxx','1xxxx', '2xxxx', '3xxxx', '4xxxx']
        doc_index_df.index = doc_index_episode_ids

        doc_index_result_dict = {
            'obama' : [0, 0, 0.9881566716289908, 0, 0.9881566716289908],
            'middle' : [1.7356683369387358, 0, 0, 0, 0],
            'spotify' : [0, 1.0499164636058027, 0, 1, 0.9881566716289908]                    
        }
        doc_index_result_df = pd.DataFrame(doc_index_result_dict)

        pd_testing.assert_frame_equal(Tom.BM25_IDF_df(doc_index_df),
                                      doc_index_result_df)