In [22]:
import numpy as np
import pandas as pd 

In [23]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        with open(os.path.join(dirname, filename)) as f:
            line_count = 0
            id_set = set()
            for l in f.readlines():
                line_count += 1
                if filename == "CISI.REL":
                    id_set.add(l.lstrip(" ").split(" ")[0])
                elif l.startswith(".I "):
                    id_set.add(l.split(" ")[1].strip())
            print(f"{filename} : {len(id_set)} items, over {line_count} lines.")

In [24]:
with open('datasets/CISI.ALL') as f:
    lines = ""
    for l in f.readlines():
        lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
    lines = lines.lstrip("\n").split("\n")
    
# print n lines
n = 5
for l in lines[:n]:
    print(l)

.I 1
.T 18 Editions of the Dewey Decimal Classifications
.A Comaromi, J.P.
.W The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad.
.X 1	5	1 92	1	1 262	1	1 556	1	1 1004	1	1 1024	1	1 1024	1	1


### Put each DOCUMENT into a dictionary ###

In [25]:
doc_set = {}
doc_id = ""
doc_text = ""
for l in lines:
    if l.startswith(".I"):
        doc_id = l.split(" ")[1].strip()
    elif l.startswith(".X"):
        doc_set[doc_id] = doc_text.lstrip(" ")
        doc_id = ""
        doc_text = ""
    else:
        doc_text += l.strip()[3:] + " " # The first 3 characters of a line can be ignored.

# Print something to see the dictionary structure, etc.
print(f"Number of documents = {len(doc_set)}" + ".\n")
print(doc_set["3"]) # note that the dictionary indexes are strings, not numbers. 

Number of documents = 1460.

Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organization and control of knowledge and information will inevitably enter our story, for writings contain, along with much else, a great deal of mankind's stock of knowledge and information.  Bibliographical control is a form of power, and if knowledge itself is a form of power, as the familiar slogan claims, bibliographical control is in a certain sense power over power, power to obtain the knowledge recorded in written form.  As writings are not simply, and not in any simple way, storehouses of knowledge, we cannot satisfactorily discuss bibliographical control as simply control over the knowledge and information contained in writings. 


In [26]:
print(doc_set.keys())

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157'

### Repeat with QUERY ###

In [27]:
with open('datasets/CISI.QRY') as f:
    lines = ""
    for l in f.readlines():
        lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
    lines = lines.lstrip("\n").split("\n")
    
qry_set = {}
qry_id = ""
for l in lines:
    if l.startswith(".I"):
        qry_id = l.split(" ")[1].strip()
    elif l.startswith(".W"):
        qry_set[qry_id] = l.strip()[3:]
        qry_id = ""
    
# Print something to see the dictionary structure, etc.
print(f"Number of queries = {len(qry_set)}" + ".\n")
print(qry_set["3"]) # note that the dictionary indexes are strings, not numbers. 

Number of queries = 112.

What is information science?  Give definitions where possible.


### Do the same with query => document MAPPING ###

In [29]:
rel_set = {}
with open('datasets/CISI.REL') as f:
    for l in f.readlines():
        qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]
        doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]
        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)
        if qry_id == "7":
            print(l.strip("\n"))
    
# Print something to see the dictionary structure, etc.
print(f"\nNumber of mappings = {len(rel_set)}" + ".\n")
print(rel_set["7"]) # note that the dictionary indexes are strings, not numbers. 

     7    310	0	0.000000
     7    320	0	0.000000
     7    332	0	0.000000
     7    375	0	0.000000
     7    376	0	0.000000
     7    645	0	0.000000
     7    724	0	0.000000
     7    725	0	0.000000

Number of mappings = 76.

['310', '320', '332', '375', '376', '645', '724', '725']


In [30]:
print('Read %s documents, %s queries and %s mappings from CISI dataset' % 
      (len(doc_set), len(qry_set), len(rel_set)))

number_of_rel_docs = [len(value) for key, value in rel_set.items()]
print('Average %.2f and %d min number of relevant documents by query ' % 
      (np.mean(number_of_rel_docs), np.min(number_of_rel_docs)))

print('Queries without relevant documents: ', 
      np.setdiff1d(list(qry_set.keys()),list(rel_set.keys())))

Read 1460 documents, 112 queries and 76 mappings from CISI dataset
Average 40.97 and 1 min number of relevant documents by query 
Queries without relevant documents:  ['103' '105' '106' '107' '108' '110' '112' '36' '38' '40' '47' '48' '51'
 '53' '59' '60' '63' '64' '68' '70' '72' '73' '74' '75' '77' '78' '80'
 '83' '85' '86' '87' '88' '89' '91' '93' '94']


In [31]:
qry_set.keys() #index of the queries

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112'])

In [32]:
import random

In [33]:
random.seed(42)
idx = random.sample(rel_set.keys(),1)[0]

print('Query ID %s ==>' % idx, qry_set[idx])
rel_docs = rel_set[idx]
print('Documents relevants to Query ID %s' % idx, rel_docs)
sample_document_idx = random.sample(rel_docs,1)[0]
print('Document ID %s ==>' % sample_document_idx, doc_set[sample_document_idx])

Query ID 15 ==> How much do information retrieval and dissemination systems, as well as automated libraries, cost? Are they worth it to the researcher and to industry?
Documents relevants to Query ID 15 ['18', '27', '36', '49', '56', '59', '67', '74', '83', '126', '158', '164', '167', '192', '214', '222', '223', '250', '281', '292', '295', '299', '307', '331', '336', '338', '348', '365', '366', '367', '368', '372', '381', '446', '458', '465', '466', '482', '490', '491', '495', '497', '507', '520', '528', '591', '594', '623', '629', '639', '690', '720', '723', '724', '727', '728', '731', '779', '822', '834', '839', '848', '849', '865', '872', '897', '1100', '1161', '1248', '1305', '1353', '1358', '1363', '1366', '1368', '1371', '1372', '1374', '1375', '1376', '1377', '1410']
Document ID 49 ==> Adaptive Information Dissemination Sage, C.R. Anderson, R.R. Fitzwater, D.R. Computer dissemination of information offers significant advantages over manual dissemination because the computer can 

since Python 3.9 and will be removed in a subsequent version.
  idx = random.sample(rel_set.keys(),1)[0]


### Index CISI dataset using BM25 ###

In [34]:
from rank_bm25 import BM25Okapi

In [35]:
query = qry_set[idx] #get query text
rel_docs = rel_set[idx] #get relevant documents

# Index all documents using BM25
corpus = list(doc_set.values())
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# Process query and get scores for each indexed document using BM25
tokenized_query = query.split(" ")
print('Query ==> ', query, '\nRelevant documents IDs: ==> ', rel_docs)
scores = bm25.get_scores(tokenized_query)
print(scores, len(scores), len(doc_set))

Query ==>  How much do information retrieval and dissemination systems, as well as automated libraries, cost? Are they worth it to the researcher and to industry? 
Relevant documents IDs: ==>  ['18', '27', '36', '49', '56', '59', '67', '74', '83', '126', '158', '164', '167', '192', '214', '222', '223', '250', '281', '292', '295', '299', '307', '331', '336', '338', '348', '365', '366', '367', '368', '372', '381', '446', '458', '465', '466', '482', '490', '491', '495', '497', '507', '520', '528', '591', '594', '623', '629', '639', '690', '720', '723', '724', '727', '728', '731', '779', '822', '834', '839', '848', '849', '865', '872', '897', '1100', '1161', '1248', '1305', '1353', '1358', '1363', '1366', '1368', '1371', '1372', '1374', '1375', '1376', '1377', '1410']
[13.84492303 12.72656528 17.02184487 ... 12.72737224 14.26660278
 14.10298505] 1460 1460
