In [1]:
import ir_text
from time import time

In [2]:
from os import listdir
from os.path import join

path = join('data', 'datasets')
datasets = list(dict.fromkeys([file.split('_')[0] for file in listdir(path)]))

dataset = datasets[5]
print('Available datasets :', datasets)
print('Current dataset :', dataset)

data_path = join(path, dataset + '_dataset.json')
queries_path = join(path, dataset + '_queries.json')
groundtruth_path = join(path, dataset + '_groundtruth.json')

Available datasets : ['cran', 'time', 'cisi', 'lisa', 'cacm', 'med']
Current dataset : med


In [3]:
import json

with open(data_path, 'r') as json_data:
    data = json.load(json_data)
print('Data length :', len(data['dataset']))

with open(queries_path, 'r') as json_queries:
    queries = json.load(json_queries)
print('Queries length :', len(queries['queries']))

with open(groundtruth_path, 'r') as json_groundtruth:
    groundtruth = json.load(json_groundtruth)
print('Ground truth length :', len(groundtruth['groundtruth']))

Data length : 1033
Queries length : 30
Ground truth length : 30


## Linear Index

### Construct

In [4]:
linear_index = ir_text.LinearIndex(data['dataset'], language = 'english')

start = time()
linear_index.construct()
print("Constructed in", round(time() - start, 3), 's')

Constructed in 2.423 s


### Search - Dice coefficient

In [5]:
results_linear = {'groundtruth' : []}
id_relevant = 1

start = time()
for query in queries['queries']: 
    # We'll only keep the 50 best results
    results_linear['groundtruth'].append({'relevant' : [article_id for article_id, score in linear_index.search(query)][:50], 'id' : id_relevant})
    id_relevant += 1
stop = time()
print('Searched for', len(results_linear['groundtruth']), 'queries in', round(stop - start, 3), 'seconds')
print('Average speed :', round((stop - start) / len(results_linear['groundtruth']), 3), 'query/s')

Searched for 30 queries in 0.804 seconds
Average speed : 0.027 query/s


### Evaluation

## Inverted Index

### Construct

In [6]:
inverted_index = ir_text.InvertedIndex(data['dataset'], language = 'english')
start = time()
inverted_index.construct()
print("Constructed in", round(time() - start, 3), 's')

100%|██████████| 1033/1033 [00:02<00:00, 404.57it/s]
100%|██████████| 1033/1033 [00:02<00:00, 365.27it/s]

Constructed in 2.836 s





### Search - Dice coefficient

In [7]:
results_inverted_dice = {'groundtruth' : []}
id_relevant = 1

start = time()
for query in queries['queries']: 
    # We'll only keep the 50 best results
    results_inverted_dice['groundtruth'].append({'relevant' : [article_id for article_id, score in inverted_index.search(query)][:50], 'id' : id_relevant})
    id_relevant += 1
stop = time()
print('Searched for', len(results_inverted_dice['groundtruth']), 'queries in', round(stop - start, 3), 'seconds')
print('Average speed :', round((stop - start) / len(results_inverted_dice['groundtruth']), 3), 'query/s')

Searched for 30 queries in 0.395 seconds
Average speed : 0.013 query/s


### Search - Term Frequency

In [8]:
results_inverted_tf = {'groundtruth' : []}
id_relevant = 1

start = time()
for query in queries['queries']: 
    # We'll only keep the 50 best results
    results_inverted_tf['groundtruth'].append({'relevant' : [article_id for article_id, score in inverted_index.search(query)][:50], 'id' : id_relevant})
    id_relevant += 1
stop = time()
print('Searched for', len(results_inverted_tf['groundtruth']), 'queries in', round(stop - start, 3), 'seconds')
print('Average speed :', round((stop - start) / len(results_inverted_tf['groundtruth']), 3), 'query/s')

Searched for 30 queries in 0.383 seconds
Average speed : 0.013 query/s



### Construct - Inverse Document Frequency

In [9]:
inverted_index_idf = ir_text.InvertedIndex(data['dataset'], language = 'english')

start = time()
inverted_index_idf.construct()
print("Constructed in", round(time() - start, 3), 's')

100%|██████████| 1033/1033 [00:02<00:00, 440.20it/s]
100%|██████████| 1033/1033 [00:02<00:00, 392.97it/s]

Constructed in 2.636 s





### Search - TFIDF

In [10]:
results_inverted_tfidf = {'groundtruth' : []}
id_relevant = 1

start = time()
for query in queries['queries']: 
    # We'll only keep the 50 best results
    results_inverted_tfidf['groundtruth'].append({'relevant' : [article_id for article_id, score in inverted_index.search(query)][:50], 'id' : id_relevant})
    id_relevant += 1
stop = time()
print('Searched for', len(results_inverted_tfidf['groundtruth']), 'queries in', round(stop - start, 3), 'seconds')
print('Average speed :', round((stop - start) / len(results_inverted_tfidf['groundtruth']), 3), 'query/s')

Searched for 30 queries in 0.378 seconds
Average speed : 0.013 query/s


## Evaluation

### Execution time
Variable : ```results_linear```

Index|Construction | Search|Total
-----|-------------|-------------|-----
Linear Index|1s | 2| 1
Inverted Index - Dice|1s | 2| 1
Inverted Index - TF|1s | 2| 1
Inverted Index - TFIDF|1s | 2| 1

### Inverted Index
Variables : ```results_inverted_dice```, ```results_inverted_tf``` and ```results_inverted_tfidf```