In [1]:
import ir_text

In [2]:
from os import listdir
from os.path import join

path = join('data', 'datasets')
datasets = list(dict.fromkeys([file.split('_')[0] for file in listdir(path)]))

dataset = datasets[0]
print('Available datasets :', datasets)
print('Current dataset :', dataset)

data_path = join(path, dataset + '_dataset.json')
queries_path = join(path, dataset + '_queries.json')
groundtruth_path = join(path, dataset + '_groundtruth.json')

Available datasets : ['cran', 'time', 'cisi', 'lisa', 'cacm', 'med']
Current dataset : cran


In [3]:
import json

with open(data_path, 'r') as json_data:
    data = json.load(json_data)
print('Data length :', len(data['dataset']))

with open(queries_path, 'r') as json_queries:
    queries = json.load(json_queries)
print('Queries length :', len(queries['queries']))

with open(groundtruth_path, 'r') as json_groundtruth:
    groundtruth = json.load(json_groundtruth)
print('Ground truth length :', len(groundtruth['groundtruth']))

Data length : 1398
Queries length : 225
Ground truth length : 225


## Linear Index

### Construct

In [4]:
linear_index = ir_text.LinearIndex(data['dataset'], language = 'english')
linear_index.construct()

### Search - Dice coefficient

In [5]:
from time import time

results_linear = {'groundtruth' : []}

start = time()
for query in queries['queries'] : 
    results_linear['groundtruth'].append({'relevant' : linear_index.search(query), 'id' : query['id']})
stop = time()
print('Searched for', len(results_linear['groundtruth']), 'queries in', stop - start, 'seconds')
print('Average speed :', (stop - start) / len(results_linear['groundtruth']), 'query/s')

Searched for 225 queries in 7.493617057800293 seconds
Average speed : 0.033304964701334636 query/s


## Inverted Index

### Construct

In [4]:
inverted_index = ir_text.InvertedIndex(data['dataset'], language = 'english')
inverted_index.construct()

### Search - Dice coefficient

In [5]:
from time import time

results_inverted_dice = {'groundtruth' : []}

start = time()
for query in queries['queries'] : 
    results_inverted_dice['groundtruth'].append({'relevant' : inverted_index.search(query, ir_text.Measures.DICE), 'id' : query['id']})
stop = time()
print('Searched for', len(results_inverted_dice['groundtruth']), 'queries in', stop - start, 'seconds')
print('Average speed :', (stop - start) / len(results_inverted_dice['groundtruth']), 'query/s')

Searched for 225 queries in 7.493617057800293 seconds
Average speed : 0.033304964701334636 query/s


### Search - Term Frequency

In [5]:
from time import time

results_inverted_tf = {'groundtruth' : []}

start = time()
for query in queries['queries'] : 
    results_inverted_tf['groundtruth'].append({'relevant' : inverted_index.search(query, ir_text.Measures.TF), 'id' : query['id']})
stop = time()
print('Searched for', len(results_inverted_tf['groundtruth']), 'queries in', stop - start, 'seconds')
print('Average speed :', (stop - start) / len(results_inverted_tf['groundtruth']), 'query/s')

Searched for 225 queries in 7.493617057800293 seconds
Average speed : 0.033304964701334636 query/s



### Construct - Inverse Document Frequency

In [4]:
inverted_index_idf = ir_text.InvertedIndex(data['dataset'], language = 'english')
inverted_index_idf.construct()

### Search - TFIDF

In [5]:
from time import time

results_inverted_tfidf = {'groundtruth' : []}

start = time()
for query in queries['queries'] : 
    results_inverted_tfidf['groundtruth'].append({'relevant' : inverted_index_idf.search(query, ir_text.Measures.TF), 'id' : query['id']})
stop = time()
print('Searched for', len(results_inverted_tfidf['groundtruth']), 'queries in', stop - start, 'seconds')
print('Average speed :', (stop - start) / len(results_inverted_tfidf['groundtruth']), 'query/s')

Searched for 225 queries in 7.493617057800293 seconds
Average speed : 0.033304964701334636 query/s


## Evaluation

### Linear Index
Variable : ```results_linear```

### Inverted Index
Variables : ```results_inverted_dice```, ```results_inverted_tf``` and ```results_inverted_tfidf```