In [1]:
"""Convert source document to tkeir indexer document
Author : Eric Blaudez (Eric Blaudez)

Copyright (c) 2022 THALES 
All Rights Reserved.
"""
import os
import sys
import traceback
import pandas as pd
import json
import numpy as np


dir_path = os.path.abspath('')

sys.path.insert(0, os.path.abspath(os.path.join(dir_path, "../../")))
sys.path.insert(0, os.path.abspath(os.path.join(dir_path, "../")))
sys.path.insert(0, os.path.abspath(os.path.join(dir_path, "./")))

from thot.tasks.evaluation.IREval import IREval

# Initialize Axeleria Information Retrieval Evaluation module

In [2]:
ireval=IREval()

In [3]:
eval_file="/home/tkeir_svc/tkeir/configs/axeleria/test/evalrequest-human-writing-run1.json" 
eval_name="dryrun"
output_dir="/tmp"

# Evaluation

## Display queries

In [4]:
with open(eval_file) as eval_f:
    data = json.load(eval_f)
    eval_f.close()
    print(json.dumps(data, indent=2))

{
  "run-name": "Query system with the queries written by a human",
  "system-configuration": "/home/tkeir_svc/tkeir/configs/test/configs/searching.json",
  "queries": [
    {
      "description": "From Alain - general summary. Answering : Patent cn102924940",
      "query": "The invention relates to a thermoplastic composition, comprising:\n(1) an acrylonitrile-butadiene-styrene (ABS) polymer;\n(2) natural fibers;\n(3) a compatibilizing polymer; and\n(4) processing aids comprising a lubricant and titanium dioxide; wherein the amount of titanium dioxide is 1 - 10 weight%.\nA molded article prepared from the thermoplastic composition is also claimed. Use of the thermoplastic composition in extrusion, injection, compression molding, and 3D printing, is also concerned.\n",
      "target-document": "file://33f4c3d5-0a4e-4a65-874a-885736c2b114.csv:questel:8103268-line:0000001084",
      "qid": 1,
      "docid": "1",
      "relevance": 1
    },
    {
      "description": "From Alain - genera

## Run evaluation

In [5]:
ireval.evaluate(output_dir,eval_name,eval_file)

[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:00:19,719) - Level(INFO) PID(40688)] /home/tkeir_svc/tkeir/configs/test/configs/searching.json loaded.


[nltk_data] Downloading package punkt to
[nltk_data]     /home/tkeir_svc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:00:20,803) - Level(INFO) PID(40688)] Load tokenizer:/home/tkeir_svc/tkeir/thot/tests/data/tkeir_mwe.pkl
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:00:24,178) - Level(INFO) PID(40688)] Load normalization tokenizer:/home/tkeir_svc/tkeir/thot/tests/data/tokenizer-rules.json
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:00:26,935) - Level(INFO) PID(40688)] Load mwe:/home/tkeir_svc/tkeir/thot/tests/data/tkeir_mwe.pkl
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:00:33,886) - Level(INFO) PID(40688)] Load Syntactic Rules
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:00:33,891) - Level(INFO) PID(40688)] Run query:[1] : From Alain - general summary. Answering : Patent cn102924940
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:01:03,729) - Level(INFO) PID(40688)] Run query:[2] : From - Alain General summary 2. Answering : Patent cn102924940
[

# Display results

In [6]:
df = pd.read_csv("/tmp/results.dryrun.csv")
print(df.head())

  query  P_10  P_100  P_1000      P_15  P_20  P_200      P_30  P_5  P_500  \
0   all   0.2   0.02   0.002  0.133333   0.1   0.01  0.066667  0.4  0.004   

   ...   TFN_100     R_200    F1_200   TFN_200     R_500    F1_500   TFN_500  \
0  ...  0.120289  0.166667  0.018868  0.060036  0.166667  0.007812  0.024002   

     R_1000   F1_1000 TFN_1000  
0  0.166667  0.003953    0.012  

[1 rows x 41 columns]


## Sample sizes


In [7]:
print("Number of queries:",df["num_q"].values[0])
print("Number of relevant documents:",df["num_rel"].values[0])
print("Number of relevant documents returned:",df["num_rel_ret"].values[0])


Number of queries: 3
Number of relevant documents: 12
Number of relevant documents returned: 6


## Precisions, Recall, F1, False Negative Rate @K


In [8]:
df_prec = pd.DataFrame([
    [5,df["P_5"].values[0]],
    [10,df["P_10"].values[0]],
    [15,df["P_15"].values[0]],
    [20,df["P_20"].values[0]],
    [30,df["P_30"].values[0]],
    [100,df["P_100"].values[0]],
    [200,df["P_200"].values[0]],
    [500,df["P_500"].values[0]],
    [1000,df["P_1000"].values[0]]],columns=["@K","precision"])
df_rec = pd.DataFrame([
    [15,df["R_15"].values[0]],
    [20,df["R_20"].values[0]],
    [30,df["R_30"].values[0]],
    [100,df["R_100"].values[0]],
    [200,df["R_200"].values[0]],
    [500,df["R_500"].values[0]],
    [1000,df["R_1000"].values[0]]],columns=["@K","recall"])
df_f1 = pd.DataFrame([    
    [15,df["F1_15"].values[0]],
    [20,df["F1_20"].values[0]],
    [30,df["F1_30"].values[0]],
    [100,df["F1_100"].values[0]],
    [200,df["F1_200"].values[0]],
    [500,df["F1_500"].values[0]],
    [1000,df["F1_1000"].values[0]]],columns=["@K","f1"])
df_fn = pd.DataFrame([    
    [15,df["TFN_15"].values[0]],
    [20,df["TFN_20"].values[0]],
    [30,df["TFN_30"].values[0]],
    [100,df["TFN_100"].values[0]],
    [200,df["TFN_200"].values[0]],
    [500,df["TFN_500"].values[0]],
    [1000,df["TFN_1000"].values[0]]],columns=["@K","false negative rate"])


In [9]:
df_prec

Unnamed: 0,@K,precision
0,5,0.4
1,10,0.2
2,15,0.133333
3,20,0.1
4,30,0.066667
5,100,0.02
6,200,0.01
7,500,0.004
8,1000,0.002


In [10]:
df_rec

Unnamed: 0,@K,recall
0,15,0.166667
1,20,0.166667
2,30,0.166667
3,100,0.166667
4,200,0.166667
5,500,0.166667
6,1000,0.166667


In [11]:
df_f1

Unnamed: 0,@K,f1
0,15,0.148148
1,20,0.125
2,30,0.095238
3,100,0.035714
4,200,0.018868
5,500,0.007812
6,1000,0.003953


In [12]:
df_fn

Unnamed: 0,@K,false negative rate
0,15,0.895522
1,20,0.638298
2,30,0.410959
3,100,0.120289
4,200,0.060036
5,500,0.024002
6,1000,0.012


## Other metrics

In [13]:
print("MAP (Mean Average Precision):",df["map"].values[0])

MAP (Mean Average Precision): 0.5


## Quick run on query

In [14]:
ireval.evaluateQuery("natural wood fiber","file://33f4c3d5-0a4e-4a65-874a-885736c2b114.csv:questel:8103268-line:0000001084","/home/tkeir_svc/tkeir/configs/test/configs/searching.json","/tmp/","customquery")

[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:11,104) - Level(INFO) PID(40688)] /home/tkeir_svc/tkeir/configs/test/configs/searching.json loaded.
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:11,104) - Level(INFO) PID(40688)] /home/tkeir_svc/tkeir/configs/test/configs/searching.json loaded.


[nltk_data] Downloading package punkt to
[nltk_data]     /home/tkeir_svc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:11,809) - Level(INFO) PID(40688)] Load tokenizer:/home/tkeir_svc/tkeir/thot/tests/data/tkeir_mwe.pkl
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:11,809) - Level(INFO) PID(40688)] Load tokenizer:/home/tkeir_svc/tkeir/thot/tests/data/tkeir_mwe.pkl
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:14,638) - Level(INFO) PID(40688)] Load normalization tokenizer:/home/tkeir_svc/tkeir/thot/tests/data/tokenizer-rules.json
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:14,638) - Level(INFO) PID(40688)] Load normalization tokenizer:/home/tkeir_svc/tkeir/thot/tests/data/tokenizer-rules.json
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:17,299) - Level(INFO) PID(40688)] Load mwe:/home/tkeir_svc/tkeir/thot/tests/data/tkeir_mwe.pkl
[ThotLogger.py:102 -                 info()][Time(2021-05-31 12:02:17,299) - Level(INFO) PID(40688)] Load mwe:/home/tkeir_sv

In [15]:
df = pd.read_csv("/tmp/results.customquery.csv")
print(df.head())

  query  P_10  P_100  P_1000      P_15  P_20  P_200      P_30  P_5  P_500  \
0   all   0.1   0.01   0.001  0.066667  0.05  0.005  0.033333  0.2  0.002   

   ...   TFN_100  R_200   F1_200  TFN_200  R_500    F1_500  TFN_500  R_1000  \
0  ...  0.010001    1.0  0.00995    0.005    1.0  0.003992    0.002     1.0   

    F1_1000 TFN_1000  
0  0.001998    0.001  

[1 rows x 47 columns]


In [16]:
print("Number of queries:",df["num_q"].values[0])
print("Number of relevant documents:",df["num_rel"].values[0])
print("Number of relevant documents returned:",df["num_rel_ret"].values[0])
df_prec = pd.DataFrame([
    [5,df["P_5"].values[0]],
    [10,df["P_10"].values[0]],
    [15,df["P_15"].values[0]],
    [20,df["P_20"].values[0]],
    [30,df["P_30"].values[0]],
    [100,df["P_100"].values[0]],
    [200,df["P_200"].values[0]],
    [500,df["P_500"].values[0]],
    [1000,df["P_1000"].values[0]]],columns=["@K","precision"])
df_rec = pd.DataFrame([
    [5,df["R_5"].values[0]],
    [10,df["R_10"].values[0]],
    [15,df["R_15"].values[0]],
    [20,df["R_20"].values[0]],
    [30,df["R_30"].values[0]],
    [100,df["R_100"].values[0]],
    [200,df["R_200"].values[0]],
    [500,df["R_500"].values[0]],
    [1000,df["R_1000"].values[0]]],columns=["@K","recall"])
df_f1 = pd.DataFrame([    
    [5,df["F1_5"].values[0]],
    [10,df["F1_10"].values[0]],
    [15,df["F1_15"].values[0]],
    [20,df["F1_20"].values[0]],
    [30,df["F1_30"].values[0]],
    [100,df["F1_100"].values[0]],
    [200,df["F1_200"].values[0]],
    [500,df["F1_500"].values[0]],
    [1000,df["F1_1000"].values[0]]],columns=["@K","f1"])
df_fn = pd.DataFrame([
    [5,df["TFN_5"].values[0]],
    [10,df["TFN_5"].values[0]],
    [15,df["TFN_15"].values[0]],
    [20,df["TFN_20"].values[0]],
    [30,df["TFN_30"].values[0]],
    [100,df["TFN_100"].values[0]],
    [200,df["TFN_200"].values[0]],
    [500,df["TFN_500"].values[0]],
    [1000,df["TFN_1000"].values[0]]],columns=["@K","false negative rate"])

Number of queries: 1
Number of relevant documents: 1
Number of relevant documents returned: 1


In [17]:
df_prec

Unnamed: 0,@K,precision
0,5,0.2
1,10,0.1
2,15,0.066667
3,20,0.05
4,30,0.033333
5,100,0.01
6,200,0.005
7,500,0.002
8,1000,0.001


In [18]:
df_rec

Unnamed: 0,@K,recall
0,5,1.0
1,10,1.0
2,15,1.0
3,20,1.0
4,30,1.0
5,100,1.0
6,200,1.0
7,500,1.0
8,1000,1.0


In [19]:
df_f1

Unnamed: 0,@K,f1
0,5,0.333333
1,10,0.181818
2,15,0.125
3,20,0.095238
4,30,0.064516
5,100,0.019802
6,200,0.00995
7,500,0.003992
8,1000,0.001998


In [20]:
df_fn

Unnamed: 0,@K,false negative rate
0,5,0.208333
1,10,0.208333
2,15,0.066964
3,20,0.050125
4,30,0.03337
5,100,0.010001
6,200,0.005
7,500,0.002
8,1000,0.001
