In [1]:
import pandas as pd
import numpy as np
import sklearn

## Importing evaluation results

In [2]:
# BLEU results for the chain, mmr and compress variants, read in that order.
# NOTE(review): unpickling can run arbitrary code; only load files you trust.
bleu_chain, bleu_mmr, bleu_compress = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('bleu_chain.pkl', 'bleu_mmr.pkl', 'bleu_compress.pkl')
)

# Preview the first rows of the chain results.
bleu_chain.head()

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.140787,"[0.36363636363636365, 0.23255813953488372, 0.0...",1.0,1.76,44,25
1,0.040756,"[0.3404255319148936, 0.08695652173913043, 0.04...",0.551152,0.626667,47,75
2,0.164405,"[0.3220338983050847, 0.1724137931034483, 0.122...",1.0,2.269231,59,26
3,0.030302,"[0.4716981132075472, 0.11538461538461539, 0.03...",0.374886,0.504762,53,105
4,0.489862,"[0.8333333333333334, 0.7021276595744681, 0.652...",0.701758,0.738462,48,65


In [3]:
# ROUGE results for the chain, mmr and compress variants, read in that order.
# NOTE(review): unpickling can run arbitrary code; only load files you trust.
rouge_chain, rouge_mmr, rouge_compress = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('rouge_chain.pkl', 'rouge_mmr.pkl', 'rouge_compress.pkl')
)

# Preview the first rows of the chain results.
rouge_chain.head()

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,0.53125,0.354839,0.34375,0.34375
1,0.224299,0.07619,0.186916,0.186916
2,0.43038,0.233766,0.405063,0.405063
3,0.28169,0.071429,0.183099,0.183099
4,0.72381,0.621359,0.685714,0.685714


In [4]:
# RAGAS results for the chain, mmr and compress variants, read in that order.
# NOTE(review): the mmr file is named 'score_mmr_df.pkl', unlike the other two
# ('score_<variant>.pkl') - confirm this is the intended file.
# NOTE(review): unpickling can run arbitrary code; only load files you trust.
ragas_chain, ragas_mmr, ragas_compress = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('score_chain.pkl', 'score_mmr_df.pkl', 'score_compress.pkl')
)

# Preview the first two rows of the chain results.
ragas_chain.head(2)

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness,context_recall,context_precision
0,What is biosphere?,The biosphere is a layer of the earth that inc...,[further cycles within them biogeochemical cyc...,Biosphere is one of the layers of the Earth. I...,,,,,
1,What is fossilisation?,Fossilisation is the process where sediments f...,[sediments often on the bed of a lake or sea t...,Fossilisation is the process of parts of a dea...,,,,,


## BLEU analysis

In [5]:
# Reminder of the BLEU frame's columns before combining the three variants.
bleu_chain.head(2)

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.140787,"[0.36363636363636365, 0.23255813953488372, 0.0...",1.0,1.76,44,25
1,0.040756,"[0.3404255319148936, 0.08695652173913043, 0.04...",0.551152,0.626667,47,75


In [9]:
# Side-by-side BLEU table: for every output label below, one column per variant,
# named '<label>_<variant>' and ordered label-first (Score_*, BP_*, Sys_len_*, Ref_len_*).
bleu_frames = {'chain': bleu_chain, 'mmr': bleu_mmr, 'compress': bleu_compress}
bleu_fields = {
    'Score': 'bleu',
    'BP': 'brevity_penalty',
    'Sys_len': 'translation_length',
    'Ref_len': 'reference_length',
}
bleu_res = pd.DataFrame({
    f'{label}_{variant}': frame[source_col]
    for label, source_col in bleu_fields.items()
    for variant, frame in bleu_frames.items()
})
bleu_res.head()

Unnamed: 0,Score_chain,Score_mmr,Score_compress,BP_chain,BP_mmr,BP_compress,Sys_len_chain,Sys_len_mmr,Sys_len_compress,Ref_len_chain,Ref_len_mmr,Ref_len_compress
0,0.140787,0.129873,0.152366,1.0,1.0,0.455794,44,40,14,25,25,25
1,0.040756,0.336317,0.096309,0.551152,0.778801,0.455794,47,60,42,75,75,75
2,0.164405,0.084423,0.0,1.0,1.0,1.0,59,53,33,26,26,26
3,0.030302,0.058295,0.025234,0.374886,0.346864,0.346864,53,51,51,105,105,105
4,0.489862,0.340445,0.280996,0.701758,0.851535,0.721422,48,56,49,65,65,65


In [10]:
# Check the combined BLEU table's dimensions as (rows, columns).
bleu_res.shape

(49, 12)

In [11]:
# Column-wise summary of the combined BLEU table: one row per bleu_res column,
# with its mean and its standard deviation (pandas' default, i.e. sample std).
bleu_mean = bleu_res.mean().rename('Mean')
bleu_std = bleu_res.std().rename('STD')
bleu_anal = pd.concat([bleu_mean, bleu_std], axis=1)
bleu_anal

Unnamed: 0,Mean,STD
Score_chain,0.110954,0.111375
Score_mmr,0.112083,0.102885
Score_compress,0.080092,0.12068
BP_chain,0.868799,0.235212
BP_mmr,0.842406,0.255091
BP_compress,0.770161,0.3146
Sys_len_chain,56.612245,9.160186
Sys_len_mmr,54.632653,10.084423
Sys_len_compress,45.571429,11.989579
Ref_len_chain,49.408163,34.984472


In [12]:
# Write the BLEU summary and the combined BLEU table to CSV in the working directory.
for frame, csv_path in ((bleu_anal, 'bleu_anal.csv'), (bleu_res, 'bleu_res.csv')):
    frame.to_csv(csv_path)

## ROUGE analysis

In [13]:
# Reminder of the ROUGE frame's columns before combining the three variants.
rouge_chain.head()

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,0.53125,0.354839,0.34375,0.34375
1,0.224299,0.07619,0.186916,0.186916
2,0.43038,0.233766,0.405063,0.405063
3,0.28169,0.071429,0.183099,0.183099
4,0.72381,0.621359,0.685714,0.685714


In [14]:
# Side-by-side ROUGE table: for every ROUGE metric, one column per variant,
# named '<metric>_<variant>' and ordered metric-first.
rouge_frames = {'chain': rouge_chain, 'mmr': rouge_mmr, 'compress': rouge_compress}
rouge_metrics = ('rouge1', 'rouge2', 'rougeL', 'rougeLsum')
rouge_res = pd.DataFrame({
    f'{metric}_{variant}': frame[metric]
    for metric in rouge_metrics
    for variant, frame in rouge_frames.items()
})

In [15]:
# Check the combined ROUGE table's dimensions as (rows, columns).
rouge_res.shape

(49, 12)

In [16]:
# Preview the first rows of the combined ROUGE table.
rouge_res.head()

Unnamed: 0,rouge1_chain,rouge1_mmr,rouge1_compress,rouge2_chain,rouge2_mmr,rouge2_compress,rougeL_chain,rougeL_mmr,rougeL_compress,rougeLsum_chain,rougeLsum_mmr,rougeLsum_compress
0,0.53125,0.508475,0.666667,0.354839,0.280702,0.411765,0.34375,0.372881,0.611111,0.34375,0.372881,0.611111
1,0.224299,0.545455,0.285714,0.07619,0.420168,0.174757,0.186916,0.495868,0.247619,0.186916,0.495868,0.247619
2,0.43038,0.4,0.45283,0.233766,0.164384,0.117647,0.405063,0.346667,0.339623,0.405063,0.346667,0.339623
3,0.28169,0.347826,0.241135,0.071429,0.132353,0.057554,0.183099,0.246377,0.156028,0.183099,0.246377,0.156028
4,0.72381,0.563636,0.672897,0.621359,0.407407,0.380952,0.685714,0.490909,0.429907,0.685714,0.490909,0.429907


In [17]:
# Column-wise summary of the combined ROUGE table: one row per rouge_res column,
# with its mean and its standard deviation (pandas' default, i.e. sample std).
rouge_mean = rouge_res.mean().rename('Mean')
rouge_std = rouge_res.std().rename('STD')
rouge_anal = pd.concat([rouge_mean, rouge_std], axis=1)
rouge_anal

Unnamed: 0,Mean,STD
rouge1_chain,0.404258,0.137119
rouge1_mmr,0.419072,0.131385
rouge1_compress,0.391598,0.158469
rouge2_chain,0.217993,0.143372
rouge2_mmr,0.226912,0.137827
rouge2_compress,0.18428,0.136421
rougeL_chain,0.327278,0.130972
rougeL_mmr,0.335143,0.116902
rougeL_compress,0.307005,0.133593
rougeLsum_chain,0.327687,0.131488


In [19]:
# Write the ROUGE summary and the combined ROUGE table to CSV in the working directory.
for frame, csv_path in ((rouge_anal, 'rouge_anal.csv'), (rouge_res, 'rouge_res.csv')):
    frame.to_csv(csv_path)

## RAGAS analysis

In [14]:
# Preview the first seven chain rows; the metric columns may contain NaN here.
ragas_chain.head(7)

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness,context_recall,context_precision
0,What is biosphere?,The biosphere is a layer of the earth that inc...,[further cycles within them biogeochemical cyc...,Biosphere is one of the layers of the Earth. I...,,,,,
1,What is fossilisation?,Fossilisation is the process where sediments f...,[sediments often on the bed of a lake or sea t...,Fossilisation is the process of parts of a dea...,,,,,
2,"In the context of geology and rock formaiton, ...",Degradation in geology refers to the process o...,[constituent partsdecay degradation disintegra...,Degradation the process in which a rock is bei...,,,,1.0,0.991503
3,What is metamorphism?,Metamorphism is the process where rocks underg...,[of superhot magma contact metamorphism into a...,Metamorphism is the process in which rock’s ch...,0.660353,0.833333,1.0,1.0,0.736879
4,What is chemical weathering?,Chemical weathering is the transformation of r...,[area which is where the reactions of chemical...,Chemical weathering implies transformation of ...,1.0,0.8875,0.943937,0.878734,0.8875
5,What is sporopollenin?,Sporopollenin is a biological polymer found in...,[is constructed of sporopollenin a substance w...,Sporopollenin is a substance that is both stro...,1.0,1.0,0.685957,1.0,1.0
6,What is detritus?,Detritus is pieces of waste material left afte...,[animals characterised by two pairs of antenna...,Detritus are pieces of waste material left aft...,1.0,1.0,0.746577,1.0,1.0


In [17]:
# Preview the first mmr rows; the metric columns may contain NaN here.
ragas_mmr.head()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness,context_recall,context_precision
0,What is biosphere?,The biosphere is a layer of the earth that inc...,[further cycles within them biogeochemical cyc...,Biosphere is one of the layers of the Earth. I...,,,,,
1,What is fossilisation?,Fossilisation is the process of parts of a dea...,[sediments often on the bed of a lake or sea t...,Fossilisation is the process of parts of a dea...,,,,,
2,"In the context of geology and rock formaiton, ...",Degradation in geology refers to the process o...,[constituent partsdecay degradation disintegra...,Degradation the process in which a rock is bei...,,,,1.0,0.991503
3,What is metamorphism?,Metamorphism is the process in which the chemi...,[of superhot magma contact metamorphism into a...,Metamorphism is the process in which rock’s ch...,0.666018,1.0,1.0,,0.999999
4,What is chemical weathering?,Chemical weathering occurs when the chemical s...,[area which is where the reactions of chemical...,Chemical weathering implies transformation of ...,0.673811,1.0,1.0,1.0,0.942871


In [18]:
# Preview the first compress rows; the metric columns may contain NaN here.
ragas_compress.head()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness,context_recall,context_precision
0,What is biosphere?,The biosphere is the layer of the earth that i...,"[biotic biosphere, biosphere a layer of the ea...",Biosphere is one of the layers of the Earth. I...,,,1.0,0.946266,0.614154
1,What is fossilisation?,Fossilisation is the process of preserving the...,[sediments often on the bed of a lake or sea t...,Fossilisation is the process of parts of a dea...,1.0,0.5,1.0,0.999999,0.489517
2,"In the context of geology and rock formaiton, ...",Degradation in geology refers to the process o...,[degradation the process in which a rock is be...,Degradation the process in which a rock is bei...,1.0,1.0,0.94447,0.837568,1.0
3,What is metamorphism?,Metamorphism is the process where minerals rec...,[trigger metamorphic changes through thermic a...,Metamorphism is the process in which rock’s ch...,1.0,1.0,1.0,0.684073,1.0
4,What is chemical weathering?,Chemical weathering is the process where rocks...,[chemical weathering occurs when the chemical ...,Chemical weathering implies transformation of ...,1.0,1.0,1.0,0.745603,1.0


In [25]:
# Keep only the chain rows in which every column is populated. The result is
# rebound to the same name instead of using dropna(inplace=True), so the frame
# object that was loaded is not mutated behind the reader's back.
# Caveat: a row is discarded as soon as ANY column is NaN, and the rows holding
# NaN are not the same in the chain, mmr and compress frames (compare their
# head() previews), so the statistics computed later for each variant are not
# taken over an identical set of questions.
ragas_chain = ragas_chain.dropna()

# Verify that no missing values remain in any column.
ragas_chain.isnull().sum()

question              0
answer                0
contexts              0
ground_truth          0
faithfulness          0
answer_relevancy      0
answer_correctness    0
context_recall        0
context_precision     0
dtype: int64

In [26]:
# Keep only the mmr rows in which every column is populated. The result is
# rebound to the same name instead of using dropna(inplace=True), so the frame
# object that was loaded is not mutated behind the reader's back.
# Caveat: a row is discarded as soon as ANY column is NaN, and the rows holding
# NaN are not the same in the chain, mmr and compress frames (compare their
# head() previews), so the statistics computed later for each variant are not
# taken over an identical set of questions.
ragas_mmr = ragas_mmr.dropna()

# Verify that no missing values remain in any column.
ragas_mmr.isnull().sum()

question              0
answer                0
contexts              0
ground_truth          0
faithfulness          0
answer_relevancy      0
answer_correctness    0
context_recall        0
context_precision     0
dtype: int64

In [27]:
# Keep only the compress rows in which every column is populated. The result is
# rebound to the same name instead of using dropna(inplace=True), so the frame
# object that was loaded is not mutated behind the reader's back.
# Caveat: a row is discarded as soon as ANY column is NaN, and the rows holding
# NaN are not the same in the chain, mmr and compress frames (compare their
# head() previews), so the statistics computed later for each variant are not
# taken over an identical set of questions.
ragas_compress = ragas_compress.dropna()

# Verify that no missing values remain in any column.
ragas_compress.isnull().sum()

question              0
answer                0
contexts              0
ground_truth          0
faithfulness          0
answer_relevancy      0
answer_correctness    0
context_recall        0
context_precision     0
dtype: int64

In [28]:
# Preview after dropping incomplete rows. The index is not reset, so the
# surviving rows keep their original labels.
ragas_compress.head(2)

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness,context_recall,context_precision
1,What is fossilisation?,Fossilisation is the process of preserving the...,[sediments often on the bed of a lake or sea t...,Fossilisation is the process of parts of a dea...,1.0,0.5,1.0,0.999999,0.489517
2,"In the context of geology and rock formaiton, ...",Degradation in geology refers to the process o...,[degradation the process in which a rock is be...,Degradation the process in which a rock is bei...,1.0,1.0,0.94447,0.837568,1.0


In [38]:
# One-row summary of the chain run: mean and standard deviation of each RAGAS
# metric column. Output labels are kept exactly as spelled (including
# 'Faithfullness STD') because the mmr and compress summaries use the same
# labels and the three frames are concatenated on column name in a later cell.
chain_metric_labels = (
    ('faithfulness', 'Faithfulness mean', 'Faithfullness STD'),
    ('answer_relevancy', 'AnswRel mean', 'AnswRel STD'),
    ('answer_correctness', 'AnswCorrect mean', 'AnswCorrect STD'),
    ('context_recall', 'CntxtRecall mean', 'CntxtRecall STD'),
    ('context_precision', 'CntxPrec mean', 'CntxPrec STD'),
)
chain_summary = {}
for metric_col, mean_label, std_label in chain_metric_labels:
    metric_scores = ragas_chain[metric_col]
    chain_summary[mean_label] = metric_scores.mean()
    chain_summary[std_label] = metric_scores.std()
ragas_chain_res = pd.DataFrame(chain_summary, index=[0])


In [39]:
# Show the one-row chain summary.
ragas_chain_res

Unnamed: 0,Faithfulness mean,Faithfullness STD,AnswRel mean,AnswRel STD,AnswCorrect mean,AnswCorrect STD,CntxtRecall mean,CntxtRecall STD,CntxPrec mean,CntxPrec STD
0,0.880208,0.205596,0.839049,0.264387,0.780454,0.282495,0.909717,0.164112,0.829027,0.270137


In [40]:
# One-row summary of the mmr run: mean and standard deviation of each RAGAS
# metric column. Output labels are kept exactly as spelled (including
# 'Faithfullness STD') because the chain and compress summaries use the same
# labels and the three frames are concatenated on column name in a later cell.
mmr_metric_labels = (
    ('faithfulness', 'Faithfulness mean', 'Faithfullness STD'),
    ('answer_relevancy', 'AnswRel mean', 'AnswRel STD'),
    ('answer_correctness', 'AnswCorrect mean', 'AnswCorrect STD'),
    ('context_recall', 'CntxtRecall mean', 'CntxtRecall STD'),
    ('context_precision', 'CntxPrec mean', 'CntxPrec STD'),
)
mmr_summary = {}
for metric_col, mean_label, std_label in mmr_metric_labels:
    metric_scores = ragas_mmr[metric_col]
    mmr_summary[mean_label] = metric_scores.mean()
    mmr_summary[std_label] = metric_scores.std()
ragas_mmr_res = pd.DataFrame(mmr_summary, index=[0])

In [41]:
# Show the one-row mmr summary.
ragas_mmr_res

Unnamed: 0,Faithfulness mean,Faithfullness STD,AnswRel mean,AnswRel STD,AnswCorrect mean,AnswCorrect STD,CntxtRecall mean,CntxtRecall STD,CntxPrec mean,CntxPrec STD
0,0.821676,0.251044,0.862858,0.220837,0.730812,0.329595,0.852865,0.249651,0.833903,0.262383


In [42]:
# One-row summary of the compress run: mean and standard deviation of each RAGAS
# metric column. Output labels are kept exactly as spelled (including
# 'Faithfullness STD') because the chain and mmr summaries use the same labels
# and the three frames are concatenated on column name in a later cell.
compress_metric_labels = (
    ('faithfulness', 'Faithfulness mean', 'Faithfullness STD'),
    ('answer_relevancy', 'AnswRel mean', 'AnswRel STD'),
    ('answer_correctness', 'AnswCorrect mean', 'AnswCorrect STD'),
    ('context_recall', 'CntxtRecall mean', 'CntxtRecall STD'),
    ('context_precision', 'CntxPrec mean', 'CntxPrec STD'),
)
compress_summary = {}
for metric_col, mean_label, std_label in compress_metric_labels:
    metric_scores = ragas_compress[metric_col]
    compress_summary[mean_label] = metric_scores.mean()
    compress_summary[std_label] = metric_scores.std()
ragas_compress_res = pd.DataFrame(compress_summary, index=[0])

In [43]:
# Show the one-row compress summary.
ragas_compress_res

Unnamed: 0,Faithfulness mean,Faithfullness STD,AnswRel mean,AnswRel STD,AnswCorrect mean,AnswCorrect STD,CntxtRecall mean,CntxtRecall STD,CntxPrec mean,CntxPrec STD
0,0.849638,0.326608,0.97248,0.082875,0.869314,0.154827,0.713345,0.283158,0.740714,0.412984


In [49]:
# Stack the three one-row summaries in the order chain, mmr, compress and
# renumber the rows 0..2 (each input frame carries index 0).
summary_rows = [ragas_chain_res, ragas_mmr_res, ragas_compress_res]
ragas_res = pd.concat(summary_rows, ignore_index=True)

In [51]:
# Show the stacked RAGAS summary.
ragas_res

Unnamed: 0,Faithfulness mean,Faithfullness STD,AnswRel mean,AnswRel STD,AnswCorrect mean,AnswCorrect STD,CntxtRecall mean,CntxtRecall STD,CntxPrec mean,CntxPrec STD
0,0.880208,0.205596,0.839049,0.264387,0.780454,0.282495,0.909717,0.164112,0.829027,0.270137
1,0.821676,0.251044,0.862858,0.220837,0.730812,0.329595,0.852865,0.249651,0.833903,0.262383
2,0.849638,0.326608,0.97248,0.082875,0.869314,0.154827,0.713345,0.283158,0.740714,0.412984


In [53]:
# Label the summary rows by variant. The order must match the order in which the
# per-variant frames were concatenated (chain, mmr, compress).
variant_names = ['chain', 'mmr', 'compress']
ragas_res.index = variant_names

In [54]:
# Show the RAGAS summary again.
ragas_res

Unnamed: 0,Faithfulness mean,Faithfullness STD,AnswRel mean,AnswRel STD,AnswCorrect mean,AnswCorrect STD,CntxtRecall mean,CntxtRecall STD,CntxPrec mean,CntxPrec STD
chain,0.880208,0.205596,0.839049,0.264387,0.780454,0.282495,0.909717,0.164112,0.829027,0.270137
mmr,0.821676,0.251044,0.862858,0.220837,0.730812,0.329595,0.852865,0.249651,0.833903,0.262383
compress,0.849638,0.326608,0.97248,0.082875,0.869314,0.154827,0.713345,0.283158,0.740714,0.412984


In [56]:
# Write the RAGAS summary to CSV in the working directory.
ragas_res.to_csv('ragas_res.csv')