# Utils

In [24]:
import numpy as np
import Levenshtein
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

def levenshtein_distance(text1: str, text2: str) -> int:
    """
    Compute the Levenshtein distance between two strings.

    Args:
        text1 (str): The first string.
        text2 (str): The second string. 

    Returns:
        int: The Levenshtein distance between the two imput strings. 
    """
    return Levenshtein.distance(text1, text2)


def compute_similariy(df: pd.DataFrame, suffixes: list[str], similarity_fun: callable, col_prefix: str) -> pd.DataFrame:
    for suffix in suffixes:
        col_name = f'{col_prefix}{suffix}'
        df[col_name] = df.apply(
            lambda x: similarity_fun(x['pageContent_actual'], x[f'pageContent{suffix}']), axis=1
        )

    return df

def bleu_score(text1: str, text2: str) -> int:
    """
    Compute the BLEU score between two strings.

    Args:
        text1 (str): The first string.
        text2 (str): The second string. 

    Returns:
        int: The BLEU score between the two imput strings. 
    """
    return sentence_bleu(text1, text2)

# Load consolidated data

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
chitti_complete_extractions = pd.DataFrame(
    {
        'pageNumber_actual': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'pageContent_actual': ['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu', 'vwx', 'yz', '123'],
        'pageNumber_pypdf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'pageContent_pypdf': ['AbC', 'DeF', 'GhI', 'JkL', 'MnO', 'PqR', 'StU', 'VwX', 'Yz', '123'],
        'pageNumber_pymupdf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'pageContent_pymupdf': ['aBc', 'dEf', 'gHi', 'jKl', 'mNo', 'pQr', 'sTu', 'vWx', 'yZ', '123'],
        'pageNumber_pdfminer': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'pageContent_pdfminer': ['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu', 'vwx', 'yz', '123'],
        'extractionTimeSeconds_pypdf': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'extractionTimeSeconds_pymupdf': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'extractionTimeSeconds_pdfminer': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    }
)

In [5]:
chitti_complete_extractions

Unnamed: 0,pageNumber_actual,pageContent_actual,pageNumber_pypdf,pageContent_pypdf,pageNumber_pymupdf,pageContent_pymupdf,pageNumber_pdfminer,pageContent_pdfminer,extractionTimeSeconds_pypdf,extractionTimeSeconds_pymupdf,extractionTimeSeconds_pdfminer
0,1,abc,1,AbC,1,aBc,1,abc,0.1,0.1,0.1
1,2,def,2,DeF,2,dEf,2,def,0.2,0.2,0.2
2,3,ghi,3,GhI,3,gHi,3,ghi,0.3,0.3,0.3
3,4,jkl,4,JkL,4,jKl,4,jkl,0.4,0.4,0.4
4,5,mno,5,MnO,5,mNo,5,mno,0.5,0.5,0.5
5,6,pqr,6,PqR,6,pQr,6,pqr,0.6,0.6,0.6
6,7,stu,7,StU,7,sTu,7,stu,0.7,0.7,0.7
7,8,vwx,8,VwX,8,vWx,8,vwx,0.8,0.8,0.8
8,9,yz,9,Yz,9,yZ,9,yz,0.9,0.9,0.9
9,10,123,10,123,10,123,10,123,1.0,1.0,1.0


## Accuracy Metrics

In [6]:
metrics_df = chitti_complete_extractions.copy()

In [7]:
SUFFIXES: list[str] = ['_pypdf', '_pymupdf', '_pdfminer']

### Syntactic / Structural Metrics

In [8]:
import sys
from pathlib import Path

parent_dir = Path().resolve().parent
print(parent_dir)
sys.path.append(str(parent_dir))

/Users/prateekM/Downloads/Coding/Classes/Projects/Project Chitti


In [9]:
from utils import metrics

#### Levenshtein Distance

In [10]:
compute_similariy(
    metrics_df, SUFFIXES, levenshtein_distance, 'levenshtein'
)

Unnamed: 0,pageNumber_actual,pageContent_actual,pageNumber_pypdf,pageContent_pypdf,pageNumber_pymupdf,pageContent_pymupdf,pageNumber_pdfminer,pageContent_pdfminer,extractionTimeSeconds_pypdf,extractionTimeSeconds_pymupdf,extractionTimeSeconds_pdfminer,levenshtein_pypdf,levenshtein_pymupdf,levenshtein_pdfminer
0,1,abc,1,AbC,1,aBc,1,abc,0.1,0.1,0.1,2,1,0
1,2,def,2,DeF,2,dEf,2,def,0.2,0.2,0.2,2,1,0
2,3,ghi,3,GhI,3,gHi,3,ghi,0.3,0.3,0.3,2,1,0
3,4,jkl,4,JkL,4,jKl,4,jkl,0.4,0.4,0.4,2,1,0
4,5,mno,5,MnO,5,mNo,5,mno,0.5,0.5,0.5,2,1,0
5,6,pqr,6,PqR,6,pQr,6,pqr,0.6,0.6,0.6,2,1,0
6,7,stu,7,StU,7,sTu,7,stu,0.7,0.7,0.7,2,1,0
7,8,vwx,8,VwX,8,vWx,8,vwx,0.8,0.8,0.8,2,1,0
8,9,yz,9,Yz,9,yZ,9,yz,0.9,0.9,0.9,1,1,0
9,10,123,10,123,10,123,10,123,1.0,1.0,1.0,0,0,0


In [25]:
compute_similariy(
    metrics_df, SUFFIXES, bleu_score, 'bleu'
)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,pageNumber_actual,pageContent_actual,pageNumber_pypdf,pageContent_pypdf,pageNumber_pymupdf,pageContent_pymupdf,pageNumber_pdfminer,pageContent_pdfminer,extractionTimeSeconds_pypdf,extractionTimeSeconds_pymupdf,extractionTimeSeconds_pdfminer,levenshtein_pypdf,levenshtein_pymupdf,levenshtein_pdfminer,bleu_pypdf,bleu_pymupdf,bleu_pdfminer
0,1,abc,1,AbC,1,aBc,1,abc,0.1,0.1,0.1,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
1,2,def,2,DeF,2,dEf,2,def,0.2,0.2,0.2,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
2,3,ghi,3,GhI,3,gHi,3,ghi,0.3,0.3,0.3,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
3,4,jkl,4,JkL,4,jKl,4,jkl,0.4,0.4,0.4,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
4,5,mno,5,MnO,5,mNo,5,mno,0.5,0.5,0.5,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
5,6,pqr,6,PqR,6,pQr,6,pqr,0.6,0.6,0.6,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
6,7,stu,7,StU,7,sTu,7,stu,0.7,0.7,0.7,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
7,8,vwx,8,VwX,8,vWx,8,vwx,0.8,0.8,0.8,2,1,0,1.384293e-231,1.646211e-231,1.821832e-231
8,9,yz,9,Yz,9,yZ,9,yz,0.9,0.9,0.9,1,1,0,1.531972e-231,1.531972e-231,1.821832e-231
9,10,123,10,123,10,123,10,123,1.0,1.0,1.0,0,0,0,1.821832e-231,1.821832e-231,1.821832e-231


In [26]:
sentence_bleu?

[0;31mSignature:[0m
[0msentence_bleu[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mreferences[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhypothesis[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweights[0m[0;34m=[0m[0;34m([0m[0;36m0.25[0m[0;34m,[0m [0;36m0.25[0m[0;34m,[0m [0;36m0.25[0m[0;34m,[0m [0;36m0.25[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msmoothing_function[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mauto_reweigh[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Calculate BLEU score (Bilingual Evaluation Understudy) from
Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
"BLEU: a method for automatic evaluation of machine translation."
In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf

>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
...               'ensures', 'that', 'the', '