In [6]:
import os
import yaml
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

In [7]:
with open('../environment.yaml', 'r') as f:
    env = yaml.safe_load(f)
    os.environ.update(env)

In [2]:
def get_doc_loaders(*args:str):
    doc_loaders = []
    for file_name in args:
        doc_loaders.append(PyPDFLoader(file_name))
    return doc_loaders

In [117]:
doc_loaders = get_doc_loaders('../data/ECE-100_03.pdf', '../data/ECE-100_02 Amend 4.pdf')

In [126]:
doc_loaders[0].load_and_split()

[Document(page_content='COUNTRY :E.C.E.  © InterRegs Ltd 2021 \n \nORIGINAL :UNITED NATIONS of April 11, 1997 \n E/ECE/324 ) Rev.2/Add.99/Rev.2/Amend. 5\nE/ECE/TRANS/505 )  \nJuly 2, 2021  \n \n  \n \n \n  \n \n  \nSTATUS OF UNITED NATIONS REGULATION \n  \nECE 100-0 3 \n  \nUNIFORM PROVISIONS CONCERNING THE APPROVAL OF: \n \n \nVEHICLES WITH REGARD TO SPECIFIC \nREQUIREMENTS FOR THE ELECTRIC POWER TRAIN \n \n Incorporating: \n Supplement 1 to the 01 series of amendments Date of Entry into Force: 26.07.12 \nSupplement 2 to the 01 series of amendments Date of Entry into Force: 15.07.13 \n02 series of amendments to the Regu lation Date of Entry into Force: 15.07.13 \nSupplement 1 to the 02 series of amendments Date of Entry into Force: 10.06.14 \nSupplement 2 to the 02 series of amendments Date of Entry into Force: 29.01.16 \nSupplement 3 to the 02 series of amendments Date of Entry into Force: 18.06.16 \nSupplement 4 to the 02 series of amendments Date of Entry into Force: 28.05.19 \n03 

In [128]:
def get_faiss_embeddings(doc_loaders):
    faiss_index_lis = []
    for doc_loader in doc_loaders:
        faiss_index_lis.append(FAISS.from_documents(doc_loader.load_and_split(), OpenAIEmbeddings()))
    return faiss_index_lis

In [130]:
# help(doc_loaders[0].load_and_split)

Help on method load_and_split in module langchain.document_loaders.base:

load_and_split(text_splitter: Optional[langchain.text_splitter.TextSplitter] = None) -> List[langchain.schema.document.Document] method of langchain.document_loaders.pdf.PyPDFLoader instance
    Load Documents and split into chunks. Chunks are returned as Documents.
    
    Args:
        text_splitter: TextSplitter instance to use for splitting documents.
          Defaults to RecursiveCharacterTextSplitter.
    
    Returns:
        List of Documents.



In [16]:
faiss_index_lis = get_faiss_embeddings(doc_loaders)

In [31]:
pages_1, pages_2 = [x.page_content for x in doc_loaders[0].load_and_split()], [x.page_content for x in doc_loaders[1].load_and_split()]

In [34]:
embeddings = OpenAIEmbeddings()
vectors_1, vectors_2 = embeddings.embed_documents(pages_1), embeddings.embed_documents(pages_2)

In [36]:
import numpy as np

arr_1, arr_2 = np.array(vectors_1), np.array(vectors_2)

In [45]:
len(arr_2)

85

In [62]:
res_index = np.argmax(np.dot(arr_1, arr_2.T), axis=1)

In [63]:
import pandas as pd
res = {
    'doc 1':  pages_1,
    'doc 2':  [pages_2[x] for x in res_index]
}

coherrent_pages = pd.DataFrame(res)


In [58]:
print(coherrent_pages.iloc[25, 0])
print(coherrent_pages.iloc[25, 1])

COUNTRY :E.C.E.  © InterRegs Ltd 2021 
 
ORIGINAL :UNITED NATIONS of April 11, 1997 
 
 
 
 
 TITLE: Approval of Vehicles  UN Regulation No. 100-03 
 with Electric Power Train  Jul/2021  PAGE: 25 
 6.6. External Short Circuit Protection 
 6.6.1.  The test shall be conducted in accordance with Annex 9F of this Regulation. 
 6.6.2. Acceptance Criteria;  
6.6.2.1. During the test t here shall be no evidence of: 
 
(a) Electrolyte leakage;  
(b) Rupture (applicable to h igh voltage REESS(s) only); 
 
(c) Venting (for REESS other than open-type traction battery);  
 
(d) Fire; 
 (e) Explosion. 
 
The evidence of electrolyte leakage shall be verified by visual ins pection without 
disassembling any part of the Tested-Device.  An appropriate technique shall, if necessary, 
be used in order to confirm if t here is any electrolyte leakage  from the REESS resulting from 
the test. The evidence of venti ng shall be verified by visual inspection without disassembling 
any part of the Tested-Device

In [132]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage

In [144]:
import asyncio
import nest_asyncio
import random

nest_asyncio.apply()
chat = ChatOpenAI(model='gpt-4')

results = []
async def async_get_compare_message(str1: str, str2: str, page_index:int, sleep_time: int=300):
    messages = [
        SystemMessage(
            content="You are a helpful assistant help the third party testing field workers to find out the key differences between two page from two almost same pdf file while different version. If you think this two sentence is mostly same(same in char or talk about mainly one thing), try to find out the key differences between them.  If not, Just return 【No】. Please always return in brief and concisely, your every sentence cost huge. Your Input will be 【page_content_1】 and 【page_content_2】"
        ),
        HumanMessage(
            content=f"【page_content_1】:\n{str1}\n【page_content_2】:\n{str2}"
        ),
        ]
    sleep_time = await asyncio.sleep(random.randint(0, 300))
    predict_messages = await chat.apredict_messages(messages)
    print(f'Successfully finished the original page_index for page {page_index} for doc 1.')
    results.append((str1, str2, predict_messages.content))
    return  predict_messages

In [143]:
import time

def get_compare_message(str1: str, str2: str, page_index:int):
    messages = [
        SystemMessage(
            content="You are a helpful assistant help the third party testing field workers to find out the key differences between two page from two almost same pdf file while different version. Your Input will be 【page_content_1】 and 【page_content_2】, They may If you think this two sentence is mostly same(same in char or talk about mainly one thing), try to find out the key differences between them.  If not, Just return 【No】. Please always return in brief and concisely, your every sentence cost huge. "
        ),
        HumanMessage(
            content=f"【page_content_1】:\n{str1}\n【page_content_2】:\n{str2}"
        ),
        ]
    predict_messages = chat.predict_messages(messages)
    print(f'Successfully finished the original page_index for page {page_index} for doc 1.')
    results.append((str1, str2, predict_messages.content))
    return  predict_messages

In [146]:
async def async_main(coherrent_pages:pd.DataFrame):
    tasks = []
    for i in range(len(coherrent_pages)):
        tasks.append(asyncio.create_task(async_get_compare_message(coherrent_pages.iloc[i, 0], coherrent_pages.iloc[i, 1], i)))
    await asyncio.gather(*tasks)

def main(coherrent_pages:pd.DataFrame):
    for i in range(len(coherrent_pages)):
        get_compare_message(coherrent_pages.iloc[i, 0], coherrent_pages.iloc[i, 1], i)

In [147]:
for i in range(0, len(coherrent_pages), 10):
    asyncio.run(async_main(coherrent_pages[i:i+10]))

Successfully finished the original page_index for page 1 for doc 1.
Successfully finished the original page_index for page 6 for doc 1.
Successfully finished the original page_index for page 8 for doc 1.
Successfully finished the original page_index for page 7 for doc 1.
Successfully finished the original page_index for page 2 for doc 1.
Successfully finished the original page_index for page 9 for doc 1.
Successfully finished the original page_index for page 3 for doc 1.
Successfully finished the original page_index for page 5 for doc 1.
Successfully finished the original page_index for page 0 for doc 1.
Successfully finished the original page_index for page 4 for doc 1.
Successfully finished the original page_index for page 4 for doc 1.
Successfully finished the original page_index for page 0 for doc 1.
Successfully finished the original page_index for page 6 for doc 1.
Successfully finished the original page_index for page 5 for doc 1.
Successfully finished the original page_index fo

Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: bad response status code 307 {"error":{"message":"bad response status code 307","type":"one_api_error","param":"307","code":"bad_response_status_code"}} 307 {'error': {'message': 'bad response status code 307', 'type': 'one_api_error', 'param': '307', 'code': 'bad_response_status_code'}} <CIMultiDictProxy('Server': 'nginx/1.18.0', 'Date': 'Wed, 13 Sep 2023 03:48:21 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '123', 'Connection': 'keep-alive')>.


Successfully finished the original page_index for page 2 for doc 1.
Successfully finished the original page_index for page 4 for doc 1.
Successfully finished the original page_index for page 0 for doc 1.
Successfully finished the original page_index for page 5 for doc 1.
Successfully finished the original page_index for page 6 for doc 1.
Successfully finished the original page_index for page 3 for doc 1.
Successfully finished the original page_index for page 9 for doc 1.
Successfully finished the original page_index for page 8 for doc 1.
Successfully finished the original page_index for page 1 for doc 1.
Successfully finished the original page_index for page 0 for doc 1.
Successfully finished the original page_index for page 9 for doc 1.
Successfully finished the original page_index for page 6 for doc 1.
Successfully finished the original page_index for page 7 for doc 1.
Successfully finished the original page_index for page 8 for doc 1.
Successfully finished the original page_index fo

In [148]:
results

[('COUNTRY :E.C.E.  © InterRegs Ltd 2021 \n \nORIGINAL :UNITED NATIONS of April 11, 1997 \n E/ECE/324 ) Rev.2/Add.99/Rev.2/Amend. 5\nE/ECE/TRANS/505 )  \nJuly 2, 2021  \n \n  \n \n \n  \n \n  \nSTATUS OF UNITED NATIONS REGULATION \n  \nECE 100-0 3 \n  \nUNIFORM PROVISIONS CONCERNING THE APPROVAL OF: \n \n \nVEHICLES WITH REGARD TO SPECIFIC \nREQUIREMENTS FOR THE ELECTRIC POWER TRAIN \n \n Incorporating: \n Supplement 1 to the 01 series of amendments Date of Entry into Force: 26.07.12 \nSupplement 2 to the 01 series of amendments Date of Entry into Force: 15.07.13 \n02 series of amendments to the Regu lation Date of Entry into Force: 15.07.13 \nSupplement 1 to the 02 series of amendments Date of Entry into Force: 10.06.14 \nSupplement 2 to the 02 series of amendments Date of Entry into Force: 29.01.16 \nSupplement 3 to the 02 series of amendments Date of Entry into Force: 18.06.16 \nSupplement 4 to the 02 series of amendments Date of Entry into Force: 28.05.19 \n03 series of amendments 

In [103]:
# asyncio.run(main(coherrent_pages))
main(coherrent_pages)

Successfully finished the original page_index for page 0 for doc 1.
Successfully finished the original page_index for page 1 for doc 1.
Successfully finished the original page_index for page 2 for doc 1.
Successfully finished the original page_index for page 3 for doc 1.
Successfully finished the original page_index for page 4 for doc 1.
Successfully finished the original page_index for page 5 for doc 1.
Successfully finished the original page_index for page 6 for doc 1.
Successfully finished the original page_index for page 7 for doc 1.
Successfully finished the original page_index for page 8 for doc 1.
Successfully finished the original page_index for page 9 for doc 1.
Successfully finished the original page_index for page 10 for doc 1.
Successfully finished the original page_index for page 11 for doc 1.
Successfully finished the original page_index for page 12 for doc 1.
Successfully finished the original page_index for page 13 for doc 1.
Successfully finished the original page_inde

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: Post "http://193.221.95.28:4000/v1/chat/completions": EOF {"error":{"message":"Post \"http://193.221.95.28:4000/v1/chat/completions\": EOF","type":"one_api_error","param":"","code":"do_request_failed"}} 500 {'error': {'message': 'Post "http://193.221.95.28:4000/v1/chat/completions": EOF', 'type': 'one_api_error', 'param': '', 'code': 'do_request_failed'}} {'Server': 'nginx/1.18.0', 'Date': 'Tue, 12 Sep 2023 11:49:23 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '144', 'Connection': 'keep-alive'}.
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised APIError: Post "http://193.221.95.28:4000/v1/chat/completions": dial tcp 193.221.95.28:4000: connect: connection refused {"error":{"message":"Post \"http://193.221.95.28:4000/v1/chat/completions\":

Successfully finished the original page_index for page 27 for doc 1.
Successfully finished the original page_index for page 28 for doc 1.
Successfully finished the original page_index for page 29 for doc 1.
Successfully finished the original page_index for page 30 for doc 1.
Successfully finished the original page_index for page 31 for doc 1.
Successfully finished the original page_index for page 32 for doc 1.
Successfully finished the original page_index for page 33 for doc 1.
Successfully finished the original page_index for page 34 for doc 1.
Successfully finished the original page_index for page 35 for doc 1.
Successfully finished the original page_index for page 36 for doc 1.


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


Successfully finished the original page_index for page 37 for doc 1.
Successfully finished the original page_index for page 38 for doc 1.
Successfully finished the original page_index for page 39 for doc 1.
Successfully finished the original page_index for page 40 for doc 1.
Successfully finished the original page_index for page 41 for doc 1.
Successfully finished the original page_index for page 42 for doc 1.
Successfully finished the original page_index for page 43 for doc 1.


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: Post "http://193.221.95.28:4000/v1/chat/completions": EOF {"error":{"message":"Post \"http://193.221.95.28:4000/v1/chat/completions\": EOF","type":"one_api_error","param":"","code":"do_request_failed"}} 500 {'error': {'message': 'Post "http://193.221.95.28:4000/v1/chat/completions": EOF', 'type': 'one_api_error', 'param': '', 'code': 'do_request_failed'}} {'Server': 'nginx/1.18.0', 'Date': 'Tue, 12 Sep 2023 11:57:10 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '144', 'Connection': 'keep-alive'}.
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised APIError: bad response status code 307 {"error":{"message":"bad response status code 307","type":"one_api_error","param":"307","code":"bad_response_status_code"}} 307 {'error': {'message': 'bad res

Successfully finished the original page_index for page 44 for doc 1.
Successfully finished the original page_index for page 45 for doc 1.
Successfully finished the original page_index for page 46 for doc 1.
Successfully finished the original page_index for page 47 for doc 1.
Successfully finished the original page_index for page 48 for doc 1.
Successfully finished the original page_index for page 49 for doc 1.
Successfully finished the original page_index for page 50 for doc 1.
Successfully finished the original page_index for page 51 for doc 1.
Successfully finished the original page_index for page 52 for doc 1.
Successfully finished the original page_index for page 53 for doc 1.
Successfully finished the original page_index for page 54 for doc 1.
Successfully finished the original page_index for page 55 for doc 1.
Successfully finished the original page_index for page 56 for doc 1.
Successfully finished the original page_index for page 57 for doc 1.
Successfully finished the original

In [104]:
results

[('COUNTRY :E.C.E.  © InterRegs Ltd 2021 \n \nORIGINAL :UNITED NATIONS of April 11, 1997 \n E/ECE/324 ) Rev.2/Add.99/Rev.2/Amend. 5\nE/ECE/TRANS/505 )  \nJuly 2, 2021  \n \n  \n \n \n  \n \n  \nSTATUS OF UNITED NATIONS REGULATION \n  \nECE 100-0 3 \n  \nUNIFORM PROVISIONS CONCERNING THE APPROVAL OF: \n \n \nVEHICLES WITH REGARD TO SPECIFIC \nREQUIREMENTS FOR THE ELECTRIC POWER TRAIN \n \n Incorporating: \n Supplement 1 to the 01 series of amendments Date of Entry into Force: 26.07.12 \nSupplement 2 to the 01 series of amendments Date of Entry into Force: 15.07.13 \n02 series of amendments to the Regu lation Date of Entry into Force: 15.07.13 \nSupplement 1 to the 02 series of amendments Date of Entry into Force: 10.06.14 \nSupplement 2 to the 02 series of amendments Date of Entry into Force: 29.01.16 \nSupplement 3 to the 02 series of amendments Date of Entry into Force: 18.06.16 \nSupplement 4 to the 02 series of amendments Date of Entry into Force: 28.05.19 \n03 series of amendments 

In [192]:
with open('./res.md', 'w') as f:
    for ind, (str1, str2, res) in enumerate(results):
        f.write(f'## 第 {ind + 1} 条样例\n\n')
        f.write(f'### 第 {ind + 1} 条 ECE-100 03 原文\n\n')
        f.write(str1 + '\n\n')
        f.write(f'### 第 {ind + 1} 条 ECE-100 02 Amend 4匹配原文\n\n')
        f.write(str2 + '\n\n')
        f.write(f'### 第 {ind + 1} 条 AI 差异分析\n\n')        
        f.write(res + '\n\n')

In [164]:
pages1_col = coherrent_pages.iloc[..., 0].to_list()
pages2_col = [x.page_content for x in doc_loaders[1].load_and_split()]
# doc_loaders[1].load_and_split()

In [166]:
res_1_col = [pages1_col.index(item) for item in [x[0] for x in results]]
res_2_col = [pages2_col.index(item) for item in [x[1] for x in results]]

In [171]:
col_item = {ind: (col1, col2, diff) for ind, (col1, col2, diff) in enumerate(zip(res_1_col, res_2_col, [x[2] for x in results]))}

In [173]:
import json
json.dump(col_item, open('../data/res_show_in_ui.json', 'w'), ensure_ascii=False, indent=4)

In [151]:
pd.DataFrame(results)

Unnamed: 0,0,1,2
0,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,1. The copyright year is different: 2021 in pa...
1,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,1. The copyright year is different: 2021 in pa...
2,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,1. Different versions: 2021 vs 2019. \n2. Diff...
3,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,The key differences between the two pages are:...
4,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,1. The versions and dates of the documents are...
...,...,...,...
102,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,1. The document versions are different: 2021 v...
103,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,【page_content_1】:\n- Focuses on overcurrent du...
104,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,1. The document version and date: 100-03 Jul/2...
105,COUNTRY :E.C.E. © InterRegs Ltd 2021 \n \nORI...,COUNTRY :E.C.E. © InterRegs Ltd 2019 \n \nORI...,The key differences are:\n\n1. Document versio...


In [197]:
import json

res_show_in_ui = json.load(open('../show_data/res_show_in_ui.json', 'r'))

In [199]:
new_res = {str(item[0]): item for item in res_show_in_ui.values()}

In [202]:
json.dump(new_res, open('../show_data/res_show_in_ui.json', 'w'), indent = 4)

In [194]:
from pdf2image import convert_from_path
from PIL import Image

doc_images_1 = convert_from_path('../show_data/ECE-100_03.pdf')
doc_images_2 = convert_from_path('../show_data/ECE-100_02 Amend 4.pdf')

In [195]:
for ind, image in enumerate(doc_images_1):
    image.save(f'../show_data/ECE-100_03/page_{ind + 1}.png', 'png')
    
for ind, image in enumerate(doc_images_2):
    image.save(f'../show_data/ECE-100_02/page_{ind + 1}.png', 'png')