# In [1]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
#
# For each document listed in doc_id_len_sorted.json, the document's content is read
# and the mentions that belong to it are fetched from the database.
#
# The content is split into sentences with a spaCy pipeline, and for each mention the
# sentence it belongs to is selected. At the same time, the index at which each mention
# starts within the selected sentence is calculated.
#
# The generated DataFrame is appended to gpt_dataset_<no.>.json
#

import mysql.connector as mysql
import pandas as pd
import spacy
import json
import numpy as np
import warnings

# Output file of the current (6th) batch; the file is JSON Lines despite the .json suffix.
_DEFAULT_OUTPUT_PATH = r'C:\Users\xxxxx\Desktop\interconnection_legislative_documents\01_phrase_selection\phrasing001\gpt_dataset_6.json'


def _sentence_bounds(doc_nlp):
    """Return the character offsets of every sentence of a parsed document.

    Args:
        doc_nlp: Parsed spaCy document; each item of ``doc_nlp.sents`` must
            expose ``start_char`` and ``end_char``.

    Returns:
        A ``(starts, ends)`` pair of int64 NumPy arrays, one entry per
        sentence. ``ends`` are exclusive offsets.
    """
    bounds = [(sent.start_char, sent.end_char) for sent in doc_nlp.sents]
    starts = np.array([start for start, _ in bounds], dtype=np.int64)
    ends = np.array([end for _, end in bounds], dtype=np.int64)
    return starts, ends


def _mention_contexts(text, sent_starts, sent_ends, mention_starts, mention_ends):
    """Select the sentence span surrounding each mention.

    Args:
        text: Full document text the offsets refer to.
        sent_starts: Ascending sentence start offsets (non-empty).
        sent_ends: Ascending exclusive sentence end offsets (same length).
        mention_starts: Start offset of every mention in ``text``.
        mention_ends: End offset of every mention in ``text``.

    Returns:
        A ``(phrases, offsets)`` pair: ``phrases[i]`` is the text running from
        the start of the sentence containing the i-th mention's start to the
        end of the sentence containing its end, and ``offsets[i]`` is the
        mention's start index inside ``phrases[i]``.
    """
    last_sentence = len(sent_starts) - 1

    # Last sentence whose start is <= the mention start (clamped for negative offsets).
    first_idx = np.searchsorted(sent_starts, mention_starts, side='right') - 1
    first_idx = np.clip(first_idx, 0, last_sentence)

    # First sentence whose end is >= the mention end (clamped for mentions that
    # run past the final sentence instead of raising).
    last_idx = np.searchsorted(sent_ends, mention_ends, side='left')
    last_idx = np.clip(last_idx, 0, last_sentence)

    # A zero-length or malformed mention must still yield at least its own sentence.
    last_idx = np.maximum(last_idx, first_idx)

    # Slice the original text so the in-phrase offset stays exact even when the
    # span covers several sentences separated by arbitrary whitespace.
    phrases = [
        text[sent_starts[first]:sent_ends[last]]
        for first, last in zip(first_idx, last_idx)
    ]
    offsets = np.asarray(mention_starts) - sent_starts[first_idx]
    return phrases, offsets


def main(batch_start: int = 50000, batch_stop: int = 60000,
         output_path: str = _DEFAULT_OUTPUT_PATH) -> None:
    """Build the phrase dataset for one batch of documents.

    Documents listed in ``doc_id_len_sorted.json`` (doc_id -> length) with a
    length of at most 1,000,000 are sorted by length and the slice
    ``[batch_start:batch_stop]`` is processed. For every document its row and
    its mentions are read from the database, the file referenced by the
    ``doc_content`` column is split into sentences with spaCy, and each
    mention is paired with its surrounding sentence text. The enriched mention
    rows are appended to ``output_path`` as JSON Lines.

    Sentence boundaries are taken from spaCy's own character offsets; summing
    sentence lengths instead drifts whenever two sentences are not separated
    by exactly one character.

    The defaults reproduce the 6th run. Runs 1-5 used the slices [0:10000],
    [10000:20000], [20000:30000], [30000:40000] and [40000:50000].

    Args:
        batch_start: Index of the first document of the batch (inclusive).
        batch_stop: Index one past the last document of the batch.
        output_path: File the JSON Lines records are appended to.
    """
    # Establish a single connection to the database.
    # NOTE(review): credentials are hard-coded; consider loading them from the environment.
    mydb = mysql.connect(
        host="127.0.0.1",
        user="root",
        passwd="xxxxx",
        database="preprocessed"
    )

    try:
        nlp = spacy.load("ro_core_news_lg")
        nlp.max_length = 1500000

        with open('doc_id_len_sorted.json', 'r') as file:
            doc_lengths = json.load(file)

        # Keep only documents short enough to parse, shortest first, then cut the batch.
        eligible = [item for item in doc_lengths.items() if item[1] <= 1000000]
        eligible.sort(key=lambda item: item[1])
        batch = eligible[batch_start:batch_stop]

        for doc_id, _ in batch:
            # Bind the id as a numeric query parameter instead of formatting it into the SQL.
            doc_key = int(doc_id)

            # Document with the doc_id
            df_document = pd.read_sql_query(
                "SELECT * FROM unique_document WHERE doc_id = %s",
                con=mydb,
                params=(doc_key,),
            )
            if df_document.empty:
                print(f"Doc_id: {doc_id} is empty")
                continue

            # Mentions from doc_id document
            df_mention = pd.read_sql_query(
                "SELECT * FROM unique_mention WHERE doc_id = %s",
                con=mydb,
                params=(doc_key,),
            )
            if df_mention.empty:
                print(f"Mentions from doc_id: {doc_id} document are empty")
                continue

            # The doc_content column holds the path of the file with the document text.
            doc_path = df_document.at[0, 'doc_content']
            with open(doc_path, 'r', encoding='utf-8') as file:
                doc_content = file.read()

            # Split the document into sentences.
            doc_nlp = nlp(doc_content)
            sent_starts, sent_ends = _sentence_bounds(doc_nlp)
            if len(sent_starts) == 0:
                print(f"Doc_id: {doc_id} has no sentences")
                continue

            # Determine the sentence span and in-phrase offset of each mention.
            mention_starts = df_mention['men_start'].values
            mention_ends = df_mention['men_end'].values
            phrases, offsets = _mention_contexts(
                doc_nlp.text, sent_starts, sent_ends, mention_starts, mention_ends
            )

            df_mention['men_start_in_phrase'] = offsets
            df_mention['phrase'] = phrases
            df_mention['doc_title'] = df_document.at[0, 'doc_title']
            df_mention['doc_url'] = df_document.at[0, 'doc_url']
            df_mention['doc_content'] = doc_path

            df_mention = df_mention.rename(
                columns={'men_start': 'men_start_in_doc', 'men_end': 'men_end_in_doc'}
            )

            # Convert DataFrame to JSON Lines and append to the output file.
            json_str = df_mention.to_json(orient='records', lines=True, force_ascii=False)
            # Guard against a missing trailing newline so that the records of two
            # consecutive documents can never end up fused on one line.
            if json_str and not json_str.endswith('\n'):
                json_str += '\n'
            with open(output_path, 'a', encoding='utf-8') as file:
                file.write(json_str)
    finally:
        # Release the database connection even if a document fails mid-batch.
        mydb.close()

if __name__ == "__main__":
    # Script entry point: silence every warning category for the whole run,
    # then process the batch.
    warnings.simplefilter('ignore')
    main()


# Notebook cell output kept for reference (appears to be sample doc_id -> length pairs; verify):
# {'31855': 148, '18361': 226, '24784': 227, '26803': 234, '12778': 241, '24396': 242, '7403': 245, '24778': 249, '27924': 250, '34309': 252}