In [2]:
import json
import os
import re
os.chdir('../')


In [3]:
json_file = 'data/Dupin__V1_2_782756e5-c517-4dab-a954-a0e4a878beb4.json'
with open(json_file, 'r') as file:
    data = json.load(file)

In [4]:
from helpers import process_json

In [5]:
full_text = process_json(data)

In [6]:
with open('data/dupin_full_text.txt', 'w') as file:
    file.write(full_text)


In [7]:


def extract_headers_and_content(text):
    # Split the text into lines
    lines = text.strip().split('\n')
    headers = []
    content = {}
    current_header = None
    current_content = []

    for line in lines:
        line = line.strip()

        if not line:
            continue  # Skip empty lines

        # Try to match header pattern 1: Lines that start with uppercase letters (headers may be in uppercase)
        match1 = re.match(r'^([A-Z ]{2,})(.*)$', line)
        # Try to match header pattern 2: Lines that start with numbers and periods
        match2 = re.match(r'^(\d+(\.\d+)*\s+)(.*)$', line)

        if match1:
            header = match1.group(1).strip()
            rest = match1.group(2).strip()

            if current_header:
                # Store the previous header and its content
                content[current_header] = '\n'.join(current_content).strip()
                headers.append(current_header)

            current_header = header
            current_content = []
            if rest:
                current_content.append(rest)

        elif match2:
            header = line.strip()
            if current_header:
                # Store the previous header and its content
                content[current_header] = '\n'.join(current_content).strip()
                headers.append(current_header)

            current_header = header
            current_content = []
        else:
            if current_header:
                current_content.append(line)

    # Store the last header and its content
    if current_header and current_content:
        content[current_header] = '\n'.join(current_content).strip()
        headers.append(current_header)
    elif current_header and current_header not in content:
        content[current_header] = ''
        headers.append(current_header)

    return content

In [8]:
extract_headers_and_content(full_text)

{'ABSTRACT D': 'etecting fraudulent activities in financial and e-commerce transac-\ntion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with perfor

In [11]:
def parse_document(text):
    # Initialize variables
    sections = {}
    current_section = None
    current_content = []

    # Split text into lines
    lines = text.split('\n')

    # Define heading patterns
    numbered_heading_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.*)$')
    # Known headings
    known_headings = {
        "Abstract",
        "Title",
        "Introduction",
        "Background",
        "Related Work",
        "Evaluation",
        "Figure",
        "Table",
        "Motivation",
        "Technical Section",
        "Conclusion",
        "References",
        "Appendix",
        # "Format",
        # "Acknowledgements",
        # "Methodology",
        # "Results",
        # "Discussion",
    }
    known_headings_lower = [h.lower() for h in known_headings]

    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Check for numbered heading
        m = numbered_heading_pattern.match(line)
        if m:
            # Save current section
            if current_section is not None:
                content = ' '.join(current_content).strip()
                if content:
                    sections[current_section] = content
            # Start new section
            section_number = m.group(1)
            section_title = m.group(2).strip()
            current_section = f"{section_number} {section_title}"
            current_content = []
            continue

        # Check for unnumbered heading (known headings)
        if line.lower() in known_headings_lower:
            # Save current section
            if current_section is not None:
                content = ' '.join(current_content).strip()
                if content:
                    sections[current_section] = content
            # Start new section
            current_section = line.strip()
            current_content = []
            continue

        # Append line to current content
        if current_section is None:
            # If no current section, start with 'Abstract'
            current_section = 'Abstract'
        current_content.append(line)

    # Save the last section
    if current_section is not None:
        content = ' '.join(current_content).strip()
        if content:
            sections[current_section] = content
    return sections

In [12]:
parse_document(full_text)

{'Abstract': 'ABSTRACT Detecting fraudulent activities in financial and e-commerce transac- tion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with

In [14]:
def parse_document(text):
    # Initialize variables
    sections = {}
    current_section = None
    current_content = []

    # Split text into lines
    lines = text.split('\n')

    # Define heading patterns
    numbered_heading_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.*)$')
    # Known headings
    known_headings = {'Abstract', 'Acknowledgements', 'References', 'Conclusion', 'Introduction', 'Background', 'Related Work', 'Methodology', 'Results', 'Discussion', 'Appendix'}
    known_headings_lower = [h.lower() for h in known_headings]

    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Check for numbered heading
        m = numbered_heading_pattern.match(line)
        if m:
            # Save current section
            if current_section is not None:
                content = ' '.join(current_content).strip()
                # Save the section even if content is empty
                sections[current_section] = content
            # Start new section
            section_number = m.group(1)
            section_title = m.group(2).strip()
            current_section = f"{section_number} {section_title}"
            current_content = []
            continue

        # Check for unnumbered heading (known headings)
        if line.lower() in known_headings_lower:
            # Save current section
            if current_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_section] = content
            # Start new section
            current_section = line.strip()
            current_content = []
            continue

        # Append line to current content
        if current_section is None:
            # If no current section, start with 'Abstract'
            current_section = 'Abstract'
        current_content.append(line)

    # Save the last section
    if current_section is not None:
        content = ' '.join(current_content).strip()
        sections[current_section] = content
    return sections


In [16]:
with open('data/dupin_full_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()


In [19]:
parse_document(full_text)

{'Abstract': 'ABSTRACT Detecting fraudulent activities in financial and e-commerce transac- tion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with

In [38]:
import re

def parse_document(text):
    # Initialize variables
    sections = {}
    current_main_section = None
    current_content = []

    # Split text into lines
    lines = text.split('\n')

    # Define heading patterns
    main_heading_pattern = re.compile(r'^(\d+)\s+(.*)$')  # Matches headings like '1 INTRODUCTION'
    sub_heading_pattern = re.compile(r'^(\d+\.\d+)\s+(.*)$')  # Matches headings like '2.1 Preliminary'
    # Known headings (case-insensitive)
    known_headings = {'Abstract', 'Acknowledgements', 'References', 'Conclusion', 'Introduction', 'Background', 'Related Work', 'Methodology', 'Results', 'Discussion', 'Appendix'}
    known_headings_lower = [h.lower() for h in known_headings]

    for idx, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Check for main numbered heading
        m_main = main_heading_pattern.match(line)
        if m_main and '.' not in m_main.group(1):
            # Save current section
            if current_main_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_main_section] = content
            # Start new main section
            section_number = m_main.group(1)
            section_title = m_main.group(2).strip()
            current_main_section = f"{section_number} {section_title}"
            current_content = []
            continue

        # Check for unnumbered heading (known headings)
        if line.lower() in known_headings_lower:
            # Save current section
            if current_main_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_main_section] = content
            # Start new main section
            current_main_section = line.strip()
            current_content = []
            continue

        # For subheadings and other content
        # We include subheadings in the content of the current main section
        # So we can check for subheadings and append them to content
        m_sub = sub_heading_pattern.match(line)
        if m_sub:
            # It's a subheading
            sub_section_number = m_sub.group(1)
            sub_section_title = m_sub.group(2).strip()
            current_content.append(f"{sub_section_number} {sub_section_title}")
            continue

        # Append line to current content
        if current_main_section is None:
            # If no current section, start with 'Abstract'
            current_main_section = 'Abstract'
        current_content.append(line)

    # Save the last section
    if current_main_section is not None:
        content = ' '.join(current_content).strip()
        sections[current_main_section] = content
    return sections

In [32]:
full_temp_text = re.sub(r'\n+', ' ', full_text)

In [39]:
parse_document(full_text)

{'Abstract': 'ABSTRACT Detecting fraudulent activities in financial and e-commerce transac- tion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with

In [41]:
parse_document(full_text)

{'Abstract': 'ABSTRACT Detecting fraudulent activities in financial and e-commerce transac- tion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with

In [45]:
import re

def parse_document(text):
    # Initialize variables
    sections = {}
    current_main_section = None
    current_content = []

    # Split text into lines
    lines = text.split('\n')

    # Combine lines to handle headings that span multiple lines
    combined_lines = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if not line:
            i += 1
            continue  # Skip empty lines
        # Check if this line is a number and the next line is possibly a heading
        if re.match(r'^\d+$', line) and i+1 < len(lines):
            next_line = lines[i+1].strip()
            # Assume that if the next line is not empty, it's a heading
            if next_line:
                combined_line = f"{line} {next_line}"
                combined_lines.append(combined_line)
                i += 2  # Skip the next line as it's part of the heading
            else:
                combined_lines.append(line)
                i += 1
        else:
            combined_lines.append(line)
            i += 1

    # Define heading patterns
    main_heading_pattern = re.compile(r'^(\d+)\s+(.*)$')  # Matches headings like '1 INTRODUCTION'
    sub_heading_pattern = re.compile(r'^(\d+\.\d+)\s+(.*)$')  # Matches headings like '2.1 Preliminary'
    # Known headings (case-insensitive)
    known_headings = {'Abstract', 'Acknowledgements', 'References', 'Conclusion', 'Introduction', 'Background', 'Related Work', 'Methodology', 'Results', 'Discussion', 'Appendix'}
    known_headings_lower = [h.lower() for h in known_headings]

    for idx, line in enumerate(combined_lines):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Check for main numbered heading
        m_main = main_heading_pattern.match(line)
        if m_main and '.' not in m_main.group(1):
            # Save current section
            if current_main_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_main_section] = content
            # Start new main section
            section_number = m_main.group(1)
            section_title = m_main.group(2).strip()
            current_main_section = f"{section_number} {section_title}"
            current_content = []
            continue

        # Check for unnumbered heading (known headings)
        if line.lower() in known_headings_lower:
            # Save current section
            if current_main_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_main_section] = content
            # Start new main section
            current_main_section = line.strip()
            current_content = []
            continue

        # For subheadings and other content
        # We include subheadings in the content of the current main section
        # So we can check for subheadings and append them to content
        m_sub = sub_heading_pattern.match(line)
        if m_sub:
            # It's a subheading
            sub_section_number = m_sub.group(1)
            sub_section_title = m_sub.group(2).strip()
            current_content.append(f"{sub_section_number} {sub_section_title}")
            continue

        # Append line to current content
        if current_main_section is None:
            # If no current section, start with 'Abstract'
            current_main_section = 'Abstract'
        current_content.append(line)

    # Save the last section
    if current_main_section is not None:
        content = ' '.join(current_content).strip()
        sections[current_main_section] = content
    return sections


In [46]:
parse_document(full_text)

{'Abstract': 'ABSTRACT Detecting fraudulent activities in financial and e-commerce transac- tion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with

In [50]:
import re

import re

def parse_document(text):
    # Preprocess the text to ensure consistent heading formatting
    # Replace any newline followed by whitespace(s) and digit(s) with a newline and the digits
    text = re.sub(r'\n\s*(\d+)', r'\n\1', text)

    # Replace any occurrence where a number is not at the start of a line
    text = re.sub(r'(?<!\n)(\d+\s+[A-Z].*)', r'\n\1', text)

    # Split text into lines
    lines = text.split('\n')

    sections = {}
    current_section = None
    current_content = []

    # Define patterns for main headings and known headings
    main_heading_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.*)$')
    known_headings = {'Abstract', 'Acknowledgements', 'References', 'Conclusion', 'Introduction', 'Background',
                      'Related Work', 'Methodology', 'Results', 'Discussion', 'Appendix'}
    known_headings_lower = {h.lower() for h in known_headings}

    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Check if the line is a known heading
        if line.lower() in known_headings_lower:
            # Save current section
            if current_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_section] = content
            # Start new section
            current_section = line
            current_content = []
            continue

        # Check if the line is a numbered heading
        m = main_heading_pattern.match(line)
        if m:
            # Save current section
            if current_section is not None:
                content = ' '.join(current_content).strip()
                sections[current_section] = content
            # Start new section
            section_number = m.group(1)
            section_title = m.group(2)
            current_section = f"{section_number} {section_title}"
            current_content = []
            continue

        # Otherwise, it's content
        if current_section is None:
            # If no section yet, start with 'Abstract'
            current_section = 'Abstract'
        current_content.append(line)

    # Save the last section
    if current_section is not None and current_content:
        content = ' '.join(current_content).strip()
        sections[current_section] = content

    return sections

In [51]:
parse_document(full_text)

{'Abstract': 'ABSTRACT Detecting fraudulent activities in financial and e-commerce transac- tion networks is crucial, and one effective method for this is Densest Subgraph Discovery (DSD). However, deploying DSD methods in production systems faces substantial scalability challenges due to the predominantly sequential nature of existing methods, which impedes their ability to handle large-scale transaction networks and results in significant detection delays. In this paper, we introduce Dupin, a novel parallel processing framework designed for efficient DSD processing in billion-scale graphs. Dupin is powered by a processing engine that exploits the unique properties of the peeling process, with theoretical guarantees on detection quality and efficiency.Dupin provides user-friendly APIs for flexible customization of DSD objectives and ensures robust adaptability to diverse fraud detection scenarios. Empirical evaluations indicate that Dupin outperforms several existing DSD methods, with