In [79]:
import os
from PyPDF2 import PdfReader, PdfWriter

def merge_pdfs_in_directory(directory, output):
    pdf_writer = PdfWriter()
    # List all files in the given directory
    files = os.listdir(directory)
    # Filter for PDF files
    pdf_files = [f for f in files if f.endswith('.pdf')]
    # Sort files to maintain any necessary order
    pdf_files.sort()

    # Iterate over sorted PDF files and add each page to the writer
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)
        pdf_reader = PdfReader(pdf_path)
        for page in range(len(pdf_reader.pages)):
            pdf_writer.add_page(pdf_reader.pages[page])

    # Write out the merged PDF
    with open(output, 'wb') as out:
        pdf_writer.write(out)

if __name__ == '__main__':
    # Directory containing PDF files
    directory = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia'
    # Output file name
    output = 'Virginia.pdf'
    merge_pdfs_in_directory(directory, output)


KeyboardInterrupt: 

In [13]:
import pymupdf4llm
import pathlib
import pandas as pd

def process_pdf(pdf_path):
    # Convert PDF to Markdown with page chunks and table extraction
    result = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
    
    full_text = ""
    
    for page_num, page in enumerate(result, start=1):
        # Add page content
        full_text += f"Page {page_num}:\n\n{page['text']}\n\n"
        
        # Process tables
        if 'tables' in page and page['tables']:
            full_text += f"Tables on page {page_num}:\n\n"
            for table in page['tables']:
                # Print table structure for debugging
                print(f"Table structure on page {page_num}:")
                print(table.keys())
                
                try:
                    # Try to access 'content' and 'header' keys
                    if 'content' in table and 'header' in table:
                        df = pd.DataFrame(table['content'], columns=table['header'])
                    elif 'cells' in table:
                        # If 'cells' key is present, assume it contains all data including headers
                        df = pd.DataFrame(table['cells'])
                    else:
                        # If neither structure is found, create a simple string representation
                        df = pd.DataFrame(table)
                    
                    # Convert DataFrame to markdown
                    table_md = df.to_markdown(index=False)
                    full_text += table_md + "\n\n"
                except Exception as e:
                    full_text += f"Error processing table: {str(e)}\n\n"
                    print(f"Error processing table on page {page_num}: {str(e)}")
    
    # Optionally, save the full text to a file
    output_path = pathlib.Path(pdf_path).with_suffix('.md')
    output_path.write_text(full_text, encoding='utf-8')
    
    return full_text

# Process all PDF files in a directory
pdf_directory = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia'
for pdf_file in pathlib.Path(pdf_directory).glob("*.pdf"):
    processed_text = process_pdf(str(pdf_file))
    print(f"Processed: {pdf_file}")
    print("Sample of processed text:")
    #print(processed_text[:500])  # Print first 1000 characters as a sample

Processing D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_10_Subsection_1001_Section 1001 Administration.pdf...
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_10_Subsection_1001_Section 1001 Administration.pdf
Sample of processed text:
Processing D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1101_Section 1101 General.pdf...
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1101_Section 1101 General.pdf
Sample of processed text:
Processing D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1102_Section 1102 Compliance.pdf...
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1102_Section 1102 Compliance.pdf
Sample of processed text:
Processing D:\G_sync\Study\SE

In [202]:
import os
import re
import json
import pathlib
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
phrases_to_remove = [
    "# # copyright 2024 international code council , inc. , licensors ( rights reserved ) . accessed venkatesh shanmugam",
    "pursuant license agreement icc . reproduction distribution authorized .",
    "unauthorized reproduction distribution violation federal copyright , subject civil -- -- -"
]

# Define stopwords that should be retained for context-specific reasons
retained_stopwords = {
    'shall', 'must', 'may', 'should', 'section', 'chapter', 'article', 'table',
    'inch', 'inches', 'feet', 'foot', 'mm', 'psi', 'mpa'
}

# Get the default set of English stopwords and remove the retained ones
default_stopwords = set(stopwords.words('english'))
custom_stopwords = default_stopwords.difference(retained_stopwords)

# Define the copyright pattern with improved flexibility
COPYRIGHT_PATTERN = re.compile(
    r'##\s+copyright\s+\d{4}\s+international\s+code\s+council\s*,\s*'
    r'inc\.\s*,\s*licensors\s*\([^)]*\)\s*\.\s*accessed\s+.*?\d{1,2}/\d{1,2}/\d{4}\s+'
    r'pursuant\s+to\s+license\s+agreement\s+with\s+icc\s*\.\s*reproduction\s+and\s+'
    r'distribution\s+authorized\s*\.\s*unauthorized\s+reproduction\s+and\s+'
    r'distribution\s+is\s+a\s+violation\s+of\s+federal\s+copyright\s*,\s*subject\s+to\s+'
    r'civil\s+and\s+criminal\s+penalties\s*\.*\s*-+',
    re.DOTALL | re.IGNORECASE
)


# Define patterns for removing hyperlinks
LINK_PATTERNS = [
    r'\[([^\]]+)\]\(http[s]?://[^\)]+\)',  # Convert markdown links to text
    r'\(http[s]?://[^\)]+\)',              # Remove plain URL in parenthesis
    r'http[s]?://\S+',                     # Remove standalone URLs
    r'\[http[s]?://[^\]]+\]',              # Remove markdown URLs without display text
    r'_Accessed by.*?thereunder_',         # Remove access information
    r'_+'                                  # Remove extra underscores
]

# Define a pattern for extracting references based on certain keywords
REFERENCE_PATTERN = r'(see|accordance with|comply with|specified in|determined by|defined in|subject to|listed in)\s+((?:Section|Chapter|Table|Figure)\s+[\d\.]+(?:\s*(?:through|to)\s*[\d\.]+)?|\b[A-Z]+\s+[\d\.]+(?:\s*(?:through|to)\s*[\d\.]+)?)'

# Function to remove special character encodings
def clean_special_characters(text):
    text = text.encode('utf-8').decode('unicode_escape')
    text = re.sub(r'[^\x20-\x7E]+', '', text)  # Keeps standard ASCII characters
    return text

# Function to remove copyright and unnecessary content
def remove_copyright_and_links(content):
    content = COPYRIGHT_PATTERN.sub('', content)
    for pattern in LINK_PATTERNS:
        content = re.sub(pattern, '', content)
    return content.strip()

# Function to remove specific phrases from the content
def remove_phrases(content, phrases):
    for phrase in phrases:
        content = content.replace(phrase, '')
    return content


# Function to preprocess Markdown content and extract references
def preprocess_md_content(content):
    # Extract references first
    references = extract_references(content)
    
    # Convert to lower case
    content = content.lower()
    content = remove_phrases(content, phrases_to_remove)
    # General cleanup
    content = re.sub(r'^(CHAPTER|SECTION)\s+\d+.*$', '', content, flags=re.MULTILINE)
    content = re.sub(r'^(Page \d+|Confidential|Do Not Distribute)$', '', content, flags=re.MULTILINE)
    content = re.sub(r'\bPage\s+\d+\b', '', content)
    content = re.sub(r'\s+', ' ', content).strip()
    content = re.sub(r'\*+', ' ', content)
    content = clean_special_characters(content)
    content = remove_copyright_and_links(content)

    # Tokenize and remove custom stopwords
    tokens = word_tokenize(content)
    filtered_tokens = [token for token in tokens if token not in custom_stopwords]

    # Prepare the cleaned content
    cleaned_content = ' '.join(filtered_tokens)

    # Simplify references to a comma-separated list
    simplified_references = ', '.join(set(ref[1] for ref in references))

    return cleaned_content, simplified_references

# Function to extract references using a predefined pattern
def extract_references(content):
    return re.findall(REFERENCE_PATTERN, content, re.IGNORECASE)

# Function to process a Markdown file
def process_md_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Preprocess content and extract simplified references
    cleaned_content, simplified_references = preprocess_md_content(content)
    return cleaned_content, simplified_references

# Function to process all Markdown files in a directory and save to JSON
def process_directory(md_directory, output_json):
    chunks = []
    for md_file in pathlib.Path(md_directory).glob("*.md"):
        cleaned_content, references = process_md_file(md_file)
        chunk = {
            'content': cleaned_content,
            'references': references,
            'file_name': md_file.name
        }
        chunks.append(chunk)
        print(f"Processed: {md_file}")

    # Save processed data to JSON file
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json.dump(chunks, json_file, indent=2)
    print(f"Processed data saved to {output_json}")

# Main execution block to process Markdown files in a specified directory
if __name__ == '__main__':
    md_directory = r"D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia"
    output_json = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'
    process_directory(md_directory, output_json)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_10_Subsection_1001_Section 1001 Administration.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1101_Section 1101 General.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1102_Section 1102 Compliance.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1103_Section 1103 Scoping Requirements.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1104_Section 1104 Accessible Route.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_11_Subsection_1105_Section 1105 Accessible Entrances.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapt

  text = text.encode('utf-8').decode('unicode_escape')


Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_15_Subsection_1507_Section 1507 Requirements for Roof Coverings.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_15_Subsection_1508_Section 1508 Roof Insulation.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_15_Subsection_1509_Section 1509 Roof Coatings.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_15_Subsection_1510_Section 1510 Radiant Barriers Installed Above Deck.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_15_Subsection_1511_Section 1511 Rooftop Structures.md
Processed: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\Virginia\Chapter_15_Subsection_1512_Section 1512 Roofing and Roofing Repair.md
Processed: D:\G_sync\Study\SEM_3\Data_mining

In [203]:
import json
import spacy
from tqdm import tqdm

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def semantic_refinement(chunks):
    """Refines each chunk to ensure semantic coherence."""
    refined_chunks = []
    for chunk in chunks:
        doc = nlp(chunk['content'])
        sentences = [sent.text.strip() for sent in doc.sents]
        refined_chunk = {
            'content': ' '.join(sentences),
            'references': chunk['references'],
            'file_name': chunk['file_name']
        }
        refined_chunks.append(refined_chunk)
    return refined_chunks

def sliding_window_chunking(content, metadata, window_size=1500, stride=250):
    """Applies a sliding window technique to content to standardize chunk sizes based on word count."""
    words = content.split()
    chunks = []
    start = 0
    while start + window_size <= len(words):
        chunk = {
            'content': " ".join(words[start:start + window_size]),
            'references': metadata['references'],
            'file_name': metadata['file_name']
        }
        chunks.append(chunk)
        start += stride
    if start < len(words):  # Handle the last chunk which might be smaller than the window
        chunk = {
            'content': " ".join(words[start:]),
            'references': metadata['references'],
            'file_name': metadata['file_name']
        }
        chunks.append(chunk)
    return chunks

def process_document(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    processed_chunks = []
    for entry in tqdm(data):
        content = entry['content']
        metadata = {'file_name': entry['file_name'], 'references': entry.get('references', '')}
        initial_chunk = {'content': content, 'references': metadata['references'], 'file_name': metadata['file_name']}
        refined_chunks = semantic_refinement([initial_chunk])
        for refined_chunk in refined_chunks:
            final_chunks = sliding_window_chunking(refined_chunk['content'], metadata=metadata)
            processed_chunks.extend(final_chunks)
    
    return processed_chunks

def save_processed_data(processed_chunks, output_file):
    """Saves processed chunks into a new JSON file."""
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(processed_chunks, file, indent=4)

def main():
    input_path = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'
    output_path = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'
    processed_chunks = process_document(input_path)
    save_processed_data(processed_chunks, output_path)
    print("Processing completed successfully. Processed data saved.")

if __name__ == "__main__":
    main()


100%|██████████| 353/353 [00:53<00:00,  6.62it/s]

Processing completed successfully. Processed data saved.





In [204]:
import json
import re
from collections import defaultdict
import string 
import random
# Set the input and output file paths
path = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'

# def extract_details_from_content(content):
#     # Regular expression to match the structured beginning of the content
#     pattern = re.compile(
#         r'# chapter (?P<chapter_number>\d+) .*? section (?P<section_number>\d+) .*? (?P<section_title>[^.]+)',
#         re.IGNORECASE | re.DOTALL
#     )
#     match = pattern.search(content)
#     if match:
#         # Extracting chapter number, section number, and the title of the section
#         return {
#             "chapter_number": int(match.group('chapter_number')),
#             "section_number": int(match.group('section_number')),
#             "section_title": match.group('section_title').strip()
#         }
#     return None

def extract_numbers_from_filename(file_name):
    # Pattern to capture both numeric and "Appendix_X" chapters
    pattern = r"Chapter_(Appendix_[A-Za-z0-9]+|\d+)_Subsection_([A-Za-z0-9]+)_Section\s*([A-Za-z0-9 ]+)"
    match = re.search(pattern, file_name)
    if match:
        chapter = match.group(1)
        # Check if chapter is an appendix or numeric
        if chapter.startswith("Appendix_"):
            chapter_type = "appendix"
            chapter_number = chapter.split("_")[1]  # Split "Appendix_H" into ["Appendix", "H"]
        else:
            chapter_type = "chapter"
            chapter_number = int(chapter)  # Convert numeric chapter to integer

        return {
            "chapter_type": chapter_type,
            "chapter": chapter_number,
            "subsection": match.group(2),
            "section": match.group(3).strip()  # Remove extra spaces around section identifier
        }

    return {"chapter_type": None, "chapter": None, "subsection": None, "section": None}

def process_json_file(json_filepath):
    with open(json_filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for chunk in data:
        file_name = chunk.get('file_name', '')
        extracted_numbers = extract_numbers_from_filename(file_name)
        chunk.update(extracted_numbers)
    return data


def generate_random_metadata(metadata):
    if metadata['chapter_type'] is None:
        metadata['chapter_type'] = random.choice(['chapter', 'appendix'])
    
    if metadata['chapter'] is None:
        if metadata['chapter_type'] == 'chapter':
            metadata['chapter'] = random.randint(1, 100)
        else:
            metadata['chapter'] = random.choice(string.ascii_uppercase)
    
    if metadata['subsection'] is None:
        metadata['subsection'] = f"{random.randint(1, 999):03d}"
    
    if metadata['section'] is None:
        metadata['section'] = f"{random.randint(1, 999):03d}"
    
    return metadata
def sort_chunks(chunks):
    def get_sort_key(chunk):
        # Retrieve the values and provide defaults to ensure all keys are comparable
        chapter = chunk.get('chapter', '')
        subsection = chunk.get('subsection', '')
        section = chunk.get('section', '')

        # Normalize all sorting keys to strings to avoid comparison issues between str and int
        chapter_key = str(chapter) if chapter is not None else '99999'  # Use a high number as str for undefined chapters
        subsection_key = str(subsection) if subsection is not None else '99999'
        
        # Assuming sections could be numeric or strings, standardize to string
        section_key = str(section).split()[0] if section else '99999'

        return (chapter_key, subsection_key, section_key)

    return sorted(chunks, key=get_sort_key)

    #return sorted(chunks, key=get_sort_key)
def assign_subsection_chunk_numbers(chunks):
    section_groups = defaultdict(list)
    for chunk in chunks:
        chapter_number = chunk.get('chapter_number', 'unknown')
        section_number = chunk.get('section_number', 'unknown')
        key = (chapter_number, section_number)
        section_groups[key].append(chunk)
    for group_chunks in section_groups.values():
        for idx, chunk in enumerate(group_chunks, start=1):
            chunk['subsection_chunk_number'] = idx
    return chunks

def create_unique_chunk_ids(chunks):
    for chunk in chunks:
        chapter_type = chunk.get('chapter_type', 'unknown')
        chapter = chunk.get('chapter', 'unknown')
        section = chunk.get('section', 'unknown')
        subsection = chunk.get('subsection', 'unknown')
        subsection_chunk_number = chunk.get('subsection_chunk_number', 1)
        chunk['chunk_id'] = f"{chapter_type}_{chapter}_S{section}_SS{subsection}_C{subsection_chunk_number}"
    return chunks
# def create_unique_chunk_ids(chunks):
#     for chunk in chunks:
#         chapter_type = chunk.get('chapter_type', 'unknown')
#         chapter = chunk.get('chapter', 'unknown')
#         section = chunk.get('section', 'unknown')
#         subsection_chunk_number = chunk.get('subsection_chunk_number', 1)
#         chunk['chunk_id'] = f"{chapter_type}_{chapter}_S{section}_SS{subsection_chunk_number}"
#     return chunks

def generate_metadata(chunks):
    for chunk in chunks:
        # Add chapter type to the metadata to distinguish between standard chapters and appendices
        metadata_fields = [
            'chapter_type',
            'chapter',
            'subsection',
            'section',
            'subsection_chunk_number',
            'references',
            'file_name'
        ]
        metadata = {field: chunk.pop(field, None) for field in metadata_fields}
        chunk['metadata'] = metadata
    return chunks


def remove_chapter_details_from_content(chunks):
    chapter_details_pattern = re.compile(
        r'^\s*chapter\s+\d+\s+[a-z\s]+\s+section\s+\d+\s+[a-z\s.,0-9]*?\d+\.\d+\s+[a-z\s]+\.\s*',
        re.IGNORECASE
    )
    for chunk in chunks:
        if 'content' in chunk:
            original_content = chunk['content']
            chunk['content'] = chapter_details_pattern.sub('', original_content).strip()
    return chunks

def main(input_file, output_file):
    # Step 1: Process JSON and extract details from content
    chunks = process_json_file(input_file)

    # Step 2: Sort chunks by chapter, subsection, and section numbers
    chunks = sort_chunks(chunks)

    # Step 3: Assign subsection chunk numbers
    chunks = assign_subsection_chunk_numbers(chunks)
    for chunk in chunks:
        chunk.update(generate_random_metadata(chunk))
    # Step 4: Create unique chunk IDs
    chunks = create_unique_chunk_ids(chunks)

    # Step 5: Move fields into metadata
    chunks = generate_metadata(chunks)

    # Step 6: Remove chapter details from content
    chunks = remove_chapter_details_from_content(chunks)

    # Save the updated JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, indent=2)

    print(f"Processed and updated JSON saved to {output_file}")

if __name__ == '__main__':
    main(path, path)




Processed and updated JSON saved to D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json


In [None]:
# import json
# import re
# from collections import defaultdict
# from collections import defaultdict
# import random
# import string 
# # Set the input and output file paths
# path = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'

# # def extract_details_from_content(content):
# #     # Regular expression to match the structured beginning of the content
# #     pattern = re.compile(
# #         r'# chapter (?P<chapter_number>\d+) .*? section (?P<section_number>\d+) .*? (?P<section_title>[^.]+)',
# #         re.IGNORECASE | re.DOTALL
# #     )
# #     match = pattern.search(content)
# #     if match:
# #         # Extracting chapter number, section number, and the title of the section
# #         return {
# #             "chapter_number": int(match.group('chapter_number')),
# #             "section_number": int(match.group('section_number')),
# #             "section_title": match.group('section_title').strip()
# #         }
# #     return None

# def extract_numbers_from_filename(file_name):
#     # Pattern to capture both numeric and "Appendix_X" chapters
#     pattern = r"Chapter_(Appendix_[A-Za-z0-9]+|\d+)_Subsection_([A-Za-z0-9]+)_Section\s*([A-Za-z0-9 ]+)"
#     match = re.search(pattern, file_name)
#     if match:
#         chapter = match.group(1)
#         # Check if chapter is an appendix or numeric
#         if chapter.startswith("Appendix_"):
#             chapter_type = "appendix"
#             chapter_number = chapter.split("_")[1]  # Split "Appendix_H" into ["Appendix", "H"]
#         else:
#             chapter_type = "chapter"
#             chapter_number = int(chapter)  # Convert numeric chapter to integer

#         return {
#             "chapter_type": chapter_type,
#             "chapter": chapter_number,
#             "subsection": match.group(2),
#             "section": match.group(3).strip()  # Remove extra spaces around section identifier
#         }

#     return {"chapter_type": None, "chapter": None, "subsection": None, "section": None}

# def process_json_file(json_filepath):
#     with open(json_filepath, 'r', encoding='utf-8') as f:
#         data = json.load(f)
#     for chunk in data:
#         file_name = chunk.get('file_name', '')
#         extracted_numbers = extract_numbers_from_filename(file_name)
#         chunk.update(extracted_numbers)
#     return data

# # def sort_chunks(chunks):
# #     def get_sort_key(chunk):
# #         return (
# #             chunk.get('chapter_number', float('inf')) if chunk.get('chapter_number') is not None else float('inf'),
# #             chunk.get('subsection_number', float('inf')) if chunk.get('subsection_number') is not None else float('inf'),
# #             chunk.get('section_number', float('inf')) if chunk.get('section_number') is not None else float('inf')
# #         )
# #     return sorted(chunks, key=get_sort_key)
# def sort_chunks(chunks):
#     def get_sort_key(chunk):
#         # Retrieve the values and provide defaults to ensure all keys are comparable
#         chapter = chunk.get('chapter', '')
#         subsection = chunk.get('subsection', '')
#         section = chunk.get('section', '')

#         # Normalize all sorting keys to strings to avoid comparison issues between str and int
#         chapter_key = str(chapter) if chapter is not None else '99999'  # Use a high number as str for undefined chapters
#         subsection_key = str(subsection) if subsection is not None else '99999'
        
#         # Assuming sections could be numeric or strings, standardize to string
#         section_key = str(section).split()[0] if section else '99999'

#         return (chapter_key, subsection_key, section_key)

#     return sorted(chunks, key=get_sort_key)

#     #return sorted(chunks, key=get_sort_key)
# def assign_subsection_chunk_numbers(chunks):
#     section_groups = defaultdict(list)
#     for chunk in chunks:
#         chapter_number = chunk.get('chapter_number', 'unknown')
#         section_number = chunk.get('section_number', 'unknown')
#         key = (chapter_number, section_number)
#         section_groups[key].append(chunk)
#     for group_chunks in section_groups.values():
#         for idx, chunk in enumerate(group_chunks, start=1):
#             chunk['subsection_chunk_number'] = idx
#     return chunks
# def assign_default_values(chunks):
#     for chunk in chunks:
#         # Set default or random values if None or missing
#         if chunk.get('chapter_type') is None:
#             chunk['chapter_type'] = "unknown"  # Or any other default logic
#         if chunk.get('chapter') is None:
#             chunk['chapter'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=4))
#         if chunk.get('subsection') is None:
#             chunk['subsection'] = random.randint(1000, 9999)  # Random subsection number
#         if chunk.get('section') is None:
#             chunk['section'] = random.randint(1, 100)  # Random section number
# def create_unique_chunk_ids(chunks):
#     for chunk in chunks:
#         chapter_type = chunk.get('chapter_type', 'unknown')
#         chapter = chunk.get('chapter', 'unknown')
#         section = chunk.get('section', 'unknown')
#         subsection_chunk_number = chunk.get('subsection_chunk_number', 1)
#         # Generate a unique chunk ID based on the available or default data
#         chunk['chunk_id'] = f"{chapter_type}_{chapter}_S{section}_SS{subsection_chunk_number}"
#     return chunks  # Ensure the list is returned after modification

# def generate_metadata(chunks):
#     if chunks is None:
#         return []  # Return an empty list if input is None to prevent errors
#     for chunk in chunks:
#         metadata_fields = [
#             'chapter_type', 'chapter', 'subsection', 'section',
#             'subsection_chunk_number', 'references', 'file_name'
#         ]
#         # Extract metadata and remove it from the main chunk dictionary
#         metadata = {field: chunk.pop(field, None) for field in metadata_fields if field in chunk}
#         chunk['metadata'] = metadata
#     return chunks  # Return the modified list

# def remove_chapter_details_from_content(chunks):
#     if chunks is None:
#         return []  # Return an empty list if input is None to prevent errors
#     chapter_details_pattern = re.compile(...)
#     for chunk in chunks:
#         if 'content' in chunk:
#             original_content = chunk['content']
#             # Replace unwanted chapter details with an empty string
#             chunk['content'] = chapter_details_pattern.sub('', original_content).strip()
#     return chunks  # Return the modified list

# # def create_unique_chunk_ids(chunks):
# #     for chunk in chunks:
# #         # Assuming all required fields are now filled
# #         chapter_type = chunk['chapter_type']
# #         chapter = chunk['chapter']
# #         section = chunk['section']
# #         subsection_chunk_number = chunk.get('subsection_chunk_number', 1)
# #         chunk['chunk_id'] = f"{chapter_type}_{chapter}_S{section}_SS{subsection_chunk_number}"

# # def create_unique_chunk_ids(chunks):
# #     for chunk in chunks:
# #         chapter_type = chunk.get('chapter_type', 'unknown')
# #         chapter = chunk.get('chapter', 'unknown')
# #         section = chunk.get('section', 'unknown')
# #         subsection_chunk_number = chunk.get('subsection_chunk_number', 1)
# #         chunk['chunk_id'] = f"{chapter_type}_{chapter}_S{section}_SS{subsection_chunk_number}"
# #     return chunks

# # def generate_metadata(chunks):
# #     for chunk in chunks:
# #         # Add chapter type to the metadata to distinguish between standard chapters and appendices
# #         metadata_fields = [
# #             'chapter_type',
# #             'chapter',
# #             'subsection',
# #             'section',
# #             'subsection_chunk_number',
# #             'references',
# #             'file_name'
# #         ]
# #         metadata = {field: chunk.pop(field, None) for field in metadata_fields}
# #         chunk['metadata'] = metadata
# #     return chunks


# # def remove_chapter_details_from_content(chunks):
# #     chapter_details_pattern = re.compile(
# #         r'^\s*chapter\s+\d+\s+[a-z\s]+\s+section\s+\d+\s+[a-z\s.,0-9]*?\d+\.\d+\s+[a-z\s]+\.\s*',
# #         re.IGNORECASE
# #     )
# #     for chunk in chunks:
# #         if 'content' in chunk:
# #             original_content = chunk['content']
# #             chunk['content'] = chapter_details_pattern.sub('', original_content).strip()
# #     return chunks

# # def assign_default_values(chunks):
# #     for chunk in chunks:
# #         # Set default or random values if None or missing
# #         if chunk.get('chapter_type') is None:
# #             chunk['chapter_type'] = "unknown"  # Or any other default logic
# #         if chunk.get('chapter') is None:
# #             chunk['chapter'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=4))
# #         if chunk.get('subsection') is None:
# #             chunk['subsection'] = random.randint(1000, 9999)  # Random subsection number
# #         if chunk.get('section') is None:
# # #             chunk['section'] = random.randint(1, 100)  # Random section number
# def generate_metadata(chunks):
#     for chunk in chunks:
#         metadata_fields = [
#             'chapter_type',
#             'chapter',
#             'subsection',
#             'section',
#             'subsection_chunk_number',
#             'references',
#             'file_name'
#         ]
#         metadata = {}
#         for field in metadata_fields:
#             if field in chunk and chunk[field] is not None:
#                 metadata[field] = chunk.pop(field)
#             else:
#                 metadata[field] = assign_default_values(field)
#         chunk['metadata'] = metadata
#     return chunks
# def main(input_file, output_file):
#     # Step 1: Process JSON and extract details from content
#     chunks = process_json_file(input_file)

#     # Step 2: Sort chunks by chapter, subsection, and section numbers
#     chunks = sort_chunks(chunks)

#     # Step 3: Assign subsection chunk numbers
#     chunks = assign_subsection_chunk_numbers(chunks)

#     # Step 4: Create unique chunk IDs
#     chunks = create_unique_chunk_ids(chunks)

#     # Step 5: Move fields into metadata
#     chunks = generate_metadata(chunks)
    
#     # Step 6: Remove chapter details from content
#     chunks = remove_chapter_details_from_content(chunks)

#     # Save the updated JSON
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(chunks, f, indent=2)

#     print(f"Processed and updated JSON saved to {output_file}")

# if __name__ == '__main__':
#     main(path, path)


AttributeError: 'str' object has no attribute 'get'

In [157]:
import json

# Path to the JSON file
json_file_path = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'

# Function to count chunks with more than 100 words
def count_large_chunks_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        chunks = json.load(file)  # Load the data from the JSON file
    count = 0
    for chunk in chunks:
        if len(chunk["content"].split()) > 2000:  # Split the text into words and count them
            count += 1
    return count

# Calculate the count of large chunks
large_chunk_count = count_large_chunks_from_file(json_file_path)
print(f"The number of chunks with more than 100 words is: {large_chunk_count}")


The number of chunks with more than 100 words is: 0


In [207]:
import json
import openai
from tqdm import tqdm

# Set OpenAI API key
openai.api_key = "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"


# Function to prepend a custom prompt to the input text
def add_custom_prompt(text):
   prompt = (
        '''Generate a vector embedding for the following text chunk from a building code document. The embedding should:

* Accurately represent the meaning and context of the text.
* Capture the relationships between key technical terms and concepts.
* Emphasize the importance of technical words specific to building codes (e.g., "fire-resistance," "structural integrity," "load-bearing").
* Prioritize words that convey legal obligations and permissions (e.g., "shall," "must," "may").
* Be suitable for use in a retrieval-augmented generation (RAG) system where semantic similarity between chunks is crucial.**Details:**\n'''
    )
   return prompt + text

# Function to generate embeddings using OpenAI API with a custom prompt
def generate_embeddings(data):
    embeddings_with_ids = []
    for chunk in tqdm(data):
        try:
            # Add custom prompt to the text
            text = add_custom_prompt(chunk['content'])
            chunk_id = chunk['chunk_id']
            metadata = chunk['metadata']  # Assuming 'metadata' is correctly formatted and exists in each chunk

            # Generate embedding via OpenAI API
            response = openai.Embedding.create(
                input=text,
                model="text-embedding-3-large"
            )

            # Extract embedding from response
            embedding = response['data'][0]['embedding']

            # Store embeddings with ID and metadata
            embeddings_with_ids.append({
                'chunk_id': chunk_id, 
                'embedding': embedding,
                'metadata': metadata
            })
        except Exception as e:
            print(f"Error processing chunk_id: {chunk.get('chunk_id', 'unknown')}, Error: {str(e)}")
            continue

    return embeddings_with_ids

# Function to load data from a JSON file
def load_data_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Function to save embeddings to a JSON file
def save_embeddings_to_file(embeddings_with_ids, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_with_ids, f, indent=4)

# Function to process data
def process_data(input_file, output_file):
    data = load_data_from_json(input_file)
    print(f"Loaded {len(data)} chunks from {input_file}")
    
    embeddings_with_ids = generate_embeddings(data)
    print(f"Generated embeddings for {len(embeddings_with_ids)} chunks")
    
    save_embeddings_to_file(embeddings_with_ids, output_file)
    print(f"Saved embeddings to {output_file}")

# Specify the input and output file paths
input_json_file = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json'
output_json_file = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\embedding_chunks.json'

# Run the process
process_data(input_json_file, output_json_file)


Loaded 929 chunks from D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json


  0%|          | 0/929 [00:00<?, ?it/s]

100%|██████████| 929/929 [05:36<00:00,  2.76it/s]


Generated embeddings for 929 chunks
Saved embeddings to D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\embedding_chunks.json


In [None]:
# import json

# def load_json_data(input_file):
#     """Load data from a JSON file."""
#     with open(input_file, 'r', encoding='utf-8') as file:
#         data = json.load(file)
#     return data

# def process_chunks(data):
#     """Process each chunk to retain only 'content' and 'chunk_id'."""
#     processed_data = []
#     for item in data:
#         # Extract only the 'content' and 'chunk_id' and ignore other metadata
#         processed_chunk = {
#             'embedding': item['embedding'],
#             'chunk_id': item['chunk_id']
#         }
#         processed_data.append(processed_chunk)
#     return processed_data

# def save_processed_data(processed_data, output_file):
#     """Save the processed data to a new JSON file."""
#     with open(output_file, 'w', encoding='utf-8') as file:
#         json.dump(processed_data, file, indent=4)

# def main():
#     input_file = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\embedding_chunks.json'  # Path to your input JSON file
#     output_file = r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\embedding_chunks.json'  # Path to save the output JSON file

#     # Load data from the input JSON file
#     data = load_json_data(input_file)

#     # Process the data to extract necessary fields
#     processed_data = process_chunks(data)

#     # Save the processed data to a new JSON file
#     save_processed_data(processed_data, output_file)
#     print("Processing completed successfully. Processed data saved to:", output_file)

# if __name__ == "__main__":
#     main()


Processing completed successfully. Processed data saved to: D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\embedding_chunks.json


In [210]:
from pinecone import Pinecone
import json
from tqdm import tqdm

# Initialize Pinecone
pc = Pinecone(api_key="pcsk_m74DA_EoW1zexNTdNSVetA3abdZTgf8su66GumFShesKigC49TLeiHqxwtM6bAyDAWjfQ")

# Connect to your index
index_name = "virginia"
index = pc.Index(index_name)

# Load your data
with open(r'D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\embedding_chunks.json', 'r') as f:
    data = json.load(f)

# Prepare vectors for upsert
vectors_to_upsert = []
batch_size = 100  # Adjust based on your needs and Pinecone limits

for item in tqdm(data):
    try:
        # Extract the embedding (ensure it is a list of floats)
        vector = item['embedding']  # The embedding should already be a list of floats

        # Validate that the embedding is a list of floats
        if not isinstance(vector, list) or not all(isinstance(v, float) for v in vector):
            raise ValueError(f"Invalid embedding format for chunk_id {item['chunk_id']}: {vector}")

        # Prepare the metadata
        metadata = item['metadata']

        # Add the vector to the upsert batch
        vectors_to_upsert.append((item['chunk_id'], vector, metadata))

        # If batch is full, upsert and clear
        if len(vectors_to_upsert) >= batch_size:
            index.upsert(vectors=vectors_to_upsert)
            vectors_to_upsert = []
    except Exception as e:
        print(f"Error processing chunk_id {item.get('chunk_id', 'unknown')}: {str(e)}")
        continue

# Upsert any remaining vectors
if vectors_to_upsert:
    index.upsert(vectors=vectors_to_upsert)

print("Vectors uploaded to Pinecone successfully!")


100%|██████████| 929/929 [00:18<00:00, 49.38it/s]


Vectors uploaded to Pinecone successfully!


In [1]:
querry  = "What are the building code requirements for structures located in historic districts"

In [2]:
import pinecone
import openai
import nltk
from nltk.corpus import stopwords

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from pinecone import Pinecone
import json
from tqdm import tqdm

# Initialize Pinecone
pc = Pinecone(api_key="pcsk_m74DA_EoW1zexNTdNSVetA3abdZTgf8su66GumFShesKigC49TLeiHqxwtM6bAyDAWjfQ")

# Connect to your index
index_name = "virginia"
index = pc.Index(index_name)


# OpenAI API key setup
openai.api_key = "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"

# Function to preprocess the user query
def preprocess_query(query):
    """
    Preprocess the user query:
    - Convert to lowercase
    - Remove stopwords
    - Strip whitespace
    """
    # Convert query to lowercase and tokenize
    query = query.lower().strip()
    words = query.split()

    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]

    # Join the words back into a string
    preprocessed_query = ' '.join(filtered_words)

    return preprocessed_query

# Function to generate embeddings using OpenAI
def generate_query_embedding(query):
    """
    Generate embedding for the given query using the OpenAI API.
    """
    # Preprocess the query
    query = preprocess_query(query)
    
    # Call OpenAI embedding API
    response = openai.Embedding.create(
        input=query,
        model="text-embedding-3-large"  # OpenAI's recommended embedding model
    )
    
    # Extract embedding
    embedding = response['data'][0]['embedding']
    return embedding

# Function to query Pinecone
def query_pinecone(query_embedding, top_k=3):
    """
    Query the Pinecone index with the given query embedding and retrieve top_k relevant chunks.
    """
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    retrieved_chunks = []
    
    for match in results['matches']:
        retrieved_chunks.append({
            'id': match['id'],
            'score': match['score'],
            #'metadata': match['metadata']
        })
    return retrieved_chunks

# Example Usage
user_query = querry 

# Generate query embedding
query_embedding = generate_query_embedding(user_query)

# Retrieve top 5 relevant chunks
retrieved_chunks = query_pinecone(query_embedding, top_k=5)

# Display the results
print("Retrieved Chunks:")
for chunk in retrieved_chunks:
    print(chunk)


  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Retrieved Chunks:
{'id': 'chapter_2_S202 Definitions_SS202_C319', 'score': 0.466293037}
{'id': 'chapter_13_S1301 General_SS1301_C60', 'score': 0.459113389}
{'id': 'chapter_16_S1604 General Design Requirements_SS1604_C129', 'score': 0.455680847}
{'id': 'chapter_2_S202 Definitions_SS202_C263', 'score': 0.451614946}
{'id': 'chapter_13_S1301 General_SS1301_C61', 'score': 0.442111164}


In [181]:
import json

# Path to your JSON file containing the full content
json_file_path = r"D:\G_sync\Study\SEM_3\Data_mining\Project\streamlit_app_test_upload\app\data\existing_codes\processed_chunks.json"

def get_chunk_content(chunk_id, json_file_path):
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    for item in data:
        if item['chunk_id'] == chunk_id:
            return item.get('content', "Content not found")
    
    return "Chunk ID not found"

# Variable to store retrieved chunks with content
retrieved_chunks_with_content = []

print("Retrieved chunks with content:")
for chunk in retrieved_chunks:
    chunk_id = chunk['id']
    content = get_chunk_content(chunk_id, json_file_path)
    
    # Store the information in a dictionary
    chunk_info = {
        'chunk_id': chunk_id,
        'score': chunk['score'],
        #'metadata': chunk['metadata'],
        'content': content
    }
    
    # Append the dictionary to the list
    retrieved_chunks_with_content.append(chunk_info)

# Optionally print the stored information
for chunk in retrieved_chunks_with_content:
    print(f"chunk_id: {chunk['chunk_id']}")
    print(f"Score: {chunk['score']}")
    #print(f"Metadata: {chunk['metadata']}")
    print(f"Content: {chunk['content']}...")  # Print first 200 characters of content
    print()

Retrieved chunks with content:
chunk_id: chapter_2_S202 Definitions_SS319
Score: 0.46557346
Content: systems required electrical loads interruption primary power could create hazards hamper rescue fire-fighting operations . [ f ] standpipe , types . standpipe types follows : automatic dry . dry standpipe system , normally filled pressurized air , arranged use device , dry pipe valve , admit water system piping automatically upon opening hose valve . water supply automatic dry standpipe system shall capable supplying system demand . automatic wet . wet standpipe system water supply capable supplying system demand automatically . manual dry . dry standpipe system permanent water supply attached system . manual dry standpipe systems require water fire department pumper pumped system fire department connection order meet system demand . manual wet . wet standpipe system connected water supply purpose maintaining water within system water supply capable delivering system demand attached sys

In [182]:
extracted_content = []

# Extracting content from retrieved chunks
for chunk in retrieved_chunks_with_content:
    content = chunk['content']
    extracted_content.append(content)

# Now extracted_content holds all the extracted contents
print("Extracted Content:")
for i, content in enumerate(extracted_content, start=1):
    print(f"Content {i}: {content}")

Extracted Content:
Content 1: systems required electrical loads interruption primary power could create hazards hamper rescue fire-fighting operations . [ f ] standpipe , types . standpipe types follows : automatic dry . dry standpipe system , normally filled pressurized air , arranged use device , dry pipe valve , admit water system piping automatically upon opening hose valve . water supply automatic dry standpipe system shall capable supplying system demand . automatic wet . wet standpipe system water supply capable supplying system demand automatically . manual dry . dry standpipe system permanent water supply attached system . manual dry standpipe systems require water fire department pumper pumped system fire department connection order meet system demand . manual wet . wet standpipe system connected water supply purpose maintaining water within system water supply capable delivering system demand attached system . manual-wet standpipe systems require water fire department pumper

In [183]:
import openai
import os

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "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"  # Replace with your actual API key

# Initialize OpenAI client
openai.api_key = os.getenv("OPENAI_API_KEY")


def prepare_input(retrieved_chunks_with_content):
    # Combine content from all chunks into a single string
    combined_content = "\n\n".join([f"ID: {chunk['chunk_id']}\nContent: {chunk['content']}" for chunk in retrieved_chunks_with_content])
    
    # Create a prompt for ChatGPT
    prompt = f"The following are relevant documents:\n{combined_content}\n\nBased on this information, check whether the context is enough for answering the question .Question: {querry} \n"
    
    return prompt


def generate_response(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4o",  # Use "gpt-4" for ChatGPT-4
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000,  # Adjust based on how long you want the response to be
        temperature=0.7,  # Adjust creativity level (0.0 - 1.0)
    )
    
    return response['choices'][0]['message']['content']


# Prepare input for ChatGPT
prompt = prepare_input(retrieved_chunks_with_content)

# Generate response from ChatGPT
response = generate_response(prompt)

# Print the response from ChatGPT
print("ChatGPT Response:")
print(response)

ChatGPT Response:
The provided documents do not contain specific information about building code requirements for structures located in historic districts. The excerpts focus on general construction definitions, various structural and design requirements, and regulations related to specific systems and components, but they do not address specific guidelines or exceptions for buildings in historic districts. You may need to consult additional resources or documents related to historic preservation codes or local regulations governing historic districts to get a comprehensive answer.
