**Process (2639 files)**

1. open text file

2. Record each section header that matches the pattern number.number.number (e.g. 1.01.010).

3. If a section (A) is found within another section (B), A will be copied into B.

In [9]:
import os

def extract_file_name(file_path):
    """Return the final path component of file_path without its last extension.

    Args:
        file_path: Path string; only its last component is used.

    Returns:
        The base name with the final ``.ext`` suffix removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem

import chardet

def detect_encoding(filepath):
    """Guess the text encoding of a file with chardet.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    with open(filepath, 'rb') as handle:
        guess = chardet.detect(handle.read())
    return guess['encoding']

def clean_line(line):
    """Normalise one line of text before section matching.

    Steps, in effect:
      * every non-ASCII character becomes a single space,
      * the line is lower-cased,
      * the characters ``" , ! ? * [ ]`` and the backslash are deleted,
      * every semicolon becomes a space.

    Args:
        line: Raw line of text (a trailing newline, if any, is kept).

    Returns:
        The cleaned line.
    """
    ascii_only = re.sub(r'[^\x00-\x7f]', r' ', line).lower()
    # The remaining substitutions act on disjoint single characters, so one
    # translate pass gives the same result as applying them one after another.
    char_table = str.maketrans(';', ' ', '",!?*[]\\')
    return ascii_only.translate(char_table)


import re
import pickle

def text_collect(filename):
    """Split one text file into numbered sections and append them to grouped_text.pkl.

    Each line is normalised with ``clean_line``. A line that then starts with
    a ``number.number.number`` header opens a new section; the rest of that
    line and every following line up to the next header are stripped and
    joined with single spaces to form the section text. Lines that appear
    before the first header are discarded.

    Args:
        filename: Path of the text file to read. Its encoding is taken from
            ``detect_encoding`` and undecodable bytes are replaced.

    Returns:
        None. The list of section dicts (keys ``"Section"``, ``"Text"`` and
        ``"Zoning Location"``) is appended to ``grouped_text.pkl`` as one
        pickle record, even when no section was found.
    """
    pattern = re.compile(r"^\d+\.\d+\.\d+")
    # Every section carries the bare file name. Computing it once also fixes
    # the final section of each file, which used to store the full path.
    zoning_location = extract_file_name(filename)
    sections = []
    current_section = None
    collected_text = []

    def store_current_section():
        """Append the section being collected, if one has been opened."""
        if current_section:
            sections.append({
                "Section": current_section,
                "Text": ' '.join(collected_text),
                "Zoning Location": zoning_location
            })

    with open(filename, 'r', encoding=detect_encoding(filename), errors='replace') as file:
        for line in file:
            line = clean_line(line)
            match = pattern.match(line)
            if match:
                # A new header closes whatever section was open before it.
                store_current_section()
                current_section = match.group()
                collected_text = [line[len(current_section):].strip()]
            elif current_section:
                collected_text.append(line.strip())

    # The last section has no following header to close it.
    store_current_section()

    # Append mode: each call adds one more pickle record to the same file.
    with open("grouped_text.pkl", 'ab') as pkl_file:
        pickle.dump(sections, pkl_file)

# text_collect appends to grouped_text.pkl, so empty the file before a new run.
# Opening it in 'wb' mode truncates it (or creates it when it does not exist).
open("grouped_text.pkl", 'wb').close()

import os
import glob

def process_files(directory):
    """Run text_collect on every .txt file under directory, recursively.

    Args:
        directory: Root folder to search; nested folders are included.
    """
    # '**' combined with recursive=True matches the root folder and all
    # folders below it.
    search_glob = os.path.join(directory, '**', '*.txt')
    for txt_path in glob.glob(search_glob, recursive=True):
        text_collect(txt_path)

# Root folder holding the text files to process.
# NOTE(review): this is an absolute path on one specific machine; it has to be
# edited before the notebook can run anywhere else.
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0'

# Walk the folder recursively; every .txt file found is split into sections
# and appended to grouped_text.pkl by text_collect.
process_files(directory)


In [10]:
# grouped_text.pkl holds one pickled list per processed file, because
# text_collect opens it in append mode and dumps once per call. A single
# pickle.load would return only the first file's sections, so keep loading
# until the stream is exhausted and merge everything into one list.
# An empty file simply yields an empty list.
sections = []
with open("grouped_text.pkl", 'rb') as pkl_file:
    while True:
        try:
            sections.extend(pickle.load(pkl_file))
        except EOFError:
            break

sections

[{'Section': '1.01.010',
  'Text': 'designation and citation of code. ',
  'Zoning Location': 'AK_Homer'},
 {'Section': '1.01.020',
  'Text': 'public copy of code.  prior legislation: ord. 80-3  code 1967   1-100.1.  ',
  'Zoning Location': 'AK_Homer'},
 {'Section': '1.01.010',
  'Text': 'designation and citation of code.  the ordinances included in the following titles chapters articles and sections constitute and are designated as the  code of ordinances city of homer alaska  and may be so cited. this code may also be cited as the  homer city code  or  hcc.  ord. 92-24   1 1992  ord. 82-2   1 1982.  ',
  'Zoning Location': 'AK_Homer'},
 {'Section': '1.01.020',
  'Text': 'public copy of code.  a. a copy of this code shall be kept on file in the office of the city clerk. the city clerk or the clerk s designee shall insert in their designated places all amendments or ordinances that indicate the intention of the city council to make the same a part of such code and to extract from such 

In [12]:
import os
import re

# Section-header pattern: three dot-separated digit groups at the start of a line.
# re.MULTILINE lets ^ match at the start of every line, so the pattern can be
# searched against a whole file read as one string.
pattern = re.compile(r"^\d+\.\d+\.\d+", re.MULTILINE)

# Report whether one file contains a section header
def check_pattern_in_file(file_path):
    """Print whether the module-level ``pattern`` occurs anywhere in a file.

    The file is decoded with the encoding returned by ``detect_encoding``;
    undecodable bytes are replaced.

    Args:
        file_path: Path of the text file to search.
    """
    source_encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=source_encoding, errors='replace') as handle:
        has_header = pattern.search(handle.read()) is not None

    if not has_header:
        print(f"Pattern not found in {file_path}.")
    else:
        print(f"Pattern found in {file_path}.")

# Apply check_pattern_in_file to every .txt file below a folder
def check_pattern_in_directory(directory):
    """Walk directory recursively and check each .txt file for the pattern.

    Args:
        directory: Root folder to walk.
    """
    for folder, _subfolders, names in os.walk(directory):
        txt_names = [name for name in names if name.endswith('.txt')]
        for name in txt_names:
            check_pattern_in_file(os.path.join(folder, name))

# Root folder to scan.
# NOTE(review): absolute path on one specific machine; edit before running
# the notebook elsewhere.
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0'

# Print, for every .txt file in the folder and its subfolders, whether the
# section-header pattern was found.
check_pattern_in_directory(directory)

Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AK_Homer.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\CA_SanLuisObispo.txt.


In [None]:
import os
import re
import pickle

# Section-header pattern: three dot-separated digit groups at the start of a line.
# re.MULTILINE lets ^ match at the start of every line, so the pattern can be
# searched against a whole file read as one string.
pattern = re.compile(r"^\d+\.\d+\.\d+", re.MULTILINE)

# Tell whether one file contains a section header
def check_pattern_in_file(file_path):
    """Return True when the module-level ``pattern`` occurs anywhere in a file.

    The file is decoded with the encoding returned by ``detect_encoding``;
    undecodable bytes are replaced.

    Args:
        file_path: Path of the text file to search.

    Returns:
        True if the pattern matches somewhere in the file, otherwise False.
    """
    source_encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=source_encoding, errors='replace') as handle:
        text = handle.read()
    return pattern.search(text) is not None

# Sort every .txt file below a folder by whether it contains the pattern
def check_pattern_in_directory(directory):
    """Check each .txt file under directory and save the two resulting lists.

    Every file is reported on stdout. Paths where ``check_pattern_in_file``
    returns a true value are pickled to ``matches.pkl``; all other paths are
    pickled to ``non_matches.pkl``. Both files are overwritten.

    Args:
        directory: Root folder to walk recursively.
    """
    found_paths = []
    missing_paths = []

    for folder, _subfolders, names in os.walk(directory):
        for name in names:
            if not name.endswith('.txt'):
                continue
            candidate = os.path.join(folder, name)
            if check_pattern_in_file(candidate):
                print(f"Pattern found in {candidate}.")
                found_paths.append(candidate)
            else:
                print(f"Pattern not found in {candidate}.")
                missing_paths.append(candidate)

    # Persist both lists, matches first.
    outputs = (('matches.pkl', found_paths), ('non_matches.pkl', missing_paths))
    for target, payload in outputs:
        with open(target, 'wb') as f:
            pickle.dump(payload, f)

# Root folder to scan.
# NOTE(review): absolute path on one specific machine; edit before running
# the notebook elsewhere.
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1'

# Report every .txt file in the folder and its subfolders, then write the
# matching and non-matching paths to matches.pkl and non_matches.pkl.
check_pattern_in_directory(directory)

Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Anchorage.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Homer.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Kenai.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Palmer.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Petersburg.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Sitka.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Yakutat.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AL_Athens.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AL_Auburn.txt.
Pattern not found in C:\Users\clint\Desktop\Tr