**Process (2639 files)**

1. open text file

2. Record each section header that matches the pattern number.number.number (e.g. 4.3.1).

3. If a section (A) is found within another section (B), A will be copied into B.

In [3]:
import os
import re
import pickle

# Path of the pickle file that accumulates the extracted section records.
pkl_name= "grouped_text.pkl"
# Opening the file in 'wb' mode truncates it, so every run of this cell starts
# from an empty dataset (add_entries_to_dataset treats a zero-byte file as []).
with open(pkl_name, 'wb') as pkl_file:
    pass  # This will clear the file

def extract_file_name(file_path):
    """Return the last component of ``file_path`` with its extension removed.

    Only the final extension is dropped, so ``archive.tar.gz`` becomes
    ``archive.tar``.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem

import chardet

def detect_encoding(filepath):
    """Guess the text encoding of ``filepath`` using chardet.

    The entire file is read as bytes and passed to ``chardet.detect``; the
    value stored under the ``'encoding'`` key of its result is returned as is.

    NOTE(review): chardet is documented to report ``None`` when it cannot
    make a guess -- confirm callers are happy to receive that value.
    """
    with open(filepath, 'rb') as raw_file:
        detection = chardet.detect(raw_file.read())
        return detection['encoding']

def clean_line(line):
    """Normalise one line of text before section matching.

    Steps, in order:
      1. every non-ASCII character is replaced by a single space;
      2. the text is lower-cased;
      3. the characters ``" , ! ? * [ ]`` and backslashes are deleted, and
         each ``;`` becomes a space.
    """
    # Non-ASCII must be blanked *before* lower-casing: some non-ASCII letters
    # lower-case into ASCII ones and would otherwise survive.
    ascii_only = re.sub(r'[^\x00-\x7f]', ' ', line)
    lowered = ascii_only.lower()
    # One translation table covers the three independent single-character
    # substitutions: ';' -> ' ', and deletion of the listed punctuation.
    punctuation_table = str.maketrans(';', ' ', '",!?*[]\\')
    return lowered.translate(punctuation_table)

import pickle

def add_entries_to_dataset(file_path, new_entries):
    """Append ``new_entries`` to the pickled list stored at ``file_path``.

    A missing or zero-byte file is treated as an empty dataset. Otherwise the
    existing pickle is loaded, extended with ``new_entries`` and the whole
    list is written back, replacing the previous file contents.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file; only
    use this on pickles produced by this notebook.
    """
    has_saved_data = os.path.exists(file_path) and os.path.getsize(file_path) != 0

    if has_saved_data:
        with open(file_path, 'rb') as source:
            dataset = pickle.load(source)
    else:
        dataset = []

    dataset.extend(new_entries)

    # The full dataset is re-serialised on every call.
    with open(file_path, 'wb') as target:
        pickle.dump(dataset, target)

    print(f"New entries added and dataset saved to {file_path}")

import re
# Section-header forms recognised at the start of a line. They are joined with
# '|' under a single case-insensitive flag to build ``pattern``.
# The last two forms begin with a non-ASCII character, so they can only match
# text that has NOT been passed through clean_line (which blanks non-ASCII).
_SECTION_HEADER_FORMS = (
    r"^\d+\.\d+\.\d+",            # e.g. "4.3.1"
    r"^sec\. \d+-\d+[A-Z]?-\d+",  # e.g. "sec. 12-3A-4"
    r"^naics \d+",                # e.g. "naics 4841"
    r"^section \d+",              # e.g. "section 7"
    r"^§ \d+\.\d+",               # e.g. "§ 1.2"
    r"^� \d+-\d+-\d+",            # same idea with a garbled section sign
)
pattern = re.compile("(?i)" + "|".join(_SECTION_HEADER_FORMS))

def text_collect(filename):
    """Split one text file into section records and append them to the dataset.

    The file is read line by line using the encoding reported by
    ``detect_encoding`` (undecodable bytes are replaced). A line that starts
    with a section header (see ``pattern``) opens a new section; following
    lines are accumulated into that section until the next header. Lines
    before the first header are discarded.

    Each record is a dict with the keys ``"Section"`` (the header text),
    ``"Text"`` (cleaned section lines joined by single spaces) and
    ``"Zoning Location"`` (the file name without directory or extension).
    The records are appended to the pickle at ``pkl_name``.

    Args:
        filename: path of the text file to process.
    """
    location = extract_file_name(filename)
    sections = []
    current_section = None
    collected_text = []

    def close_section():
        # Store the section that is currently open, if there is one.
        if current_section:
            sections.append({
                "Section": current_section,
                "Text": ' '.join(collected_text),
                "Zoning Location": location
            })

    with open(filename, 'r', encoding=detect_encoding(filename), errors='replace') as file:
        for raw_line in file:
            line = clean_line(raw_line)
            match = pattern.match(line)
            if match:
                header = match.group()
                remainder = line[match.end():]
            else:
                # clean_line replaces every non-ASCII character with a space,
                # so headers that begin with one (e.g. the section sign) can
                # never match the cleaned line. Retry on the untouched line so
                # those alternatives of ``pattern`` are reachable.
                match = pattern.match(raw_line)
                if match:
                    header = match.group().lower()
                    remainder = clean_line(raw_line[match.end():])

            if match:
                close_section()
                current_section = header
                collected_text = [remainder.strip()]
            elif current_section:
                collected_text.append(line.strip())

        # Flush the last open section once the file is exhausted.
        close_section()

    add_entries_to_dataset(pkl_name, sections)

import os
import glob

def process_files(directory):
    """Run ``text_collect`` on every ``.txt`` file under ``directory``.

    The search is recursive; each path is announced with a "Processing ..."
    line before it is handled.
    """
    search_pattern = os.path.join(directory, '**', '*.txt')
    matching_paths = glob.glob(search_pattern, recursive=True)
    for txt_path in matching_paths:
        print(f"Processing {txt_path}")
        text_collect(txt_path)

# Directory containing the text files
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# folder exists. Consider moving it to a configurable constant.
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0'

# Process all text files in the directory (appends every file's sections to pkl_name)
process_files(directory)


Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AK_Homer.txt
New entries added and dataset saved to grouped_text.pkl
Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AL_Athens.txt
New entries added and dataset saved to grouped_text.pkl
Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AL_Auburn.txt
New entries added and dataset saved to grouped_text.pkl
Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\CA_SanLuisObispo.txt
New entries added and dataset saved to grouped_text.pkl
Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\WA_Redmond.txt
New entries added and dataset saved to grouped_text.pkl


In [4]:
import pickle

def get_unique_zoning_locations(file_path):
    """Return the set of distinct "Zoning Location" values in a pickled dataset.

    ``file_path`` must point to a pickle holding an iterable of mappings that
    each contain a ``"Zoning Location"`` key; a missing key raises KeyError.
    """
    with open(file_path, 'rb') as source:
        records = pickle.load(source)

    return {record["Zoning Location"] for record in records}

# Example usage: show which source files contributed records to the pickle
# (each record's "Zoning Location" is the source file's name without extension).
print("Unique Zoning Locations:", get_unique_zoning_locations(pkl_name))

Unique Zoning Locations: {'CA_SanLuisObispo', 'AL_Auburn', 'WA_Redmond', 'AK_Homer', 'AL_Athens'}


In [5]:
# Load every record that has been written to the dataset pickle.
with open(pkl_name, 'rb') as file:
    data = pickle.load(file)

# Keep only the records that came from the AL_Athens file.
filtered_data = list(
    filter(lambda record: record.get("Zoning Location") == "AL_Athens", data)
)

# Dump each remaining record, one per line.
for item in filtered_data:
    print(item)

{'Section': '4.3.1', 'Text': '-4.3.5 50-146-50-150   4.4-4.7 50-151-50-154   5.1-5.5 50-155-50-159   6.1 6.2 50-160 50-161 823 7-27-1981 1 2 62-8 62-9   3-5 62-8 839 9-13-1982 2 14-1 840 10- 4-1982 1 2-62   3 2-64 841 10- 4-1982  34-2 848 1-10-1983  62-4 862 7-11-1983 1-12 46-191-46-202 865 8-22-1983 1-5 1-8 866 9-12-1983  54-5 885 10- 1-1984 1 2-62   3 2-64 897 3-11-1985  18-52 903 4-22-1985 1.1 42-31   1.3 42-32   2.1 2.2 42-52 42-53   3.1-3.3 42-71-42-73 912 8-26-1985  50-134 965 10- 6-1986 1 2-62 971 11-11-1986 art. i   1 10-1 10-2   art. i   2    art. i   3 10-6   art. i   4 10-5   art. i   5 10-7   art. i   6 10-4    10-8   art. i   8- 10-9-10-11   art. i   10    art. i   12 10-13   art. ii   1- 10-31-10-33   art. ii   3    art. ii   5- 10-35-10-37   art. ii   7    art. ii   9 10-38 10-39   art. ii   10 10-40 10-41   art. ii   11    art. iii   3- 10-141-10-143   art. iii   5    art. iv   1 10-71   art. iv   8 10-72   art. v   1 10-77 10-78   art. v   2    art. v   3 10-76   art. 

In [7]:
import os
import re

def add_multiline_flag(pattern):
    """Return a copy of a compiled regex with ``re.MULTILINE`` switched on.

    The new pattern keeps every flag the original was compiled with (both
    inline flags such as ``(?i)`` and flags passed to ``re.compile``), so only
    the meaning of ``^`` / ``$`` changes.

    Args:
        pattern: a compiled regular expression (``re.Pattern``).

    Returns:
        A new compiled pattern with the same source and flags plus MULTILINE.
    """
    # Recompiling from ``pattern.pattern`` alone would silently drop flags that
    # were supplied through the ``flags`` argument; carry them over explicitly.
    return re.compile(pattern.pattern, pattern.flags | re.MULTILINE)

# MULTILINE variant of the section-header pattern: '^' then matches at the
# start of every line, which is needed when searching a whole file's contents.
modified_pattern = add_multiline_flag(pattern)


# Function to check if the pattern exists in the file
def check_pattern_in_file(file_path):
    """Print a notice when no section header occurs anywhere in ``file_path``.

    The file is read in one go, using the encoding reported by
    ``detect_encoding`` (undecodable bytes are replaced), and searched with
    ``modified_pattern``. Nothing is printed when a header is found.
    """
    with open(file_path, 'r', encoding=detect_encoding(file_path), errors='replace') as handle:
        contents = handle.read()
        header_found = modified_pattern.search(contents) is not None
        if header_found:
            return
        print(f"Pattern not found in {file_path}.")

# Function to recursively search for .txt files and check the pattern
def check_pattern_in_directory(directory):
    """Run ``check_pattern_in_file`` on every ``.txt`` file under ``directory``.

    The directory tree is walked recursively; files whose names do not end in
    ``.txt`` are ignored.
    """
    for folder, _subfolders, file_names in os.walk(directory):
        for file_name in file_names:
            if not file_name.endswith('.txt'):
                continue
            check_pattern_in_file(os.path.join(folder, file_name))

# Directory path
# NOTE(review): absolute, machine-specific path; this rebinds the module-level
# name `directory` that the earlier processing cell also assigns.
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1'

# Check pattern in all .txt files in the directory and subdirectories
check_pattern_in_directory(directory)

# TODO: look at Birmingham and Chickasaw


Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AR_Rogers.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AZ_Tolleson.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\CA_Needles.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\CA_Ojai.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\CT_Ashford.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\FL_Marianna.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\GA_Donalsonville.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\GA_Fayetteville.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\GA_Jonesboro.txt.