**Process (2639 files)**

1. open text file

2. record the section header pattern number.number.number

3. if a section (A) is found within another section (B). A will be copied into B.

In [34]:
import os
import re
import pickle

pkl_name= "grouped_text.pkl"
#clear the file
with open(pkl_name, 'wb') as pkl_file:
    pass  # This will clear the file


def extract_file_name(file_path):
    base_name = os.path.basename(file_path)
    file_name, _ = os.path.splitext(base_name)
    return file_name

import chardet

def detect_encoding(filepath):
    with open(filepath, 'rb') as file:
        raw_data = file.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']
        return encoding

def clean_line(line):
    line = re.sub(r'[^\x00-\x7f]', r' ', line)
    line = line.lower()
    line = re.sub(r'[",!?*\[\]]', '', line)
    line = re.sub(r';', ' ', line)
    line = re.sub(r'\\', '', line)
    return line



def text_collect(filename):
    pattern = re.compile(r"^\d+\.\d+\.\d+|^sec\. \d+-\d+\.")
    sections = []
    current_section = None
    collected_text = []

    with open(filename, 'r',encoding=detect_encoding(filename),errors='replace') as file:
        for line in file:
            line = clean_line(line)
            match = pattern.match(line)
            if match:
                if current_section:
                    sections.append({
                        "Section": current_section,
                        "Text": ' '.join(collected_text),
                        "Zoning Location": extract_file_name(filename)
                    })
                current_section = match.group()
                collected_text = [line[len(current_section):].strip()]
            elif current_section:
                collected_text.append(line.strip())

        if current_section:
            sections.append({
                "Section": current_section,
                "Text": ' '.join(collected_text),
                "Zoning Location": extract_file_name(filename)
            })

    add_entries_to_dataset(pkl_name, current_section)


import os
import glob

def process_files(directory):
    # Use glob to find all .txt files in the directory and subdirectories
    txt_files = glob.glob(os.path.join(directory, '**', '*.txt'), recursive=True)
    for file_path in txt_files:
        print(f"Processing {file_path}")
        text_collect(file_path)

# Directory containing the text files
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0'

# Process all text files in the directory
process_files(directory)


Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AL_Auburn.txt
New entries added and dataset saved to grouped_text.pkl
Processing C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\CA_SanLuisObispo.txt
New entries added and dataset saved to grouped_text.pkl


In [36]:
import pickle
from pprint import pprint

# Assuming pkl_name is defined somewhere in your code
# Load the .pkl file
with open(pkl_name, 'rb') as file:
    data = pickle.load(file)

# Print the type of the data
print(f"Type of data: {type(data)}")

# Pretty-print the data for better readability
pprint(data)

Type of data: <class 'list'>
['s',
 'e',
 'c',
 '.',
 ' ',
 '2',
 '3',
 '-',
 '5',
 '6',
 '.',
 '1',
 '7',
 '.',
 '1',
 '6',
 '0',
 '.',
 '0',
 '1',
 '0']


In [26]:
# Open the file in write mode to clear its contents
with open('dummy_dataset.pkl', 'wb') as file:
    pass  # This will create an empty file

In [27]:
import pickle

import os
import pickle

def add_entries_to_dataset(file_path, new_entries):
    # Step 1: Check if the file is empty or does not exist
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        data = []
    else:
        # Step 2: Load the existing dataset from the .pkl file
        with open(file_path, 'rb') as file:
            data = pickle.load(file)

    # Step 3: Add new entries to the dataset
    data.extend(new_entries)

    # Step 4: Save the updated dataset back to the .pkl file
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)

    print(f"New entries added and dataset saved to {file_path}")

# Example usage
new_entries = [
    {
        "Section": "Summary",
        "Text": "This is a dummy text for the summary section.",
        "Zoning Location": "Zone C"
    },
    {
        "Section": "Introduction",
        "Text": "This is a dummy text for the introduction section.",
        "Zoning Location": "Zone A"
    }
]
add_entries_to_dataset('dummy_dataset.pkl', new_entries)

New entries added and dataset saved to dummy_dataset.pkl


In [29]:
import pickle

def get_unique_zoning_locations(file_path):
    # Step 1: Load the dataset from the .pkl file
    with open(file_path, 'rb') as file:
        dummy_data = pickle.load(file)

    # Step 2: Extract the "Zoning Location" values
    zoning_locations = [entry["Zoning Location"] for entry in dummy_data]

    # Step 3: Find unique entries using a set
    unique_zoning_locations = set(zoning_locations)

    return unique_zoning_locations

# Example usage
file_path = "dummy_dataset.pkl"
unique_locations = get_unique_zoning_locations(file_path)
print("Unique Zoning Locations:", unique_locations)

Unique Zoning Locations: {'Zone C', 'Zone A'}


In [2]:
import pickle

# Read all entries from the pickle file
entries = []
with open('grouped_text.pkl', 'rb') as pkl_file:
    while True:
        try:
            entries.append(pickle.load(pkl_file))
        except EOFError:
            break


# Extract unique Zoning Location entries
unique_zoning_locations = set()
for entry in entries:
    if isinstance(entry, dict) and 'Zoning Location' in entry:
        unique_zoning_locations.add(entry['Zoning Location'])
    elif isinstance(entry, list):
        for sub_entry in entry:
            if isinstance(sub_entry, dict) and 'Zoning Location' in sub_entry:
                unique_zoning_locations.add(sub_entry['Zoning Location'])

# Print unique Zoning Location entries
for location in unique_zoning_locations:
    print(location)

CA_SanLuisObispo
AL_Auburn


In [128]:
import pickle

# Read all entries from the pickle file
entries = []
with open('grouped_text.pkl', 'rb') as pkl_file:
    while True:
        try:
            entries.append(pickle.load(pkl_file))
        except EOFError:
            break

# Extract unique Zoning Location entries
unique_zoning_locations = set(entry['Zoning Location'] for entry in entries)

# Print unique Zoning Location entries
for location in unique_zoning_locations:
    print(location)

TypeError: list indices must be integers or slices, not str

In [44]:
import re
import os

def analyze_file(file_path):
    # Compile the regex pattern
    pattern = re.compile(r"^\d+\.\d+\.\d+|Sec\. \d+-\d+", re.MULTILINE)
    
    # Check if the file exists
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return
    
    # Open and read the file
    with open(file_path, 'r',encoding=detect_encoding(file_path),errors='replace') as file:
        content = file.read()
    
    # Search for the pattern in the file content
    if pattern.search(content):
        print(True)
    else:
        print(False)

# Example usage
file_path = r"C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AL_Auburn.txt"
analyze_file(file_path)

True


In [7]:
import os
import re

# Compile the regex pattern
pattern = re.compile(r"^\d+\.\d+\.\d+|Sec\. \d+-\d+", re.MULTILINE)

# Function to check if the pattern exists in the file
def check_pattern_in_file(file_path):
    with open(file_path, 'r',encoding=detect_encoding(file_path),errors='replace') as file:
        contents = file.read()
        if pattern.search(contents):
            print(f"Pattern found in {file_path}.")
        else:
            print(f"Pattern not found in {file_path}.")

# Function to recursively search for .txt files and check the pattern
def check_pattern_in_directory(directory):
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                check_pattern_in_file(file_path)

# Directory path
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0'

# Check pattern in all .txt files in the directory and subdirectories
check_pattern_in_directory(directory)

Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AK_Homer.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\AL_Auburn.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\set0\CA_SanLuisObispo.txt.


In [6]:
import re

# Compile the regex pattern
pattern = re.compile(r"^\d+\.\d+\.\d+|Sec\. \d+-\d+", re.MULTILINE)

def check_pattern(text):
    # Check if the pattern matches the input text
    if pattern.search(text):
        return True
    else:
        return False

# Sample input text
sample_text = "1.2.3\nSec. 4-5"

# Test the function
print(check_pattern(sample_text))  # Output: True or False based on the match

True


In [None]:
import os
import re
import pickle

# Compile the regex pattern
pattern = re.compile(r"^\d+\.\d+\.\d+", re.MULTILINE)

# Function to check if the pattern exists in the file
def check_pattern_in_file(file_path):
    with open(file_path, 'r', encoding=detect_encoding(file_path), errors='replace') as file:
        contents = file.read()
        return bool(pattern.search(contents))

# Function to recursively search for .txt files and check the pattern
def check_pattern_in_directory(directory):
    matches = []
    non_matches = []
    
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                if check_pattern_in_file(file_path):
                    print(f"Pattern found in {file_path}.")
                    matches.append(file_path)
                else:
                    print(f"Pattern not found in {file_path}.")
                    non_matches.append(file_path)
    
    # Save the lists to .pkl files
    with open('matches.pkl', 'wb') as f:
        pickle.dump(matches, f)
    
    with open('non_matches.pkl', 'wb') as f:
        pickle.dump(non_matches, f)

# Directory path
directory = r'C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1'

# Check pattern in all .txt files in the directory and subdirectories
check_pattern_in_directory(directory)

Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Anchorage.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Homer.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Kenai.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Palmer.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Petersburg.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Sitka.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AK_Yakutat.txt.
Pattern found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AL_Athens.txt.
Pattern not found in C:\Users\clint\Desktop\Trucks\Trucks\nzlud\municipal_codes_all\real\set1\AL_Auburn.txt.
Pattern not found in C:\Users\clint\Desktop\Tr