In [None]:
import re
import pandas as pd
from PyPDF2 import PdfReader
from pathlib import Path
import pdb

In [None]:


# Tax years covered by this script and the source PDF holding each year's rates.
years = list(range(2020, 2026))
_rates_dir = "/Users/nicolasroever/Dropbox/Promotion/LVT/landvaluetax/src/landvaluetax/data/lvt_rates"
# Maps year -> absolute path (as a plain string) of that year's rate PDF.
file_paths = {yr: f"{_rates_dir}/maamaksumaarad-{yr}.pdf" for yr in years}

# Process each PDF and convert to CSV

# Row layouts, one per PDF format: (compiled line pattern, CSV column names in
# capture-group order). Compiled once here rather than once per file.
# NOTE(review): the agricultural column is 'agricultural_rate' up to 2023 but
# 'agricultural_land_rate' from 2024 on; names are kept as-is so the CSV
# schema does not change -- confirm whether downstream code wants them unified.
_LAYOUT_2020 = (
    # 2020 includes municipality code
    re.compile(r'^\s*(\d+)\s+(.+?)\s{2,}(\d+,\d+|\d+)\s+(\d+,\d+|\d+)\s*$'),
    ('municipality_code', 'municipality_name', 'general_rate', 'agricultural_rate'),
)
_LAYOUT_2021_2023 = (
    # 2021-2023 no code column
    re.compile(r'^\s*(.+?)\s{2,}(\d+,\d+|\d+)\s+(\d+,\d+|\d+)\s*$'),
    ('municipality_name', 'general_rate', 'agricultural_rate'),
)
_LAYOUT_2024_ON = (
    # 2024-2025 with three land-use categories
    re.compile(r'^\s*(.+?)\s{2,}(\d+(?:,\d+)?)\s+(\d+(?:,\d+)?)\s+(\d+(?:,\d+)?)\s*$'),
    ('municipality_name', 'general_rate', 'agricultural_land_rate', 'other_purpose_land_rate'),
)

# Columns copied verbatim; every other captured column is a rate whose
# decimal comma is converted to a decimal point.
_TEXT_COLUMNS = frozenset({'municipality_code', 'municipality_name'})


def _layout_for(year):
    """Return the (pattern, columns) layout used by the PDF of *year*."""
    if year == 2020:
        return _LAYOUT_2020
    if year <= 2023:
        return _LAYOUT_2021_2023
    return _LAYOUT_2024_ON


def _parse_rate_lines(lines, year):
    """Turn extracted PDF text lines into a list of rate records.

    Lines that do not match the layout for *year* are skipped. Each record
    maps the layout's column names to the captured strings (rates with ','
    replaced by '.') and carries a trailing 'year' entry.
    """
    pattern, columns = _layout_for(year)
    records = []
    for line in lines:
        m = pattern.match(line)
        if m is None:
            continue
        record = {
            column: value if column in _TEXT_COLUMNS else value.replace(',', '.')
            for column, value in zip(columns, m.groups())
        }
        record['year'] = year
        records.append(record)
    return records


for year in years:
    reader = PdfReader(file_paths[year])

    # Extract each page exactly once (extraction is the expensive step) and
    # drop pages that yield no text.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(page_text for page_text in page_texts if page_text)
    lines = text.splitlines()
    records = _parse_rate_lines(lines, year)

    if not records:
        # An empty CSV is still written below, but say so instead of failing silently.
        print(f"Warning: no rate rows recognised in {file_paths[year]}")

    # Create DataFrame and save CSV
    df = pd.DataFrame(records)
    output_path = f"land_tax_{year}.csv"
    df.to_csv(output_path, index=False)

# Print output file locations.
# The CSVs are written relative to the current working directory, so report
# their resolved real paths rather than a hard-coded directory.
print("CSV files created for land tax rates:")
for year in years:
    print(Path(f"land_tax_{year}.csv").resolve())