In [1]:
import os
import openpyxl
import xlrd
import pandas as pd
import json
from dbfread import DBF

In [2]:
# Top-level folder that is searched (recursively) for sales files
root = 'data/latest_sales_data'

# Loaded sales dataframes, keyed by the name of the folder each file was found in
dataframes = dict()

# Define a function to load a file into a dataframe
def load_file_to_df(path, filename):
    """Load one data file into a pandas DataFrame, dispatching on its extension.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the file inside ``path``. The extension is matched
        case-insensitively, so ``SALES.XLSX`` or ``Data.CSV`` are loaded too.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or ``None`` when the extension is not one of
        ``.xlsx``, ``.xls``, ``.csv``, ``.json``, ``.geojson`` or ``.dbf``.
    """
    full_path = os.path.join(path, filename)
    # Match on the lower-cased name so upper/mixed-case extensions are not
    # silently treated as unsupported.
    lowered = filename.lower()

    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(full_path)
    elif lowered.endswith('.csv'):
        # on_bad_lines='skip' drops malformed rows instead of raising.
        return pd.read_csv(full_path, on_bad_lines='skip')
    elif lowered.endswith(('.json', '.geojson')):
        with open(full_path) as f:
            data = json.load(f)
        return pd.json_normalize(data)
    elif lowered.endswith('.dbf'):
        dbf = DBF(full_path)
        return pd.DataFrame(iter(dbf))
    else:
        return None

# Walk through each directory
# The loop variables are deliberately NOT named `root`: reusing that name would
# overwrite the `root` setting with the last directory visited by os.walk.
for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        # Ignore metadata files
        if 'metadata' in filename.lower():
            continue

        print(f'Loading {filename} from {dirpath}')
        df = load_file_to_df(dirpath, filename)
        if df is None:
            # The loader returned None (unsupported file type): nothing to store.
            continue

        # Use the directory name as the key
        key = os.path.basename(dirpath)
        # If key already exists, append the new dataframe
        if key in dataframes:
            dataframes[key] = pd.concat([dataframes[key], df])
        else:
            dataframes[key] = df

Loading Alachua.xlsx from data/latest_sales_data/Alachua County
Loading 2024 Sales.xlsx from data/latest_sales_data/Baker County 
Loading 2023 Sales.xlsx from data/latest_sales_data/Baker County 
Loading Bay County.xlsx from data/latest_sales_data/Bay County


  warn("""Cannot parse header or footer so it will be ignored""")


Loading Bradford.csv from data/latest_sales_data/Bradford County
Loading 2024 Sales Data.xlsx from data/latest_sales_data/Calhoun County
Loading 2023 Sales Data.xlsx from data/latest_sales_data/Calhoun County
Loading Charlotte County.xls from data/latest_sales_data/Charlotte County
Loading Citrus2024.xlsx from data/latest_sales_data/Citrus County 
Loading Citrus2023.xlsx from data/latest_sales_data/Citrus County 
Loading 2024 Sales YTD.xlsx from data/latest_sales_data/Clay County 
Loading Collier.csv from data/latest_sales_data/Collier County
Loading Columbia 2024 .xlsx from data/latest_sales_data/Columbia County 
Loading Colombia 2023 .xlsx from data/latest_sales_data/Columbia County 
Loading Desoto 2024.xlsx from data/latest_sales_data/DeSoto County 
Loading Desoto 2023 .xlsx from data/latest_sales_data/DeSoto County 
Loading Dixie.xlsx from data/latest_sales_data/Dixie County 
Loading Escambia.csv from data/latest_sales_data/Escambia County 
Loading Flagler.xlsx from data/latest_sal

In [3]:
# Print every key followed by a column / dtype / non-null-count summary of its dataframe
for key, frame in dataframes.items():
    print(key)
    frame.info(show_counts=True)

Alachua County
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4293 entries, 0 to 4292
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Parcel ID                4293 non-null   object 
 1   Address                  3958 non-null   object 
 2   Sale Date                4293 non-null   object 
 3   Sale Price               4293 non-null   object 
 4   Qualified Sales          4293 non-null   object 
 5   Reason                   4293 non-null   object 
 6   Book                     4293 non-null   int64  
 7   Page                     4293 non-null   object 
 8   Instrument               4293 non-null   object 
 9   Acres                    4293 non-null   float64
 10  Property Use             4283 non-null   object 
 11  Tax District             4293 non-null   object 
 12  Just Value               4293 non-null   object 
 13  Year  Built              3276 non-null   float64
 14  Exterior 

In [4]:
# Top-level folder of the sales data set
root = 'data/latest_sales_data'

# Loaded metadata dataframes, keyed by the name of the folder each file was found in
metadata_dfs = dict()

# Define a function to load a file into a dataframe
def load_metadata_file_to_df(path, filename):
    """Load one metadata file into a pandas DataFrame, dispatching on its extension.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the file inside ``path``. The extension is matched
        case-insensitively, so ``METADATA.XLSX`` is loaded as well.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or ``None`` when the extension is not one of
        ``.xlsx``, ``.xls``, ``.csv``, ``.json``, ``.geojson`` or ``.dbf``.
    """
    full_path = os.path.join(path, filename)
    # Match on the lower-cased name so upper/mixed-case extensions are not
    # silently treated as unsupported.
    lowered = filename.lower()

    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(full_path)
    elif lowered.endswith('.csv'):
        # Unlike the sales loader, malformed CSV rows raise here (no on_bad_lines).
        return pd.read_csv(full_path)
    elif lowered.endswith(('.json', '.geojson')):
        with open(full_path) as f:
            data = json.load(f)
        return pd.json_normalize(data)
    elif lowered.endswith('.dbf'):
        dbf = DBF(full_path)
        return pd.DataFrame(iter(dbf))
    else:
        return None

# Walk through each directory
# Walk the configured `root` folder rather than '.', so that metadata files
# lying elsewhere under the working directory are not merged in by accident
# (entries are keyed only by folder basename). The loop variables are not
# named `root`, so the setting is not overwritten by os.walk's current path.
for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        # Only load metadata files
        if 'metadata' not in filename.lower():
            continue

        df = load_metadata_file_to_df(dirpath, filename)
        if df is None:
            # The loader returned None (unsupported file type): nothing to store.
            continue

        # Use the directory name as the key
        key = os.path.basename(dirpath)
        # If key already exists, append the new dataframe
        if key in metadata_dfs:
            metadata_dfs[key] = pd.concat([metadata_dfs[key], df])
        else:
            metadata_dfs[key] = df

In [5]:
# Print every key followed by a column / dtype / non-null-count summary of its metadata dataframe
for key, meta_frame in metadata_dfs.items():
    print(key)
    meta_frame.info(show_counts=True)

Alachua County
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Original Column Name   3 non-null      object
 1   Effective Column Name  3 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes
Baker County 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Original Column Name   3 non-null      object
 1   Effective Column Name  3 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes
Bay County
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Original Column Name   3 non-null      object
 1   Effe

In [6]:
# Iterate over each key in the dataframes dictionary.
# For every key, keep only the columns listed in that key's metadata and rename
# them from their 'Original Column Name' to their 'Effective Column Name'.
for key in dataframes:
    # Get the current dataframe and its corresponding metadata dataframe
    df = dataframes[key]
    metadata_df = metadata_dfs.get(key)
    if metadata_df is None:
        # A key without any metadata used to raise an uncaught KeyError here and
        # abort the loop half-way; report it like other metadata problems instead.
        print(f"Error in {key} metadata: no metadata was loaded for this key. Columns were left unchanged.")
        continue

    # Create a dictionary mapping original column names to effective column names
    # Use strip to remove leading and trailing spaces
    original_names = metadata_df['Original Column Name'].str.strip()
    effective_names = metadata_df['Effective Column Name'].str.strip()
    rename_dict = dict(zip(original_names, effective_names))

    # Select only the columns that are in the rename_dict keys and rename them
    try:
        # An explicit list is used as the column selector instead of a dict view.
        dataframes[key] = df[list(rename_dict)].rename(columns=rename_dict)
    except KeyError as e:
        # NOTE(review): on failure the dataframe is kept as loaded (all original
        # columns, original names), so it still takes part in later steps.
        print(f"Error in {key} metadata: {str(e)}. Please check the 'Original Column Name' in the metadata file.")

In [7]:
# Stack every dataframe held in the dictionary into one table with a fresh 0..n-1 index
all_frames = list(dataframes.values())
combined_df = pd.concat(all_frames, ignore_index=True)

In [8]:
# Summarize the combined table: column names, dtypes and non-null counts per column
combined_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2470319 entries, 0 to 2470318
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   ParcelID    127467 non-null   object
 1   Sale Date   2441977 non-null  object
 2   Sale Price  2470319 non-null  object
 3   PIN         2342725 non-null  object
dtypes: object(4)
memory usage: 75.4+ MB


In [9]:
# Write the combined table to a CSV file; index=False leaves the row index out of the file
combined_df.to_csv('data/compiled_latest_sales_data.csv', index=False)