### Preliminary Data Wrangling. The objective of this file is to connect the REF (reference) files with the raw data downloads.
* Note: This should work for 2020 - 2023 data

Importing necessary packages 

In [None]:
#Importing necessary packages
import pandas as pd
import os 
import numpy as np
import warnings
warnings.simplefilter("ignore")

Loading all raw data files and REF files by storing them in dictionaries (HELPER FUNCTION)

In [27]:
def load_data(directory, year):
    """
    Reads all CSV files in the specified directory into a dictionary (RAWDATA)
    and loads all sheets from one Excel workbook into another dictionary (REF).

    Side effect: the process working directory is changed to ``directory``
    and is left there when the function returns.

    Args:
        directory (str): The path to the directory containing the raw data files.
        year (int or str): The year appended to every REF key
            ("<sheet>_ref<year>"). RAWDATA keys do not contain the year.

    Returns:
        tuple: (RAWDATA, REF)
            RAWDATA maps the lowercased part of each CSV file name before its
            first "." to the DataFrame read from that file.
            REF maps "<sheet>_ref<year>" to the DataFrame of that sheet; it is
            an empty dict when the directory holds no usable Excel workbook.
    """

    # Change to the specified directory (kept as-is: later relative paths in
    # the calling code may depend on this working directory).
    os.chdir(directory)

    # List the directory once; sorting makes the result independent of the
    # arbitrary order os.listdir() returns on a given platform.
    filenames = sorted(os.listdir())

    # Store all raw CSV data in a dictionary keyed by lowercased base file name
    csv_files = [f for f in filenames if f.endswith('.csv')]
    RAWDATA = {file.split('.')[0].lower(): pd.read_csv(file) for file in csv_files}

    # Look for an Excel workbook. Names starting with "~$" are the lock files
    # Office creates while a workbook is open; they are not readable workbooks.
    excel_files = [
        f for f in filenames
        if f.endswith(('.xlsx', '.xls')) and not f.startswith('~$')
    ]

    REF = {}
    if excel_files:
        file_path = excel_files[0]  # Alphabetically first workbook found
        # Open the workbook once, parse every sheet from that single handle,
        # and close it on exit instead of re-reading the file for each sheet.
        with pd.ExcelFile(file_path) as workbook:
            REF = {
                f"{sheet}_ref{year}": workbook.parse(sheet_name=sheet)
                for sheet in workbook.sheet_names
            }

    # Return both dictionaries
    return RAWDATA, REF

# Load the 2023 raw CSV files and the 2023 reference workbook sheets.
rawdata_23, ref_23 = load_data(
    directory=r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\Data_Scraping\Raw Data\raw_data2023",
    year=2023,
)

Mapping the column IDs in the raw data to the actual column names listed in the REF file

In [29]:
def rename_columns_using_ref(rawdata, ref):
    """
    Renames columns in each DataFrame in rawdata using the corresponding mapping found in ref.

    A raw DataFrame is paired with the first ref key that starts with the
    text of its own key before the first underscore (case-insensitive).
    Raw DataFrames with no matching ref key are left out of the result.
    The input DataFrames are not modified.

    Args:
        rawdata (dict): Dictionary containing raw DataFrames with keys as filenames.
        ref (dict): Dictionary containing reference DataFrames with keys as filenames.
            In each reference DataFrame the second column holds the column ID
            and the third column holds the actual column name.

    Returns:
        dict: Dictionary containing renamed DataFrames, keyed by the original rawdata keys.
    """

    updated_data = {}  # Dictionary to store updated DataFrames

    for raw_key, raw_df in rawdata.items():
        # Extract base name before the first underscore (_)
        base_name = raw_key.split("_")[0].lower()

        # Find the matching key in REF (case-insensitive)
        matching_key = next((key for key in ref if key.lower().startswith(base_name)), None)
        if not matching_key:
            continue

        # Extract mapping from REF (second column = column ID, third column = actual column name)
        ref_df = ref[matching_key]
        column_ids = ref_df.iloc[:, 1]
        column_names = ref_df.iloc[:, 2]

        # Skip rows where either cell is blank: a missing name would otherwise
        # rename a real column to NaN, and a missing ID maps nothing useful.
        column_mapping = {
            column_id: column_name
            for column_id, column_name in zip(column_ids, column_names)
            if pd.notna(column_id) and pd.notna(column_name)
        }

        # Rename columns in RAWDATA DataFrame and store it with the original key
        updated_data[raw_key] = raw_df.rename(columns=column_mapping)

    # Print confirmation
    print(f"Renamed and stored {len(updated_data)} DataFrames.")

    return updated_data

# Example usage: apply the 2023 reference column names to the 2023 raw data.
data_23 = rename_columns_using_ref(rawdata=rawdata_23, ref=ref_23)


Renamed and stored 5 DataFrames.


Old code: the same steps written inline, before they were turned into functions.

In [None]:
# All relative paths below are resolved against the 2023 raw data directory.
os.chdir(r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\Data_Scraping\Raw Data\raw_data2023")

### Storing all raw data files in a dictionary ###
# Get a list of all CSV files in the directory (these are the raw data files)
csv_files = [f for f in os.listdir() if f.endswith('.csv')]

# Read each CSV file and store it under its lowercased base file name
RAWDATA_23 = {file.split('.')[0].lower(): pd.read_csv(file) for file in csv_files}


### Storing all ref files in a dictionary ###
# Define file path for the Excel file
file_path = r"TAPR_district_adv_2023.xlsx"

# Open the workbook once, read every sheet from that single handle, and close
# it on exit (previously the file was re-opened per sheet and never closed).
with pd.ExcelFile(file_path) as workbook:
    # Get all sheet names
    sheetname_ref = workbook.sheet_names

    # Read all sheets into a dictionary with "_ref2023" appended to each key
    REF_23 = {sheet + "_ref2023": workbook.parse(sheet_name=sheet) for sheet in sheetname_ref}

# Display the updated dictionary keys
print("Updated sheet names:", REF_23.keys())


# Collect, per raw file, the shape of the raw DataFrame and of its reference sheet
shape_comparison = {}

for raw_key, raw_df in RAWDATA_23.items():
    # The text before the first underscore identifies the reference sheet
    base_name = raw_key.split("_")[0]

    # First REF_23 key starting with that text (case-insensitive), if any
    candidates = [key for key in REF_23 if key.lower().startswith(base_name.lower())]
    matching_key = candidates[0] if candidates else None

    # The reference shape stays None when no sheet matches
    if matching_key:
        ref_df = REF_23[matching_key]
        ref_shape = ref_df.shape
    else:
        ref_shape = None

    shape_comparison[raw_key] = {
        "RAWDATA_23_shape": raw_df.shape,
        "REF_23_shape": ref_shape,
    }

# One row per raw file, for easy viewing
comparison_df = pd.DataFrame.from_dict(shape_comparison, orient='index')
print(comparison_df)

DATA23 = {}  # Dictionary to store updated DataFrames

for raw_key, raw_df in RAWDATA_23.items():
    # Extract base name before the first underscore (_)
    base_name = raw_key.split("_")[0]

    # Find the matching key in REF_23 (case-insensitive)
    matching_key = next((key for key in REF_23 if key.lower().startswith(base_name.lower())), None)

    # Raw files without a matching reference sheet are not stored in DATA23
    if not matching_key:
        continue

    # Extract mapping from REF_23 (second column = column ID, third column = actual column name)
    ref_df = REF_23[matching_key]

    # Skip rows where either cell is blank: a missing name would otherwise
    # rename a real column to NaN.
    column_mapping = {
        column_id: column_name
        for column_id, column_name in zip(ref_df.iloc[:, 1], ref_df.iloc[:, 2])
        if pd.notna(column_id) and pd.notna(column_name)
    }

    # Rename columns in RAWDATA DataFrame
    renamed_df = raw_df.rename(columns=column_mapping)

    # Store in DATA23 with the original key
    DATA23[raw_key] = renamed_df

# Print confirmation
print(f"Renamed and stored {len(DATA23)} DataFrames in DATA23.")


