This notebook reads in the cleaned district datasets and merges them into a single dataset for each year.

In [None]:
#Loading in necessary packages 
import pandas as pd
import os

In [None]:
def load_cleaned_datasets(directory: str) -> dict:
    """
    Loads the CSV files from the given directory into a dictionary of pandas DataFrames.

    - Skips any file whose name (without .csv) contains "ref_". These files are
      filtered out before reading, so a malformed reference file cannot break the load.
    - Reads files in sorted filename order, because os.listdir returns entries in
      arbitrary order and the key order of the result decides which dataset comes first.
    - Places any dataset whose name contains "district_type" at the end of the
      dictionary, keeping their relative order. (Helps with merging)

    Parameters:
        directory (str): Path to the directory containing the CSV files.

    Returns:
        dict: A dictionary where keys are filenames (without .csv) and values are pandas DataFrames.
    """
    # Sorted so the result does not depend on the file system's listing order
    csv_files = sorted(f for f in os.listdir(directory) if f.endswith(".csv"))

    dfs = {}
    district_type_dfs = {}  # held back so they can be appended after everything else

    for filename in csv_files:
        name = os.path.splitext(filename)[0]

        # Reference tables are not part of the merged dataset; do not even read them
        if "ref_" in name:
            continue

        frame = pd.read_csv(os.path.join(directory, filename))
        if "district_type" in name:
            district_type_dfs[name] = frame
        else:
            dfs[name] = frame

    # Dicts preserve insertion order, so these end up last
    dfs.update(district_type_dfs)

    return dfs

In [55]:
def _count_all_na_columns(df):
    """Returns the number of columns in df whose values are all missing."""
    return int(df.isna().all().sum())


def _as_numeric_key(series):
    """
    Returns series converted to a numeric dtype, or None when the conversion
    would lose information (i.e. some non-missing value is not a number).
    """
    stripped = [v.strip() if isinstance(v, str) else v for v in series.tolist()]
    numeric = pd.to_numeric(pd.Series(stripped, index=series.index, dtype=object), errors="coerce")

    # errors="coerce" turns unparseable values into NaN; any new NaN means data loss
    if not (numeric.isna().to_numpy() == series.isna().to_numpy()).all():
        return None
    return numeric


def _align_merge_keys(left, right, left_key, right_key):
    """
    Returns (left, right) with the join keys made type-compatible.

    pandas refuses to merge a numeric key with a text key. When exactly one side is
    numeric, the other side is converted to numeric if that is lossless. In every
    other situation the frames are returned unchanged, so DataFrame.merge behaves
    (or fails) exactly as it would have without this step. Inputs are never mutated.
    """
    if left_key not in left.columns or right_key not in right.columns:
        return left, right  # let DataFrame.merge report the missing key

    left_is_numeric = pd.api.types.is_numeric_dtype(left[left_key])
    right_is_numeric = pd.api.types.is_numeric_dtype(right[right_key])
    if left_is_numeric == right_is_numeric:
        return left, right

    if left_is_numeric:
        converted = _as_numeric_key(right[right_key])
        if converted is not None:
            right = right.assign(**{right_key: converted})
            print(f"  - Converted '{right_key}' of df_to_be_merged to numeric to match '{left_key}'")
    else:
        converted = _as_numeric_key(left[left_key])
        if converted is not None:
            left = left.assign(**{left_key: converted})
            print(f"  - Converted '{left_key}' of merge_df to numeric to match '{right_key}'")

    return left, right


def merge_data_frames(dfs):
    """
    Merges multiple pandas DataFrames stored in a dictionary.

    - Uses the first DataFrame as the base for left joins.
    - Drops shared columns (except for DISTRICT_id) from each later DataFrame before merging.
    - If a DataFrame key contains "district_type", it is joined with "DISTRICT_id" on the
      left and "District Number" on the right; otherwise the join is on "DISTRICT_id".
    - If one join key is numeric and the other is text, the text side is converted to
      numeric when every value parses as a number. If it does not, pandas raises its
      usual ValueError about merging object and numeric columns.

    Parameters:
        dfs (dict): A dictionary where keys are dataset names and values are pandas DataFrames.

    Returns:
        pd.DataFrame: The merged DataFrame.
    """

    # Descriptive columns repeated across datasets (DISTRICT_id is kept as the join key)
    shared_columns = ['DISTRICT', 'DISTNAME', 'COUNTY', 'CNTYNAME', 'REGION', 'DFLCHART', 'DFLALTED', 'D_RATING', 'OUTCOME', 'ASVAB_STATUS', 'asvab_status',
                      'DAD_POST', 'District Name']

    named_frames = list(dfs.items())

    # Using the first DataFrame as the base
    merge_df = named_frames[0][1]
    print(f"Initial merge_df shape: {merge_df.shape}")
    print(f"Initial Merge NA columns: {_count_all_na_columns(merge_df)}")

    # Left joining the remaining DataFrames on the base DataFrame
    for key, frame in named_frames[1:]:
        df_to_be_merged = frame.drop(columns=shared_columns, errors='ignore')

        print(f"\nMerging {key}:")
        print(f"  - Shape of merge_df before merge: {merge_df.shape}")
        print(f"  - Shape of df_to_be_merged: {df_to_be_merged.shape}")
        print(f"  - Number of all NA columns in df_to_be_merged: {_count_all_na_columns(df_to_be_merged)}")

        if "district_type" not in key:
            # Default left join
            merge_df, df_to_be_merged = _align_merge_keys(merge_df, df_to_be_merged, "DISTRICT_id", "DISTRICT_id")
            merge_df = merge_df.merge(df_to_be_merged, on="DISTRICT_id", how="left")
            print(f"  - Merged with LEFT join. New shape: {merge_df.shape}")
            # This count is for the accumulated frame, not the frame just merged in
            print(f"  - Number of all NA columns in merge_df: {_count_all_na_columns(merge_df)}")

        else:
            # The district type file names its key "District Number" instead of "DISTRICT_id"
            merge_df, df_to_be_merged = _align_merge_keys(merge_df, df_to_be_merged, "DISTRICT_id", "District Number")
            merge_df = merge_df.merge(df_to_be_merged, left_on="DISTRICT_id", right_on="District Number", how="left")
            print(f"  - Merged with LEFT join on 'DISTRICT_id' and 'District Number'. New shape: {merge_df.shape}")

    return merge_df


In [57]:
# 2020: load every cleaned district file, then left-join them into one frame.
clean_dir_2020 = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.3Data2020\District\clean_data"
df_2020 = load_cleaned_datasets(clean_dir_2020)  # dict: dataset name -> DataFrame
merged_df_2020 = merge_data_frames(df_2020)

Initial merge_df shape: (1202, 879)
Initial Merge NA columns: 23

Merging distperf_2020_clean:
  - Shape of merge_df before merge: (1202, 879)
  - Shape of df_to_be_merged: (1202, 4169)
  - Number of all NA columns in df_to_be_merged: 6
  - Merged with LEFT join. New shape: (1202, 5047)
  - Number of all NA columns in df_to_be_merged: 29

Merging distprof_2020_clean:
  - Shape of merge_df before merge: (1202, 5047)
  - Shape of df_to_be_merged: (1202, 395)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1202, 5441)
  - Number of all NA columns in df_to_be_merged: 29

Merging diststaar1_2020_clean:
  - Shape of merge_df before merge: (1202, 5441)
  - Shape of df_to_be_merged: (1202, 2143)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1202, 7583)
  - Number of all NA columns in df_to_be_merged: 29

Merging district_type2020_clean:
  - Shape of merge_df before merge: (1202, 7583)
  - Shape of df_to_be_me

In [58]:
# 2021: keep the loaded datasets and the merged frame in named variables, as the
# 2020 cell does, so the result stays available after the cell has run.
df_2021 = load_cleaned_datasets(r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.4Data2021\District\clean_data")
merged_df_2021 = merge_data_frames(df_2021)

Initial merge_df shape: (1204, 952)
Initial Merge NA columns: 35

Merging distperf1_2021_clean:
  - Shape of merge_df before merge: (1204, 952)
  - Shape of df_to_be_merged: (1204, 2003)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1204, 2954)
  - Number of all NA columns in df_to_be_merged: 35

Merging distperf2_2021_clean:
  - Shape of merge_df before merge: (1204, 2954)
  - Shape of df_to_be_merged: (1204, 1289)
  - Number of all NA columns in df_to_be_merged: 10
  - Merged with LEFT join. New shape: (1204, 4242)
  - Number of all NA columns in df_to_be_merged: 45

Merging distprof_2021_clean:
  - Shape of merge_df before merge: (1204, 4242)
  - Shape of df_to_be_merged: (1204, 417)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1204, 4658)
  - Number of all NA columns in df_to_be_merged: 45

Merging diststaar1_2021_clean:
  - Shape of merge_df before merge: (1204, 4658)
  - Shape of df_to_be_mer

ValueError: You are trying to merge on object and int64 columns for key 'DISTRICT_id'. If you wish to proceed you should use pd.concat

In [59]:
# 2022: keep the loaded datasets and the merged frame in named variables, as the
# 2020 cell does, so the result stays available after the cell has run.
df_2022 = load_cleaned_datasets(r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.5Data2022\District\clean_data")
merged_df_2022 = merge_data_frames(df_2022)

Initial merge_df shape: (1207, 953)
Initial Merge NA columns: 32

Merging distperf1_2022_clean:
  - Shape of merge_df before merge: (1207, 953)
  - Shape of df_to_be_merged: (1207, 2031)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1207, 2983)
  - Number of all NA columns in df_to_be_merged: 32

Merging distperf2_2022_clean:
  - Shape of merge_df before merge: (1207, 2983)
  - Shape of df_to_be_merged: (1207, 1274)
  - Number of all NA columns in df_to_be_merged: 8
  - Merged with LEFT join. New shape: (1207, 4256)
  - Number of all NA columns in df_to_be_merged: 40

Merging distprof_2022_clean:
  - Shape of merge_df before merge: (1207, 4256)
  - Shape of df_to_be_merged: (1207, 428)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1207, 4683)
  - Number of all NA columns in df_to_be_merged: 40

Merging diststaar1_2022_clean:
  - Shape of merge_df before merge: (1207, 4683)
  - Shape of df_to_be_merg

ValueError: You are trying to merge on object and int64 columns for key 'DISTRICT_id'. If you wish to proceed you should use pd.concat

In [60]:
# 2023: keep the loaded datasets and the merged frame in named variables, as the
# 2020 cell does, so the result stays available after the cell has run.
df_2023 = load_cleaned_datasets(r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.6Data2023\District\clean_data")
merged_df_2023 = merge_data_frames(df_2023)

Initial merge_df shape: (1209, 950)
Initial Merge NA columns: 37

Merging distperf1_2023_clean:
  - Shape of merge_df before merge: (1209, 950)
  - Shape of df_to_be_merged: (1209, 2003)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1209, 2952)
  - Number of all NA columns in df_to_be_merged: 37

Merging distperf2_2023_clean:
  - Shape of merge_df before merge: (1209, 2952)
  - Shape of df_to_be_merged: (1209, 1289)
  - Number of all NA columns in df_to_be_merged: 4
  - Merged with LEFT join. New shape: (1209, 4240)
  - Number of all NA columns in df_to_be_merged: 41

Merging distprof_2023_clean:
  - Shape of merge_df before merge: (1209, 4240)
  - Shape of df_to_be_merged: (1209, 417)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1209, 4656)
  - Number of all NA columns in df_to_be_merged: 41

Merging diststaar1_2023_clean:
  - Shape of merge_df before merge: (1209, 4656)
  - Shape of df_to_be_merg

ValueError: You are trying to merge on object and int64 columns for key 'DISTRICT_id'. If you wish to proceed you should use pd.concat