This notebook reads in the cleaned district datasets for a given year and merges them into a single yearly dataset.

In [61]:
#Loading in necessary packages 
import pandas as pd
import os

In [62]:
def load_cleaned_datasets(directory: str) -> dict:
    """
    Loads the CSV files from the given directory into a dictionary of pandas DataFrames.

    - Files are loaded in sorted filename order. os.listdir makes no ordering
      guarantee, and the merge step treats the first entry as its base table,
      so sorting keeps the result reproducible across machines.
    - Skips any file whose name contains "ref_" without reading it.
    - Places any dataset whose filename contains "district_type" at the end of the dictionary. (Helps with merging)

    Parameters:
        directory (str): Path to the directory containing the CSV files.

    Returns:
        dict: A dictionary where keys are filenames (without .csv) and values are pandas DataFrames.
    """
    regular_dfs = {}
    district_type_dfs = {}

    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".csv"):
            continue

        key = os.path.splitext(file_name)[0]

        # Reference tables are not part of the merge; skip them before the
        # (potentially expensive or failing) parse instead of after it.
        if "ref_" in key:
            continue

        frame = pd.read_csv(os.path.join(directory, file_name))

        # district_type tables are collected separately so they can go last.
        if "district_type" in key:
            district_type_dfs[key] = frame
        else:
            regular_dfs[key] = frame

    # Dicts preserve insertion order, so updating appends district_type at the end.
    regular_dfs.update(district_type_dfs)
    return regular_dfs

In [83]:
def _count_all_na_columns(df):
    """Returns the number of columns in df whose values are all NA."""
    return int(df.isna().all().sum())


def merge_data_frames(dfs):
    """
    Merges multiple pandas DataFrames stored in a dictionary.

    - Uses the first DataFrame as the base for left joins.
    - Drops shared columns (except for DISTRICT_id) from every later DataFrame before merging.
    - If a DataFrame key contains "district_type", the base's "DISTRICT_id" is
      stripped of non-digit characters and cast to int, then joined against
      that DataFrame's "District Number".
    - The input DataFrames are never modified in place.

    Parameters:
        dfs (dict): A dictionary where keys are dataset names and values are pandas DataFrames.

    Returns:
        pd.DataFrame: The merged DataFrame.

    Raises:
        ValueError: If dfs is empty.
    """
    if not dfs:
        raise ValueError("dfs must contain at least one DataFrame to merge.")

    # Identifying shared columns (excluding DISTRICT_id)
    shared_columns = ['DISTRICT', 'DISTNAME', 'COUNTY', 'CNTYNAME', 'REGION', 'DFLCHART', 'DFLALTED', 'D_RATING', 'OUTCOME', 'ASVAB_STATUS', 'asvab_status',
                      'DAD_POST', 'District Name']

    # Using the first DataFrame as the base
    remaining = iter(dfs.items())
    _, merge_df = next(remaining)
    print(f"Initial merge_df shape: {merge_df.shape}")
    print(f"Initial Merge NA columns: {_count_all_na_columns(merge_df)}")

    # Left joining the remaining DataFrames on the base DataFrame
    for key, frame in remaining:
        df_to_be_merged = frame.drop(columns=shared_columns, errors='ignore')

        print(f"\nMerging {key}:")
        print(f"  - Shape of merge_df before merge: {merge_df.shape}")
        print(f"  - Shape of df_to_be_merged: {df_to_be_merged.shape}")
        print(f"  - Number of all NA columns in df_to_be_merged: {_count_all_na_columns(df_to_be_merged)}")

        if "district_type" not in key:
            # Default left join
            merge_df = merge_df.merge(df_to_be_merged, on="DISTRICT_id", how="left")
            print(f"  - Merged with LEFT join. New shape: {merge_df.shape}")
            # This count describes the merged result, not the incoming frame.
            print(f"  - Number of all NA columns in merge_df after merge: {_count_all_na_columns(merge_df)}")

        else:
            # Join DISTRICT_id (left) against District Number (right); both
            # sides must be integers for the keys to match.
            # assign() returns a new frame, so when merge_df is still the
            # caller's base DataFrame it is not mutated.
            merge_df = merge_df.assign(
                DISTRICT_id=merge_df['DISTRICT_id'].astype(str).str.replace(r"[^\d]", "", regex=True).astype(int)
            )
            df_to_be_merged = df_to_be_merged.assign(
                **{"District Number": df_to_be_merged["District Number"].astype(int)}
            )
            merge_df = merge_df.merge(df_to_be_merged, left_on="DISTRICT_id", right_on="District Number", how="left")
            print(f"  - Merged with LEFT join on 'DISTRICT_id' and 'District Number'. New shape: {merge_df.shape}")

    return merge_df


In [None]:
def save_merged_dataframe(merged_df, output_dir, filename="merged_data.csv"):
    """
    Saves the merged DataFrame to a specified directory as a CSV file.

    The output directory is created (including missing parents) if it does
    not exist yet. The DataFrame index is not written to the file.

    Parameters:
        merged_df (pd.DataFrame): The DataFrame to be saved.
        output_dir (str): The directory where the CSV file will be saved.
        filename (str): The name of the output CSV file (default: "merged_data.csv").

    Returns:
        str: The full path of the saved CSV file.
    """
    # Ensure the directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Construct full file path
    file_path = os.path.join(output_dir, filename)

    # Save DataFrame as CSV; index=False keeps the row index out of the file
    merged_df.to_csv(file_path, index=False)

    print(f"✅ Merged DataFrame saved successfully at: {file_path}")
    return file_path

In [93]:

# Build and save one merged CSV per year (2021-2023). Every year is written
# to the same output folder; only the input folder and file name change.
yearly_output_dir = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.7Master_Files\Individual Year Files_Take2"

yearly_inputs = (
    (r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.4Data2021\District\clean_data", "merged_2021.csv"),
    (r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.5Data2022\District\clean_data", "merged_2022.csv"),
    (r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.6Data2023\District\clean_data", "merged_2023.csv"),
)

for clean_data_dir, output_name in yearly_inputs:
    last_saved_path = save_merged_dataframe(
        merge_data_frames(load_cleaned_datasets(clean_data_dir)),
        yearly_output_dir,
        output_name,
    )

# Keep the final path as the cell's last expression so the notebook still echoes it.
last_saved_path

Initial merge_df shape: (1204, 952)
Initial Merge NA columns: 35

Merging distperf1_2021_clean:
  - Shape of merge_df before merge: (1204, 952)
  - Shape of df_to_be_merged: (1204, 2003)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1204, 2954)
  - Number of all NA columns in df_to_be_merged: 35

Merging distperf2_2021_clean:
  - Shape of merge_df before merge: (1204, 2954)
  - Shape of df_to_be_merged: (1204, 1289)
  - Number of all NA columns in df_to_be_merged: 10
  - Merged with LEFT join. New shape: (1204, 4242)
  - Number of all NA columns in df_to_be_merged: 45

Merging distprof_2021_clean:
  - Shape of merge_df before merge: (1204, 4242)
  - Shape of df_to_be_merged: (1204, 417)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1204, 4658)
  - Number of all NA columns in df_to_be_merged: 45

Merging diststaar1_2021_clean:
  - Shape of merge_df before merge: (1204, 4658)
  - Shape of df_to_be_mer

'C:\\Users\\mmath\\OneDrive\\Desktop\\Capstone\\HERC_Sp25\\0_Datasets\\1.7Master_Files\\Individual Year Files_Take2\\merged_2023.csv'

In [85]:
# Build the 2020 yearly dataset in memory; nothing is written to disk in this cell.
clean_dir_2020 = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.3Data2020\District\clean_data"

# NOTE: despite its name, df_2020 is a dict of DataFrames keyed by file name.
df_2020 = load_cleaned_datasets(clean_dir_2020)
merged_df_2020 = merge_data_frames(df_2020)

Initial merge_df shape: (1202, 879)
Initial Merge NA columns: 23

Merging distperf_2020_clean:
  - Shape of merge_df before merge: (1202, 879)
  - Shape of df_to_be_merged: (1202, 4169)
  - Number of all NA columns in df_to_be_merged: 6
  - Merged with LEFT join. New shape: (1202, 5047)
  - Number of all NA columns in df_to_be_merged: 29

Merging distprof_2020_clean:
  - Shape of merge_df before merge: (1202, 5047)
  - Shape of df_to_be_merged: (1202, 395)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1202, 5441)
  - Number of all NA columns in df_to_be_merged: 29

Merging diststaar1_2020_clean:
  - Shape of merge_df before merge: (1202, 5441)
  - Shape of df_to_be_merged: (1202, 2143)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1202, 7583)
  - Number of all NA columns in df_to_be_merged: 29

Merging district_type2020_clean:
  - Shape of merge_df before merge: (1202, 7583)
  - Shape of df_to_be_me

In [86]:
# Build the 2021 yearly dataset in memory and print it for a quick visual check.
clean_dir_2021 = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.4Data2021\District\clean_data"
datasets_2021 = load_cleaned_datasets(clean_dir_2021)
merge_data_2021 = merge_data_frames(datasets_2021)
print(merge_data_2021)

Initial merge_df shape: (1204, 952)
Initial Merge NA columns: 35

Merging distperf1_2021_clean:
  - Shape of merge_df before merge: (1204, 952)
  - Shape of df_to_be_merged: (1204, 2003)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1204, 2954)
  - Number of all NA columns in df_to_be_merged: 35

Merging distperf2_2021_clean:
  - Shape of merge_df before merge: (1204, 2954)
  - Shape of df_to_be_merged: (1204, 1289)
  - Number of all NA columns in df_to_be_merged: 10
  - Merged with LEFT join. New shape: (1204, 4242)
  - Number of all NA columns in df_to_be_merged: 45

Merging distprof_2021_clean:
  - Shape of merge_df before merge: (1204, 4242)
  - Shape of df_to_be_merged: (1204, 417)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1204, 4658)
  - Number of all NA columns in df_to_be_merged: 45

Merging diststaar1_2021_clean:
  - Shape of merge_df before merge: (1204, 4658)
  - Shape of df_to_be_mer

In [87]:
# Preview the merged 2022 dataset (in memory only; nothing is saved in this cell).
clean_dir_2022 = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.5Data2022\District\clean_data"
merge_data_frames(load_cleaned_datasets(clean_dir_2022))

Initial merge_df shape: (1207, 953)
Initial Merge NA columns: 32

Merging distperf1_2022_clean:
  - Shape of merge_df before merge: (1207, 953)
  - Shape of df_to_be_merged: (1207, 2031)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1207, 2983)
  - Number of all NA columns in df_to_be_merged: 32

Merging distperf2_2022_clean:
  - Shape of merge_df before merge: (1207, 2983)
  - Shape of df_to_be_merged: (1207, 1274)
  - Number of all NA columns in df_to_be_merged: 8
  - Merged with LEFT join. New shape: (1207, 4256)
  - Number of all NA columns in df_to_be_merged: 40

Merging distprof_2022_clean:
  - Shape of merge_df before merge: (1207, 4256)
  - Shape of df_to_be_merged: (1207, 428)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1207, 4683)
  - Number of all NA columns in df_to_be_merged: 40

Merging diststaar1_2022_clean:
  - Shape of merge_df before merge: (1207, 4683)
  - Shape of df_to_be_merg

Unnamed: 0,DISTRICT_id,District 2021 Attendance: All Students Days Present,District 2021 Attendance: Two or More Races Days Present,District 2021 Attendance: Asian Days Present,District 2021 Attendance: Pacific Islander Days Present,District 2021 Attendance: African American Days Present,District 2021 Attendance: Hispanic Days Present,District 2021 Attendance: White Days Present,District 2021 Attendance: American Indian Days Present,District 2021 Attendance: Econ Disadv Days Present,...,"District 2022 Domain 1A: Approaches Grade Level STD, Grade 8, Mobile, STAAR Science Rate","District 2022 Domain 1A: Meets Grade Level STD, Grade 8, Mobile, STAAR Science Rate","District 2022 Domain 1A: Masters Grade Level STD, Grade 8, Mobile, STAAR Science Rate",District,District Number,TEA District Type,TEA Description,NCES District Type,NCES Description,Charter School (Y/N)
0,1902,79610.0,3478.0,,,2720.0,8225.0,64692.0,,29003.0,...,79.0,57.0,36.0,CAYUGA ISD,1902,H,Rural,43,Rural-Remote,N
1,1903,171127.5,7208.0,,,6869.0,20576.0,135457.5,,78127.5,...,92.0,77.0,31.0,ELKHART ISD,1903,G,Non-metropolitan Stable,42,Rural-Distant,N
2,1904,105759.0,3861.0,1171.0,,9680.0,9415.0,81129.0,,54948.0,...,79.0,21.0,5.0,FRANKSTON ISD,1904,H,Rural,42,Rural-Distant,N
3,1906,45015.5,1623.5,,,3449.0,5466.0,34150.0,,20629.5,...,50.0,33.0,33.0,NECHES ISD,1906,H,Rural,42,Rural-Distant,N
4,1907,487468.5,17480.0,3844.0,,122201.0,209236.0,132881.5,1472.0,377020.0,...,75.0,50.0,35.0,PALESTINE ISD,1907,E,Independent Town,32,Town-Distant,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202,252902,29058.0,,,,,6499.0,21833.0,,18070.0,...,80.0,60.0,40.0,NEWCASTLE ISD,252902,H,Rural,42,Rural-Distant,N
1203,252903,92444.0,2016.0,,,2395.0,33024.0,53722.0,,49091.0,...,82.0,27.0,0.0,OLNEY ISD,252903,H,Rural,32,Town-Distant,N
1204,253901,478905.0,,,,,475015.0,3146.0,,414169.0,...,50.0,25.0,0.0,ZAPATA COUNTY ISD,253901,D,Other Central City Suburban,33,Town-Remote,N
1205,254901,230574.0,,,,853.0,227369.0,1227.0,,184913.0,...,52.0,10.0,5.0,CRYSTAL CITY ISD,254901,G,Non-metropolitan Stable,33,Town-Remote,N


In [88]:
# Preview the merged 2023 dataset (in memory only; nothing is saved in this cell).
clean_dir_2023 = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.6Data2023\District\clean_data"
merge_data_frames(load_cleaned_datasets(clean_dir_2023))

Initial merge_df shape: (1209, 950)
Initial Merge NA columns: 37

Merging distperf1_2023_clean:
  - Shape of merge_df before merge: (1209, 950)
  - Shape of df_to_be_merged: (1209, 2003)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1209, 2952)
  - Number of all NA columns in df_to_be_merged: 37

Merging distperf2_2023_clean:
  - Shape of merge_df before merge: (1209, 2952)
  - Shape of df_to_be_merged: (1209, 1289)
  - Number of all NA columns in df_to_be_merged: 4
  - Merged with LEFT join. New shape: (1209, 4240)
  - Number of all NA columns in df_to_be_merged: 41

Merging distprof_2023_clean:
  - Shape of merge_df before merge: (1209, 4240)
  - Shape of df_to_be_merged: (1209, 417)
  - Number of all NA columns in df_to_be_merged: 0
  - Merged with LEFT join. New shape: (1209, 4656)
  - Number of all NA columns in df_to_be_merged: 41

Merging diststaar1_2023_clean:
  - Shape of merge_df before merge: (1209, 4656)
  - Shape of df_to_be_merg

Unnamed: 0,DISTRICT_id,District 2022 Attendance: All Students Days Present,District 2022 Attendance: Two or More Races Days Present,District 2022 Attendance: Asian Days Present,District 2022 Attendance: Pacific Islander Days Present,District 2022 Attendance: African American Days Present,District 2022 Attendance: Hispanic Days Present,District 2022 Attendance: White Days Present,District 2022 Attendance: American Indian Days Present,District 2022 Attendance: Econ Disadv Days Present,...,"District 2023 Domain 1A: Approaches Grade Level STD, Grade 8, Non-Continuous Enrollee, STAAR Science Rate","District 2023 Domain 1A: Meets Grade Level STD, Grade 8, Non-Continuous Enrollee, STAAR Science Rate","District 2023 Domain 1A: Masters Grade Level STD, Grade 8, Non-Continuous Enrollee, STAAR Science Rate",District,District Number,TEA District Type,TEA Description,NCES District Type,NCES Description,Charter School (Y/N)
0,1902,84674.0,3851.0,,,3608.0,9535.5,67072.5,,33567.5,...,77.0,41.0,9.0,CAYUGA ISD,1902,H,Rural,43,Rural-Remote,N
1,1903,170556.0,5314.0,,,7320.0,20078.0,137096.0,,71204.0,...,100.0,47.0,21.0,ELKHART ISD,1903,G,Non-metropolitan Stable,42,Rural-Distant,N
2,1904,122113.0,4915.0,1592.0,,9646.0,13075.0,92399.0,,63694.0,...,75.0,33.0,0.0,FRANKSTON ISD,1904,H,Rural,42,Rural-Distant,N
3,1906,46835.0,1320.0,,,4051.0,6154.0,34981.0,,23831.0,...,83.0,50.0,0.0,NECHES ISD,1906,H,Rural,42,Rural-Distant,N
4,1907,459581.5,15932.0,3629.0,,112553.0,200648.5,125372.0,1159.0,367008.0,...,78.0,45.0,7.0,PALESTINE ISD,1907,E,Independent Town,32,Town-Distant,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,252902,29341.0,,,,,6749.0,21973.0,,18199.0,...,89.0,11.0,0.0,NEWCASTLE ISD,252902,H,Rural,42,Rural-Distant,N
1205,252903,91460.0,2027.0,,,1838.0,32352.0,53673.0,920.0,55900.0,...,85.0,46.0,15.0,OLNEY ISD,252903,H,Rural,32,Town-Distant,N
1206,253901,454489.0,,,,,450380.0,3307.0,,391770.5,...,62.0,38.0,4.0,ZAPATA COUNTY ISD,253901,D,Other Central City Suburban,33,Town-Remote,N
1207,254901,233590.5,,,,,232347.5,795.0,,190901.5,...,50.0,17.0,0.0,CRYSTAL CITY ISD,254901,G,Non-metropolitan Stable,33,Town-Remote,N
