This code document will read in clean district datasets and merge it all into one yearly dataset. 

In [None]:
#Loading in necessary packages 
import pandas as pd
import os

In [None]:
def load_cleaned_datasets(directory: str) -> dict:
    """
    Loads all CSV files from the given directory into a dictionary of pandas DataFrames.
    
    - Removes any dataset whose filename contains "ref_".
    - Moves any dataset whose filename contains "district_type" to the end of the dictionary. (Helps with merging)
    
    Parameters:
        directory (str): Path to the directory containing the CSV files.
    
    Returns:
        dict: A dictionary where keys are filenames (without .csv) and values are pandas DataFrames.
    """
    # List all CSV files in the directory
    csv_files = [f for f in os.listdir(directory) if f.endswith(".csv")]

    # Load each CSV into a dictionary with filename (without extension) as key
    dfs = {os.path.splitext(f)[0]: pd.read_csv(os.path.join(directory, f)) for f in csv_files}

    # Remove keys that contain "ref_"
    dfs = {k: v for k, v in dfs.items() if "ref_" not in k}

    # Identify keys containing "district_type"
    district_type_keys = [k for k in dfs.keys() if "district_type" in k]

    # Move keys containing "district_type" to the end
    for key in district_type_keys:
        district_type_data = dfs.pop(key)  # Remove it
        dfs[key] = district_type_data  # Reinsert at the end

    return dfs

In [None]:
def merge_data_frames(dfs):
    """
    Merges multiple pandas DataFrames stored in a dictionary.

    - Uses the first DataFrame as the base for left joins.
    - Drops shared columns (except for DISTRICT_id) before merging.
    - If a DataFrame key contains "district_type", it merges using "DISTRICT_id" and "District Number".

    Parameters:
        dfs (dict): A dictionary where keys are dataset names and values are pandas DataFrames.

    Returns:
        pd.DataFrame: The merged DataFrame.
    """

    # Identifying shared columns (excluding DISTRICT_id)
    shared_columns = ['DISTRICT', 'DISTNAME', 'COUNTY', 'CNTYNAME', 'REGION', 'DFLCHART', 'DFLALTED', 'D_RATING', 'OUTCOME', 'ASVAB_STATUS']

    # Using the first DataFrame as the base
    merge_df = list(dfs.values())[0]
    print(f"Initial merge_df shape: {merge_df.shape}")

    # Left joining the remaining DataFrames on the base DataFrame
    for key in list(dfs.keys())[1:]:
        df_to_be_merged = dfs[key].drop(columns=shared_columns, errors='ignore')

        print(f"\nMerging {key}:")
        print(f"  - Shape of merge_df before merge: {merge_df.shape}")
        print(f"  - Shape of df_to_be_merged: {df_to_be_merged.shape}")

        if "district_type" not in key:
            # Default left join
            merge_df = merge_df.merge(df_to_be_merged, on="DISTRICT_id", how="left")
            print(f"  - Merged with LEFT join. New shape: {merge_df.shape}")

        else:
            # Merge on both "DISTRICT_id" and "District Number"
            merge_df = merge_df.merge(df_to_be_merged, left_on="DISTRICT_id", right_on="District Number", how="left")
            print(f"  - Merged with LEFT join on 'DISTRICT_id' and 'District Number'. New shape: {merge_df.shape}")

    return merge_df


In [38]:
df_2020 = load_cleaned_datasets(r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.3Data2020\District\clean_data")
merge_data_frames(df_2020)

Initial merge_df shape: (1202, 879)

Merging distperf_2020_clean:
  - Shape of merge_df before merge: (1202, 879)
  - Shape of df_to_be_merged: (1202, 4169)
  - Merged with LEFT join. New shape: (1202, 5047)

Merging distprof_2020_clean:
  - Shape of merge_df before merge: (1202, 5047)
  - Shape of df_to_be_merged: (1202, 395)
  - Merged with LEFT join. New shape: (1202, 5441)

Merging diststaar1_2020_clean:
  - Shape of merge_df before merge: (1202, 5441)
  - Shape of df_to_be_merged: (1202, 2143)
  - Merged with LEFT join. New shape: (1202, 7583)

Merging district_type2020_clean:
  - Shape of merge_df before merge: (1202, 7583)
  - Shape of df_to_be_merged: (1212, 7)
  - Merged with LEFT join on 'DISTRICT_id' and 'District Number'. New shape: (1202, 7590)


Unnamed: 0,DISTRICT_id,District 2019 Attendance: All Students Days Present,District 2019 Attendance: Two or More Races Days Present,District 2019 Attendance: Asian Days Present,District 2019 Attendance: Pacific Islander Days Present,District 2019 Attendance: African American Days Present,District 2019 Attendance: Hispanic Days Present,District 2019 Attendance: White Days Present,District 2019 Attendance: American Indian Days Present,District 2019 Attendance: Econ Disadv Days Present,...,"District 2019 Domain 1A: Approaches Grade Level STD, Grade 8, Mobile, STAAR Social Studies Rate","District 2019 Domain 1A: Meets Grade Level STD, Grade 8, Mobile, STAAR Social Studies Rate","District 2019 Domain 1A: Masters Grade Level STD, Grade 8, Mobile, STAAR Social Studies Rate",District,District Number,TEA District Type,TEA Description,NCES District Type,NCES Description,Charter School (Y/N)
0,1902,85710.0,3937.0,,,2334.0,5161.0,73937.0,,34344.0,...,64.0,45.0,27.0,CAYUGA ISD,1902,H,Rural,43,Rural-Remote,N
1,1903,184643.0,7248.0,,,9172.0,21333.0,145047.0,,99769.0,...,66.0,24.0,14.0,ELKHART ISD,1903,G,Non-metropolitan Stable,42,Rural-Distant,N
2,1904,114641.0,4716.0,985.0,,10325.0,8988.0,88796.0,,57587.0,...,67.0,33.0,27.0,FRANKSTON ISD,1904,H,Rural,42,Rural-Distant,N
3,1906,51701.5,1684.0,,,4186.0,8175.0,37491.5,,22872.0,...,100.0,29.0,0.0,NECHES ISD,1906,H,Rural,42,Rural-Distant,N
4,1907,494346.0,16963.0,4340.0,,130752.0,205375.0,136085.0,,358567.0,...,55.0,25.0,17.0,PALESTINE ISD,1907,E,Independent Town,32,Town-Distant,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,252902,31052.0,,,,,6393.0,23998.0,,19217.0,...,,,,NEWCASTLE ISD,252902,H,Rural,42,Rural-Distant,N
1198,252903,96169.0,2560.0,,,2198.0,38201.0,51908.0,,63246.0,...,80.0,60.0,60.0,OLNEY ISD,252903,H,Rural,32,Town-Distant,N
1199,253901,489197.5,,,,,484414.5,3817.0,,410504.5,...,78.0,33.0,11.0,ZAPATA COUNTY ISD,253901,D,Other Central City Suburban,33,Town-Remote,N
1200,254901,264514.0,,,,1026.0,260719.0,,1123.0,215490.0,...,59.0,27.0,9.0,CRYSTAL CITY ISD,254901,G,Non-metropolitan Stable,33,Town-Remote,N


In [32]:
### READING IN THE DATASETS ### 
base_dir = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.3Data2020\District\clean_data"

# Define base directory
base_dir = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.3Data2020\District\clean_data"

# List all CSV files in the directory
csv_files = [f for f in os.listdir(base_dir) if f.endswith(".csv")]

# Load each CSV into a dictionary with filename (without extension) as key
dfs = {os.path.splitext(f)[0]: pd.read_csv(os.path.join(base_dir, f)) for f in csv_files}

print(dfs.keys)

# Identifying shared columns (excluding DISTRICT)
shared_columns = ['DISTRICT', 'DISTNAME', 'COUNTY', 'CNTYNAME', 'REGION', 'DFLCHART', 'DFLALTED', 'D_RATING', 'OUTCOME', 'ASVAB_STATUS']

base_df = dfs["distgrad"]
print(base_df.shape)
#print(base_df.shape)
#print(dfs["diststaar"].shape)
#print(dfs["diststaar"].drop(columns = shared_columns, errors = 'ignore').shape)
merge_df = dfs["diststaar"].drop(columns = shared_columns, errors = 'ignore')
print(merge_df.shape)
merge1 = base_df.merge(merge_df, on="DISTRICT_id", how = "left")
print(merge1.shape)




<built-in method keys of dict object at 0x0000019015A08900>


KeyError: 'distgrad'

In [15]:
dfs.keys()

dict_keys(['distgrad', 'distperf', 'distprof', 'diststaar', 'disttype'])

In [4]:
import pandas as pd
import os

# Define the base directory for your local files
base_dir = r"C:\Users\mmath\OneDrive\Desktop\Capstone\HERC_Sp25\0_Datasets\1.3Data2020\District\clean_data"

# Define file names
file_paths = {
    "distgrad": "distgrad_2020_clean.csv",
    "distperf": "distperf_2020_clean.csv",
    "distprof": "distprof_2020_clean.csv",
    "diststaar": "diststaar1_2020_clean.csv",
    "disttype": "district_type2020_clean.csv"
}

# Construct full file paths
file_full_paths = {key: os.path.join(base_dir, file) for key, file in file_paths.items()}

# Load datasets from local directory
dfs = {key: pd.read_csv(path) for key, path in file_full_paths.items()}

# Identifying shared columns (excluding DISTRICT)
shared_columns = ['DISTNAME', 'COUNTY', 'CNTYNAME', 'REGION', 'DFLCHART', 'DFLALTED', 'D_RATING', 'OUTCOME', 'ASVAB_STATUS']

# Merge files 1-4, keeping shared columns only from the first dataset
base_df = dfs["distgrad"]
for key in ["distperf", "distprof", "diststaar"]:
    dfs[key] = dfs[key].drop(columns=shared_columns, errors='ignore')
    base_df = base_df.merge(dfs[key], on="DISTRICT", how="outer")

# Merge the district type dataset
final_df = base_df.merge(dfs["disttype"], on="DISTRICT", how="outer")

# Save locally
output_path = os.path.join(base_dir, "merged_district_data_2020.csv")
final_df.to_csv(output_path, index=False)

# Display the merged DataFrame
import ace_tools as tools
tools.display_dataframe_to_user(name="Merged District Data", dataframe=final_df)

print(f"Merged dataset saved at: {output_path}")


MergeError: Passing 'suffixes' which cause duplicate columns {'DISTRICT_id_x'} is not allowed.

In [None]:
https://raw.githubusercontent.com/RiceD2KLab/HERC_Sp25/refs/heads/main/0_Datasets/1.3Data2020/District/clean_data/distgrad_2020_clean.csv?token=GHSAT0AAAAAAC6VGBY2XAZJTTM2V4W724DCZ6PHVZA