In [1]:
import pandas as pd

# Main dataset; the corrections read below are applied to it in later cells.
cmf = pd.read_stata("CMF/CMF_1880_general_schedule.dta")

# Every check CSV lives in the same directory and shares the same file-name suffix,
# so both are spelled out once here instead of once per file.
CHECK_DIR = "check/1880/1880_general_schedule"
CHECK_SUFFIX = "_check_1880_general_schedule.csv"


def _read_check(stem):
    """Read one check CSV.

    Parameters:
      stem (str): The part of the file name that precedes
        "_check_1880_general_schedule.csv", e.g. "capital" or "capital&output".

    Returns:
      DataFrame: The parsed contents of CHECK_DIR/<stem><CHECK_SUFFIX>.
    """
    return pd.read_csv(f"{CHECK_DIR}/{stem}{CHECK_SUFFIX}")


# One DataFrame per check file. The variable names matter: the cell that applies
# the corrections finds these frames by name through globals(), using the file
# stem with "&" replaced by "_" plus the "_check" suffix.
capital_check = _read_check("capital")
capital_materials_value_check = _read_check("capital&materials_value")
capital_output_check = _read_check("capital&output")
materials_value_check = _read_check("materials_value")
materials_value_capital_check = _read_check("materials_value&capital")
materials_value_output_check = _read_check("materials_value&output")
output_check = _read_check("output")
output_capital_check = _read_check("output&capital")
output_materials_value_check = _read_check("output&materials_value")
total_wages_check = _read_check("total_wages")
total_wages_output_check = _read_check("total_wages&output")
workers_adult_female_check = _read_check("workers_adult_female")
workers_adult_male_check = _read_check("workers_adult_male")
workers_adult_male_workers_adult_female_workers_children_total_wages_check = _read_check(
    "workers_adult_male&workers_adult_female&workers_children&total_wages"
)

In [None]:
# Set a multi-index on the DataFrame using 'file_name' and 'firm_number'.
# The guard keeps this cell re-runnable: once both columns have been moved into
# the index, calling set_index on them again raises a KeyError.
if list(cmf.index.names) != ["file_name", "firm_number"]:
    cmf = cmf.set_index(["file_name", "firm_number"])

In [None]:
def update_cmf(cmf, check_df, update_mapping, condition_col='transcription_error', condition_val=1):
    """
    Apply the flagged corrections in check_df to cmf, modifying cmf in place.

    Each row of check_df whose condition_col equals condition_val is matched to
    cmf through its (file_name, firm_number) pair. For every target/source pair
    in update_mapping, a non-missing source value overwrites the target cell.

    Correction rows whose (file_name, firm_number) pair is not present in
    cmf.index are skipped and reported. Assigning through .loc with an unknown
    row label does not update anything: pandas enlarges the frame and appends a
    new row that is empty apart from the assigned cells.

    Parameters:
      cmf (DataFrame): The DataFrame to update (with a multi-index of file_name and firm_number).
      check_df (DataFrame): The DataFrame containing corrections. It must have
        'file_name' and 'firm_number' columns, condition_col, and every source
        column named in update_mapping.
      update_mapping (dict): A dictionary mapping target column names in cmf to source column names in check_df.
      condition_col (str): The column in check_df that must equal condition_val for the row to be processed.
      condition_val: The value in condition_col that indicates a correction should be applied.

    Returns:
      None. cmf is updated in place.
    """
    # Keep only the rows that are flagged as needing a correction
    updates = check_df[check_df[condition_col] == condition_val]
    unmatched_keys = []
    for _, row in updates.iterrows():
        key = (row['file_name'], row['firm_number'])
        # Never let .loc create a row: a correction must target an existing firm
        if key not in cmf.index:
            unmatched_keys.append(key)
            continue
        for target_col, source_col in update_mapping.items():
            value = row[source_col]
            # A missing corrected value means "leave this cell unchanged"
            if pd.notna(value):
                cmf.loc[key, target_col] = value
    if unmatched_keys:
        print(
            f"Warning: skipped {len(unmatched_keys)} correction(s) with no matching "
            f"(file_name, firm_number) in cmf: {unmatched_keys}"
        )

In [None]:
# Stems of the check files; "&" separates the variables that one file corrects together
base_vars = [
    "capital",
    "capital&materials_value",
    "capital&output",
    "materials_value",
    "materials_value&capital",
    "materials_value&output",
    "output",
    "output&capital",
    "output&materials_value",
    "total_wages",
    "total_wages&output",
    "workers_adult_female",
    "workers_adult_male",
    "workers_adult_male&workers_adult_female&workers_children&total_wages",
]

# Apply every check file to cmf, in the order listed above
for base in base_vars:
    corrected_cols = base.split("&")

    # The frame for "capital&output" is held in the variable "capital_output_check"
    df_var_name = "_".join(corrected_cols) + "_check"
    check_frame = globals().get(df_var_name)

    if check_frame is None:
        print(f"Warning: {df_var_name} not found.")
        continue

    # Each corrected variable X takes its new value from the column "correct_X"
    update_mapping = {}
    for col in corrected_cols:
        update_mapping[col] = "correct_" + col
    update_cmf(cmf, check_frame, update_mapping)

In [None]:
# Save the updated DataFrame to a new Stata file.
# Move 'file_name' and 'firm_number' back into columns only while they still form
# the index. Re-running an unconditional reset_index would instead turn the
# default integer index into an extra "index" column, which would then be
# written to the .dta file.
if list(cmf.index.names) == ["file_name", "firm_number"]:
    cmf = cmf.reset_index()
cmf.to_stata("CMF/CMF_1880_general_schedule_updated.dta", write_index=False)