In [1]:
import pandas as pd

# Main 1860 CMF dataset (Stata file); this is the frame the corrections are applied to.
cmf = pd.read_stata("CMF/CMF_1860.dta")

# Correction sheets, read in small groups. Each name on the left is paired
# positionally with the CSV path at the same position on the right.
# The variable names matter: a later cell looks them up by name via globals().

# -- female wage sheets
avg_wage_female_check, avg_wage_female_output_check = map(pd.read_csv, (
    "check/1860/avg_wage_female_check_1860.csv",
    "check/1860/avg_wage_female&output_check_1860.csv",
))

# -- male wage sheets
avg_wage_male_check, avg_wage_male_output_check = map(pd.read_csv, (
    "check/1860/avg_wage_male_check_1860.csv",
    "check/1860/avg_wage_male&output_check_1860.csv",
))

# -- capital sheets
capital_check, capital_materials_value_check, capital_output_check = map(pd.read_csv, (
    "check/1860/capital_check_1860.csv",
    "check/1860/capital&materials_value_check_1860.csv",
    "check/1860/capital&output_check_1860.csv",
))

# -- hands (female / male) sheets
(
    hands_female_check,
    hands_female_avg_wage_female_check,
    hands_male_avg_wage_male_check,
    hands_male_check,
) = map(pd.read_csv, (
    "check/1860/hands_female_check_1860.csv",
    "check/1860/hands_female&avg_wage_female_check_1860.csv",
    "check/1860/hands_male&avg_wage_male_check_1860.csv",
    "check/1860/hands_male_check_1860.csv",
))

# -- materials value sheets
materials_value_check, materials_value_capital_check, materials_value_output_check = map(pd.read_csv, (
    "check/1860/materials_value_check_1860.csv",
    "check/1860/materials_value&capital_check_1860.csv",
    "check/1860/materials_value&output_check_1860.csv",
))

# -- output sheets
output_check, output_capital_check, output_materials_value_check = map(pd.read_csv, (
    "check/1860/output_check_1860.csv",
    "check/1860/output&capital_check_1860.csv",
    "check/1860/output&materials_value_check_1860.csv",
))

In [2]:
# Index cmf by (file_name, firm_number) so that individual firms can be
# addressed by key when corrections are applied.
# The guard makes this cell safe to re-run: once the two columns have been
# moved into the index, calling set_index again would raise a KeyError.
key_cols = ["file_name", "firm_number"]
if list(cmf.index.names) != key_cols:
    cmf = cmf.set_index(key_cols)

In [3]:
def update_cmf(cmf, check_df, update_mapping, condition_col='transcription_error', condition_val=1):
    """
    Apply the corrections recorded in check_df to cmf, modifying cmf in place.

    Rows of check_df whose condition_col equals condition_val are treated as
    corrections. Each such row is matched to cmf through its
    (file_name, firm_number) pair; for every target -> source entry in
    update_mapping, a non-missing source value overwrites the target cell.

    Corrections whose key does not exist in cmf's index are skipped and
    reported in a single warning. Without this guard, assigning through .loc
    with an unknown key would silently append a new, mostly empty row to cmf.

    Parameters:
      cmf (DataFrame): The DataFrame to update (with a multi-index of file_name and firm_number).
      check_df (DataFrame): The DataFrame containing corrections. It must have
        'file_name' and 'firm_number' columns, condition_col, and every source
        column named in update_mapping.
      update_mapping (dict): A dictionary mapping target column names in cmf to source column names in check_df.
      condition_col (str): The column in check_df that must equal condition_val for the row to be processed.
      condition_val: The value in condition_col that indicates a correction should be applied.

    Returns:
      None. cmf is updated in place.
    """
    import warnings  # function-scope import so the cell does not rely on another cell for it

    # Keep only the rows flagged as needing a correction
    updates = check_df[check_df[condition_col] == condition_val]

    unmatched = []
    for _, row in updates.iterrows():
        key = (row['file_name'], row['firm_number'])

        # .loc assignment with a missing label enlarges the frame instead of
        # failing, so unknown keys have to be filtered out explicitly.
        if key not in cmf.index:
            unmatched.append(key)
            continue

        for target_col, source_col in update_mapping.items():
            value = row[source_col]
            # A missing correction value means "leave the existing value alone"
            if pd.notna(value):
                cmf.loc[key, target_col] = value

    if unmatched:
        warnings.warn(
            f"update_cmf: skipped {len(unmatched)} correction(s) whose "
            f"(file_name, firm_number) key is not in cmf; first few: {unmatched[:5]}",
            stacklevel=2,
        )

In [4]:
# Stems of the correction-sheet filenames. An "&" joins the names of the
# variables that a single sheet corrects together.
base_vars = [
    "avg_wage_female",
    "avg_wage_female&output",
    "avg_wage_male",
    "avg_wage_male&output",
    "capital",
    "capital&materials_value",
    "capital&output",
    "hands_female",
    "hands_female&avg_wage_female",
    "hands_male&avg_wage_male",
    "hands_male",
    "materials_value",
    "materials_value&capital",
    "materials_value&output",
    "output",
    "output&capital",
    "output&materials_value",
]

# Apply every sheet to cmf, in the order listed above
for base in base_vars:
    # The sheet's DataFrame is looked up by global variable name,
    # e.g. "capital&output" -> "capital_output_check"
    df_var_name = base.replace("&", "_") + "_check"
    check_df = globals().get(df_var_name)

    if check_df is None:
        print(f"Warning: {df_var_name} not found.")
        continue

    # Each variable named in the stem is corrected from its "correct_<variable>" column
    column_map = dict((col, f"correct_{col}") for col in base.split("&"))
    update_cmf(cmf, check_df, column_map)

In [None]:
# Save the corrected data to a new Stata file.
# reset_index() returns a copy in which file_name / firm_number are ordinary
# columns again, so cmf itself is left untouched and this cell can be re-run.
# An in-place reset would, on a second run, turn the leftover integer index
# into an extra "index" column that then ends up in the saved file.
cmf.reset_index().to_stata("CMF/CMF_1860_updated.dta", write_index=False)