In [100]:
# Load every input used by this notebook.
import pandas as pd

# Main Stata dataset that receives the corrections.
cmf = pd.read_stata("CMF_1850.dta")

# Checking sheets stored as .csv; the trailing note names the mapping that applies each one.
hands_female_checking = pd.read_csv("hands_female_checking.csv")  # -> update_mapping1
hands_male_avg_wage_male_checking = pd.read_csv("hands_male_avg_wage_male_checking.csv")  # -> update_mapping2
capital_check = pd.read_csv("capital_check.csv")  # -> update_mapping3
avg_wage_female_checking = pd.read_csv("avg_wage_female_checking.csv")  # -> update_mapping4
avg_wage_female_output_checking = pd.read_csv("avg_wage_female_output_checking.csv")  # -> update_mapping5
capital_materials_value_check = pd.read_csv("capital_materials_value_check.csv")  # -> update_mapping6
output_capital_checking = pd.read_csv("output_capital_checking.csv")  # -> update_mapping7

# Checking sheets that are loaded but marked "skip" (no mapping is applied for them).
materials_value_check = pd.read_csv("materials_value_check.csv")
materials_value_output_check = pd.read_csv("materials_value_output_check.csv")
output_checking = pd.read_csv("output_checking.csv")

# Checking sheets stored as .xlsx.
avg_wage_male_check_EL = pd.read_excel("avg_wage_male_check_EL.xlsx")  # author's note: no transcription error
avg_wage_male_output_check_EL = pd.read_excel("avg_wage_male_output_check_EL.xlsx")  # -> update_mapping8

In [101]:
# Index cmf by (file_name, firm_number) so corrections can be addressed by key.
# The guard makes the cell safe to re-run: once the two columns have been moved
# into the index, calling set_index again would raise KeyError.
CMF_KEY_COLS = ["file_name", "firm_number"]
if list(cmf.index.names) != CMF_KEY_COLS:
    cmf = cmf.set_index(CMF_KEY_COLS)

In [102]:
def update_cmf(cmf, check_df, update_mapping, condition_col='transcription_error', condition_val=1):
    """
    Apply the corrections recorded in check_df to cmf, modifying cmf in place.

    Only rows of check_df where ``condition_col == condition_val`` are used.
    For each such row, every (target, source) pair in update_mapping copies
    ``row[source]`` into ``cmf.loc[(file_name, firm_number), target]``, unless
    the source value is missing (NaN/None), in which case the existing cmf
    value is kept.

    Corrections whose (file_name, firm_number) key does not exist in cmf are
    skipped with a warning. Without this guard, ``.loc`` assignment would
    silently append a new, otherwise-empty row to cmf.

    Parameters:
      cmf (DataFrame): The DataFrame to update (with a multi-index of file_name and firm_number).
      check_df (DataFrame): The DataFrame containing corrections; must have
        'file_name' and 'firm_number' columns, condition_col, and every source
        column named in update_mapping.
      update_mapping (dict): A dictionary mapping target column names in cmf to source column names in check_df.
      condition_col (str): The column in check_df that must equal condition_val for the row to be processed.
      condition_val: The value in condition_col that indicates a correction should be applied.

    Returns:
      None. cmf is updated in place.

    Raises:
      KeyError: if condition_col is not a column of check_df, or if a row to
        be processed lacks 'file_name', 'firm_number', or a source column
        named in update_mapping.
    """
    import warnings  # local import keeps this cell self-contained

    # Filter for rows that need to be updated
    updates = check_df[check_df[condition_col] == condition_val]
    for _, row in updates.iterrows():
        key = (row['file_name'], row['firm_number'])
        # Assigning through .loc with an unknown key enlarges the frame instead
        # of failing, so verify the firm exists before writing anything.
        if key not in cmf.index:
            warnings.warn(
                f"update_cmf: no row in cmf for key {key!r}; correction skipped",
                stacklevel=2,
            )
            continue
        for target_col, source_col in update_mapping.items():
            value = row[source_col]
            # Update only if the value is not NaN
            if pd.notna(value):
                cmf.loc[key, target_col] = value

In [103]:
# Corrections from hands_female_checking.
# Keys are the cmf columns to overwrite; values are the check-sheet columns
# holding the corrected figures.
update_mapping1 = {
    "hands_female": "right_value_1",
    "hands_male": "right_value_hands_male",
    "avg_wage_male": "right_value_avg_wage_male",
    "avg_wage_female": "right_value_avg_wage_female",
}

update_cmf(cmf, check_df=hands_female_checking, update_mapping=update_mapping1)

In [104]:
# Corrections from hands_male_avg_wage_male_checking
# (cmf column -> check-sheet column with the corrected value).
update_mapping2 = {
    "hands_male": "right_value_1",
    "avg_wage_male": "right_value_2",
    "avg_wage_female": "right_value_avg_wage_female",
}

update_cmf(cmf, check_df=hands_male_avg_wage_male_checking, update_mapping=update_mapping2)

In [105]:
# Corrections from capital_check: only the capital column is overwritten.
update_mapping3 = {"capital": "right_value_1"}
update_cmf(cmf, check_df=capital_check, update_mapping=update_mapping3)


In [106]:
# Corrections from avg_wage_female_checking
# (cmf column -> check-sheet column with the corrected value).
update_mapping4 = {
    "avg_wage_female": "right_value_1",
    "avg_wage_male": "right_value_male",
}

update_cmf(cmf, check_df=avg_wage_female_checking, update_mapping=update_mapping4)

In [107]:
# Corrections from avg_wage_female_output_checking.
# Only the wage columns are mapped; the sheet's output correction is
# intentionally skipped.
update_mapping5 = {
    "avg_wage_female": "right_value_1",
    "avg_wage_male": "right_value_male",
}

update_cmf(cmf, check_df=avg_wage_female_output_checking, update_mapping=update_mapping5)

In [108]:
# Corrections from capital_materials_value_check.
# Only capital is mapped; the materials value correction is intentionally skipped.
update_mapping6 = {"capital": "right_value_1"}

update_cmf(cmf, check_df=capital_materials_value_check, update_mapping=update_mapping6)

In [109]:
# Corrections from output_capital_checking.
# The output correction is intentionally skipped; capital is taken from the
# sheet's second corrected-value column.
update_mapping7 = {"capital": "right_value_2"}

update_cmf(cmf, check_df=output_capital_checking, update_mapping=update_mapping7)

In [110]:
# Corrections from avg_wage_male_output_check_EL.
# Only avg_wage_male is mapped; the output correction is intentionally skipped.
update_mapping8 = {"avg_wage_male": "right_value_1"}

update_cmf(cmf, check_df=avg_wage_male_output_check_EL, update_mapping=update_mapping8)

In [111]:
# Export the corrected data to a new Stata file.
#
# The export works on a separate frame so this cell can be re-run without
# altering `cmf`. When cmf carries a named index (file_name / firm_number),
# reset_index() turns those levels back into ordinary columns; otherwise
# write_index=False would drop both identifiers from the saved file.
has_named_index = any(name is not None for name in cmf.index.names)
cmf_out = cmf.reset_index() if has_named_index else cmf.copy()

# to_stata needs object columns to contain strings. Missing entries are
# written as empty strings instead of the literal text "nan"/"None" that a
# plain astype(str) would produce.
for col in cmf_out.select_dtypes(include='object').columns:
    cmf_out[col] = cmf_out[col].map(lambda v: "" if pd.isna(v) else str(v))

# Save with UTF-8 compatible version
cmf_out.to_stata('CMF_1850_updated.dta', write_index=False, version=118)
