In [None]:
import pandas as pd

# Load the test dataset into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the notebook kernel.
data = pd.read_csv("src/difference_in_difference_with_deep_learning/data/testdata.csv")

In [None]:
import yaml

# Load the data description (a YAML mapping; later cells read its
# "categorical_columns" entry). The encoding is pinned to UTF-8 so that
# parsing does not depend on the platform's locale default.
with open(
    "src/difference_in_difference_with_deep_learning/data_management/data_info.yaml",
    encoding="utf-8",
) as file:
    data_info = yaml.safe_load(file)

In [None]:
# Pick the columns named by the second and third entries of the
# "categorical_columns" list; they are multiplied into the interaction term.
# NOTE(review): positions 1 and 2 (not 0 and 1) are taken as-is — confirm
# against data_info.yaml that these are the intended columns.
data_select_1, data_select_2 = (
    data[data_info["categorical_columns"][position]] for position in (1, 2)
)

In [None]:
# Store the element-wise product of the two selected columns as the
# "interaction" regressor used by the regressions below.
data.loc[:, "interaction"] = data_select_1.mul(data_select_2)

In [None]:
import statsmodels.formula.api as smf

In [None]:
def run_regression(
    data,
    formula="wage_year ~ interaction + Individual + FQ + Reform + Age + WagePartner",
):
    """Fit an OLS model on ``data`` and return its statsmodels summary.

    Parameters:
        data (DataFrame): Dataset containing every variable named in
            ``formula``.
        formula (str): Model formula passed to ``smf.ols``. The default is
            the wage regression on the interaction term plus controls, so
            calling ``run_regression(data)`` behaves as before.

    Returns:
        The object returned by ``results.summary()`` for the fitted model.
    """
    return smf.ols(formula=formula, data=data).fit().summary()

In [None]:
# Fit the regression on the loaded dataset and display the summary.
# `summary` is kept as a notebook-level name because later cells reuse it.
summary = run_regression(data)
print(summary)

In [None]:
# Emit every table of the regression summary as a LaTeX tabular environment.
for summary_table in summary.tables:
    latex_code = summary_table.as_latex_tabular()
    print(latex_code)

In [None]:
# Render the regression summary (created by the `run_regression` cell) as
# plain text.
summary_text = summary.as_text()

# Everything from the "Omnibus:" marker onwards is cut off.
cut_off_index = summary_text.find("Omnibus:")

# str.find returns -1 when the marker is absent; slicing with -1 would
# silently drop the last character, so the full text is kept in that case.
trimmed_summary_text = (
    summary_text if cut_off_index == -1 else summary_text[:cut_off_index]
)
trimmed_summary_text

# TODO: do the groupby properly

In [None]:
import pandas as pd
import statsmodels.formula.api as smf


def _significance_stars(p_value):
    """Map a p-value to its star marker: "***", "**", "*" or ""."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return ""


def estimate_regression(
    data,
    dependent_variable,
    treatment_variable,
    control_variables,
    time_periods,
):
    """Estimate one OLS regression per time period and tabulate the results.

    For every entry of ``time_periods`` the rows of ``data`` whose ``"time"``
    column equals that entry are selected and ``dependent_variable`` is
    regressed on ``treatment_variable`` plus ``control_variables``.

    Parameters:
        data (DataFrame): The dataset containing all variables and a
            ``"time"`` column.
        dependent_variable (str): The name of the dependent variable.
        treatment_variable (str): The name of the treatment variable. Rows
            with value 1 form the treatment group, rows with value 0 the
            control group.
        control_variables (list): Names of the control variables (may be
            empty).
        time_periods (list): Values of the ``"time"`` column to analyse.

    Returns:
        DataFrame: One row per time period with the columns
        ``"Time Period"``, ``"Control Mean"``, ``"Difference T-C"``,
        ``"Difference T-C with Controls"``, ``"Coefficient"``,
        ``"Std. Error"``, ``"P-value"`` and ``"Significance"``.
    """
    control_variables = list(control_variables)

    # The formula does not depend on the period, so build it once. Joining
    # treatment and controls from a single list keeps it valid when there
    # are no control variables (no dangling "+").
    regressors = [treatment_variable, *control_variables]
    formula = f"{dependent_variable} ~ {' + '.join(regressors)}"

    results_list = []

    for time_period in time_periods:
        # Restrict the sample to the current time period.
        data_time_period = data[data["time"] == time_period]

        results = smf.ols(formula=formula, data=data_time_period).fit()

        # Estimate, standard error and p-value of the treatment variable.
        coefficient = results.params[treatment_variable]
        std_error = results.bse[treatment_variable]
        p_value = results.pvalues[treatment_variable]

        # Group means of the outcome. The control mean uses untreated rows
        # only; averaging over all rows would mix treated observations into
        # the baseline and shrink the treatment-control difference.
        outcome = data_time_period[dependent_variable]
        treatment = data_time_period[treatment_variable]
        control_mean = outcome[treatment == 0].mean()
        difference_tc = outcome[treatment == 1].mean() - control_mean

        # NOTE(review): the mean of a demeaned column is zero up to
        # floating-point error, so this adjustment currently leaves the raw
        # difference essentially unchanged. The formula is kept as written
        # until the intended covariate adjustment is clarified.
        controls = data_time_period[control_variables]
        covariate_adjustment = (
            coefficient * (controls - controls.mean()).mean().sum()
        )
        difference_tc_controls = difference_tc - covariate_adjustment

        results_list.append(
            {
                "Time Period": time_period,
                "Control Mean": control_mean,
                "Difference T-C": difference_tc,
                "Difference T-C with Controls": difference_tc_controls,
                "Coefficient": coefficient,
                "Std. Error": std_error,
                "P-value": p_value,
                "Significance": _significance_stars(p_value),
            },
        )

    # One row per analysed time period.
    return pd.DataFrame(results_list)


# Example usage:

In [None]:
# Inputs for the per-period regressions.
dependent_variable = "wage_year"
treatment_variable = "interaction"
control_variables = ["FQ", "Reform", "WagePartner", "Individual"]
time_periods = ["t-2", "t-1", "t+1", "t+2", "t+3"]

# Run one regression per time period and collect the results table.
results = estimate_regression(
    data=data,
    dependent_variable=dependent_variable,
    treatment_variable=treatment_variable,
    control_variables=control_variables,
    time_periods=time_periods,
)

# Show the results table.
print(results)