In [None]:
import pandas as pd

# Load the test dataset that the rest of the notebook works on.
testdata_path = "src/difference_in_difference_with_deep_learning/data/testdata.csv"
data = pd.read_csv(testdata_path)

In [None]:
import yaml

# Read the column configuration used by the cells below (they look up the keys
# "categorical_columns", "causal_effect", "outcome", "control_columns", "time").
# The encoding is pinned so that decoding does not depend on the platform's
# locale default.
with open(
    "src/difference_in_difference_with_deep_learning/data_management/data_info.yaml",
    encoding="utf-8",
) as file:
    data_info = yaml.safe_load(file)

In [None]:
# Pull out the first two configured categorical columns as Series.
categorical_names = data_info["categorical_columns"]
data_select_1 = data[categorical_names[0]]
data_select_2 = data[categorical_names[1]]

In [None]:
# Build the interaction term as the element-wise product of the two selected
# columns and store it under the configured causal-effect column name.
interaction_term = data_select_1 * data_select_2
data.loc[:, data_info["causal_effect"]] = interaction_term

In [None]:
import statsmodels.formula.api as smf

In [None]:
def did_regression(data, data_info):
    """Fit an OLS regression on the full dataset and return its ``summary2()`` report.

    The outcome is regressed on the causal-effect column, every categorical
    column and every control column, all of which are named in ``data_info``.

    Parameters:
        data (DataFrame): The dataset containing all variables.
        data_info (dict): Must provide the keys ``"outcome"``,
            ``"causal_effect"``, ``"categorical_columns"`` and
            ``"control_columns"``.

    Returns:
        The object returned by ``results.summary2()`` for the fitted model.
    """
    # Look the four configuration entries up in one pass, in a fixed order.
    lhs, effect_term, categorical_terms, control_terms = (
        data_info[key]
        for key in (
            "outcome",
            "causal_effect",
            "categorical_columns",
            "control_columns",
        )
    )

    # Right-hand side first, then the complete "outcome ~ regressors" formula.
    regressors = f"{effect_term} + {' + '.join(categorical_terms)}  + {' + '.join(control_terms)}"
    model_formula = f"{lhs} ~ {regressors}"

    # Specify, fit and summarize in one chain.
    return smf.ols(formula=model_formula, data=data).fit().summary2()

## How to adapt the summary output to my needs (postponed)

In [None]:
# Fit the regression on the full dataset and print the plain-text rendering of
# its summary.
summary = did_regression(data, data_info)
print(summary.as_text())

In [None]:
# First table of the regression summary; later cells look up the
# "Adj. R-squared:" and "No. Observations:" rows in it.
data0 = summary.tables[0]

In [None]:
# Display the first summary table.
data0

In [None]:
# Pick the rows of the first summary table that carry the two fit statistics.
# The labels are searched for in every column instead of at a fixed position:
# the table holds more than one label/value pair per row, so the column a label
# ends up in depends on how many entries the summary contains.
adj_r_squared_row = data0.loc[(data0 == "Adj. R-squared:").any(axis=1)]
no_observations_row = data0.loc[(data0 == "No. Observations:").any(axis=1)]

In [None]:
# Second table of the regression summary (presumably the coefficient table,
# indexed by term name — confirm against the printed summary).
data1 = summary.tables[1]

# Rows to report, in display order.
# NOTE(review): these labels are hard-coded and presumably mirror the columns
# named in data_info — confirm they match the fitted model's term names.
index = ["Intercept", "interaction", "FQ", "Reform", "Age", "WagePartner"]

# Select the rows from the summary table `data1`, not from the raw dataset
# `data`, whose row index does not contain these labels. Labels that are missing
# from the table show up as all-NaN rows.
summary_df = data1.reindex(index).round(3)
print(summary_df)

In [None]:
# Stack the two fit-statistic rows underneath the coefficient rows and renumber
# the index from 0.
stacked_parts = [summary_df, adj_r_squared_row, no_observations_row]
summary_df = pd.concat(stacked_parts, ignore_index=True)

In [None]:
# Display the assembled summary table.
summary_df

In [None]:
# Render the assembled summary table as LaTeX source and print it.
latex_table = summary_df.to_latex()

print(latex_table)

In [None]:
# Print every table of the regression summary as LaTeX.
# The tables are handled as pandas DataFrames elsewhere in this notebook
# (`.loc` selection, `pd.concat`), so the DataFrame method `to_latex()` is used;
# `as_latex_tabular()` is not a DataFrame method.
for table in summary.tables:
    print(table.to_latex())

# Do the groupby properly

In [None]:
def estimate_regression(data, data_info):
    """Estimate one OLS regression per time period and summarize the results.

    For every distinct, non-missing value of the time column the data are
    restricted to that period, the outcome is regressed on the causal-effect
    column, the categorical columns and the control columns, and the estimate
    for the causal-effect term is collected together with simple group means.

    Parameters:
        data (DataFrame): The dataset containing all variables.
        data_info (dict): Dictionary containing data configuration information.
            The keys ``"time"``, ``"outcome"``, ``"causal_effect"``,
            ``"categorical_columns"`` and ``"control_columns"`` are read.

    Returns:
        DataFrame: One row per time period with the columns ``"Time Period"``,
        ``"Control Mean"`` (mean outcome of rows whose causal-effect value is
        not 1), ``"Difference T-C"`` (mean outcome of rows whose causal-effect
        value is 1, minus the control mean), ``"Difference T-C with Controls"``,
        ``"Coefficient"``, ``"Std. Error"`` and ``"P-value"`` (the last three
        refer to the causal-effect term). The frame has no rows or columns when
        there are no periods to estimate.
    """
    time_col = data_info["time"]
    outcome = data_info["outcome"]
    causal_effect = data_info["causal_effect"]
    control_columns = data_info["control_columns"]

    # The formula does not depend on the period, so it is built once.
    formula = (
        f"{outcome} ~ {causal_effect}"
        f" + {' + '.join(data_info['categorical_columns'])}"
        f" + {' + '.join(control_columns)}"
    )

    results_list = []

    # Missing time values are skipped: `== NaN` matches no row, which would hand
    # an empty subset to the regression.
    for time_period in data[time_col].dropna().unique():
        # Filter the data for the current time period
        data_time_period = data[data[time_col] == time_period]

        # Fit the regression model
        results = smf.ols(formula=formula, data=data_time_period).fit()

        # Extract coefficient, std. error, and p-value of the causal-effect term
        coefficient = results.params[causal_effect]
        std_error = results.bse[causal_effect]
        p_value = results.pvalues[causal_effect]

        # Split the period's outcomes by the causal-effect indicator so that the
        # control mean excludes the treated observations.
        is_treated = data_time_period[causal_effect] == 1
        period_outcome = data_time_period[outcome]
        control_mean = period_outcome[~is_treated].mean()

        # Difference between treatment and control groups
        difference_tc = period_outcome[is_treated].mean() - control_mean

        # NOTE(review): the controls are demeaned within the period before being
        # averaged, so this adjustment term is zero up to floating-point error
        # and the result is numerically the same as "Difference T-C" — confirm
        # the intended adjustment.
        period_controls = data_time_period[control_columns]
        centered_controls = period_controls - period_controls.mean()
        difference_tc_controls = (
            difference_tc - coefficient * centered_controls.mean().sum()
        )

        # Append the results to the list
        results_list.append(
            {
                "Time Period": time_period,
                "Control Mean": control_mean,
                "Difference T-C": difference_tc,
                "Difference T-C with Controls": difference_tc_controls,
                "Coefficient": coefficient,
                "Std. Error": std_error,
                "P-value": p_value,
            },
        )

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(results_list)

In [None]:
# Run the per-period regressions; the result has one row per time period.
result_df = estimate_regression(data, data_info)

In [None]:
# Display the per-period regression results.
result_df

In [None]:
import matplotlib.pyplot as plt

# Work on an independent copy: the categorical conversion below must not leak
# back into `data`, which other cells reuse.
df = data.copy()

time_order = ["t-2", "t-1", "t+1", "t+2", "t+3"]

# Convert the time column to an ordered categorical so that periods sort in the
# order given above; labels not listed in `time_order` become NaN.
df["time"] = pd.Categorical(df["time"], categories=time_order, ordered=True)

# Average wage for each (FQ, Reform) group in each period. `observed=False` is
# stated explicitly so that every period category stays in the result,
# independent of the installed pandas version's default.
grouped = (
    df.groupby(["FQ", "Reform", "time"], observed=False)["wage_year"]
    .mean()
    .reset_index()
)

# Separate data for the two groups
group0 = grouped[(grouped["FQ"] == 0) & (grouped["Reform"] == 0)]
group1 = grouped[(grouped["FQ"] == 1) & (grouped["Reform"] == 1)]

# Counterfactual: the group-0 series shifted by the gap between the two groups'
# average wages.
# NOTE(review): the gap is averaged over all periods, including the "t+…"
# ones — confirm whether only the "t-…" periods should be used.
level_gap = group1["wage_year"].mean() - group0["wage_year"].mean()
counterfactual = group0.assign(wage_year=group0["wage_year"] + level_gap)

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(group0["time"], group0["wage_year"], label="Group 0", marker="o")
ax.plot(group1["time"], group1["wage_year"], label="Group 1", marker="o")
ax.plot(
    counterfactual["time"],
    counterfactual["wage_year"],
    label="Counterfactual",
    linestyle="--",
    marker="o",
)

# Add labels and legend
ax.set_xlabel("Time")
ax.set_ylabel("Average Wage")
ax.set_title("Difference-in-Differences Plot")
ax.legend()
ax.grid(True)

# Show plot
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()