In [None]:
# %% imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import patsy

# %% 1) load, sort, diff, histogram
# Read the scored essays, order rows by (essay_id, k), and add the per-row
# difference between the two score columns.
path = "../cleaned_data/results/no_desc_scored_full_testing.csv"
df = pd.read_csv(path, encoding="ISO-8859-1").sort_values(["essay_id", "k"])
df["diff"] = df["score_high_full"] - df["score_low_full"]

# Distribution of the difference; rows where it is missing are left out.
fig, ax = plt.subplots()
ax.hist(df["diff"].dropna(), bins=30)
ax.set_title("Histogram of diff = score_high - score_low")
ax.set_xlabel("diff")
ax.set_ylabel("count")
plt.show()


In [None]:
import seaborn as sns

import pandas as pd
import matplotlib.pyplot as plt

# Overlay one semi-transparent histogram of score_high_full per distinct k.
fig, ax = plt.subplots()
for cat in df["k"].unique():
    scores_for_k = df.loc[df["k"] == cat, "score_high_full"]
    ax.hist(scores_for_k, bins=100, alpha=0.5, label=cat)

ax.legend()
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram by Category')
plt.show()

# Same comparison as smoothed densities, one curve per k.
fig_kde, ax_kde = plt.subplots()
sns.kdeplot(data=df, x="score_high_full", hue="k", ax=ax_kde)
plt.show()


In [None]:
import pandas as pd
from pyfixest.estimation import feols

# Two-way fixed-effects regression on the frame loaded above; the pyfixest
# counterpart of Stata's:  reghdfe score_high_full, absorb(essay_id k)
res = feols("score_high_full ~ 1 | essay_id + k", data=df)
# summary is a method that prints the table itself; print(res.summary) would
# only show the bound-method repr.
res.summary()

# fixef() returns one {level: estimate} mapping per absorbed factor, keyed
# "C(<factor>)". Attach each estimate to the rows it belongs to.
# NOTE(review): confirm the level keys have the same dtype as the df columns
# (e.g. not strings vs. ints); a mismatch would make .map() return all NaN.
fes = res.fixef()
df["fe_essay"] = df["essay_id"].map(fes["C(essay_id)"])
# The k effects must be looked up by k (the original looked them up by essay_id).
df["fe_k"] = df["k"].map(fes["C(k)"])


In [None]:

# %% helper: safe mean over a boolean mask
def mean_mask(s, mask):
    """Mean of the non-missing values of ``s`` selected by ``mask``.

    Parameters
    ----------
    s : pandas.Series
        Values to average.
    mask : boolean Series (or array) aligned with ``s``
        True marks the rows to include.

    Returns
    -------
    float
        The mean of the selected non-NaN values, or ``np.nan`` when nothing
        is selected.
    """
    keep = mask & s.notna()
    selected = s[keep]
    if len(selected) == 0:
        return np.nan
    return float(selected.mean())

# %% --- Block A: "total gap"
# Group 0: low_SES == 0 rows at k == 0, valued by score_high_full.
# Group 1: low_SES == 1 rows at k == 0, valued by score_low_full.
# Rows whose relevant score is missing belong to neither group.
is_k0 = df["k"].eq(0)
mA = df["low_SES"].eq(0) & is_k0 & df["score_high_full"].notna()
mB = df["low_SES"].eq(1) & is_k0 & df["score_low_full"].notna()

# One value column and one group label per row; NaN where neither mask applies.
tmp_vals = pd.Series(np.nan, index=df.index)
tmp_vals.loc[mA] = df.loc[mA, "score_high_full"].astype(float)
tmp_vals.loc[mB] = df.loc[mB, "score_low_full"].astype(float)

group = pd.Series(np.nan, index=df.index)
group.loc[mA] = 0
group.loc[mB] = 1

m0 = group.eq(0)
m1 = group.eq(1)

# Original author's note: the group means of df['true_score'] by low_SES can
# serve as an additional check on these numbers (besides the bin graph).
mean0, mean1 = (mean_mask(tmp_vals, m) for m in (m0, m1))
totalgap = mean0 - mean1

print(f"toal gap is {totalgap:9.3f}")  # (spelled as in your Stata display)

# %% --- Block B: "content gap" using __hdfe1__ at k==0, by low_ses
# (__hdfe1__ is the Stata name; here the essay fixed effect is df["fe_essay"].)
mK0 = df["k"].eq(0)
high_ses_k0 = mK0 & df["low_SES"].eq(0)
low_ses_k0 = mK0 & df["low_SES"].eq(1)

# Content gap: difference in mean essay fixed effect between the SES groups.
content_mean0 = mean_mask(df["fe_essay"], high_ses_k0)
content_mean1 = mean_mask(df["fe_essay"], low_ses_k0)
content_gap = content_mean0 - content_mean1

print(f"content gap is {content_gap:9.3f}")
print(f"share gap is {content_gap/totalgap:9.3f}")

# Style gap: same comparison on what is left of score_high_full after
# removing the essay fixed effect.
df["u"] = df["score_high_full"] - df["fe_essay"]
u_mean0 = mean_mask(df["u"], high_ses_k0)
u_mean1 = mean_mask(df["u"], low_ses_k0)
style_gap = u_mean0 - u_mean1
print('===============')

print(f"style gap is {style_gap:9.3f}")
print(f"share gap is {style_gap/totalgap:9.3f}")
print('===============')

# "Others" gap: mean score_high_full minus mean score_low_full at k == 0.
# NOTE(review): m_last selects the low_SES == 0 rows. If content + style +
# others is meant to add up to the total gap, confirm this is the intended group.
m_last = df["low_SES"].eq(0) & mK0
sh_mean, sl_mean = (
    mean_mask(df[col], m_last) for col in ("score_high_full", "score_low_full")
)
others_gap = sh_mean - sl_mean

# The first two lines report the same number under two labels.
print(f"diff is {others_gap:9.3f}")
print(f"others gap is {others_gap:9.3f}")
print(f"share gap is {others_gap/totalgap:9.3f}")
print()

In [None]:
# Drop the stray "Unnamed: 0" column if it is present. errors="ignore" keeps
# this cell safe to re-run and safe for inputs that never had the column;
# the in-place version raised KeyError in both cases.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Persist the frame with the decomposition columns added in earlier cells.
# NOTE(review): to_csv writes the index by default, so the saved file gets an
# unnamed first column again; pass index=False if downstream readers do not
# expect it.
df.to_csv("../cleaned_data/results/decomp_no_desc.csv")

In [None]:
# Preview the first rows of the frame as the cell's displayed output.
df.head(5)