Skip to content

Commit

Permalink
Merge pull request #8 from OpenSourceEconomics/newstruct_emily
Browse files Browse the repository at this point in the history
Separated project descriptions from project examples.
  • Loading branch information
s6emschw committed Oct 1, 2021
2 parents 8852ada + 7772a44 commit 9818d15
Show file tree
Hide file tree
Showing 12 changed files with 228 additions and 87 deletions.
4 changes: 3 additions & 1 deletion frequently-asked-questions/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ Some journals provide the data for their published articles as data supplements
What are other useful resources for research data?
==================================================

There is a tremendous amount of data available online. For example, MDRC provides a host of data files for public use `here <https://www.mdrc.org/available-public-use-files>`_ from the evaluation of public policy initiatives. More generally, `Google Dataset Search <https://datasetsearch.research.google.com>`_ allows to look for all kinds of online data.
There is a tremendous amount of data available online. For example, MDRC provides a host of data files for public use `here <https://www.mdrc.org/available-public-use-files>`_ from the evaluation of public policy initiatives. The `UC Irvine Machine Learning Repository <https://archive-beta.ics.uci.edu/>`_ also maintains several hundred datasets. More generally, `Google Dataset Search <https://datasetsearch.research.google.com>`_ allows you to look for all kinds of online data.

A primer on finding data is available `here <https://sebastiantellotrillo.com/resources/primer-where-to-find-data>`_ on the personal website of `Prof. Sebastian Tello-Trillo <https://sebastiantellotrillo.com/>`_.

Several of the textbooks listed above provide an impressive amount of data from research articles. We collect these datasets in a central place `online <https://github.com/OpenSourceEconomics/ose-course-projects/tree/main/datasets>`__.

Expand Down
1 change: 1 addition & 0 deletions index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Our two courses `OSE data science <https://ose-data-science.readthedocs.io>`_ an
:maxdepth: 1

projects/index
sample-projects/index
repository-template/index
reproducibility/index
frequently-asked-questions/index
Expand Down
21 changes: 16 additions & 5 deletions projects/Angrist_Krueger_1991/auxiliary/data_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def get_df_census80():
"YOB",
]

df = pd.read_csv(FILE_PATH_CENSUS80_EXTRACT, sep=" ", usecols=cols, names=cols_names)
df = pd.read_csv(
FILE_PATH_CENSUS80_EXTRACT, sep=" ", usecols=cols, names=cols_names
)

# correct AGEQ
df.loc[df["CENSUS"] == 80, "AGEQ"] = df["AGEQ"] - 1900
Expand Down Expand Up @@ -281,7 +283,16 @@ def get_further_exogenous_regressors(race=True, smsa=True, married=True):


def get_region_of_residence_dummies():
return ["NEWENG", "MIDATL", "ENOCENT", "WNOCENT", "SOATL", "ESOCENT", "WSOCENT", "MT"]
return [
"NEWENG",
"MIDATL",
"ENOCENT",
"WNOCENT",
"SOATL",
"ESOCENT",
"WSOCENT",
"MT",
]


def get_education_name():
Expand Down Expand Up @@ -318,9 +329,9 @@ def add_detrended_educational_variables(df, educ_vars=("EDUC")):

for yob in set(df["YOB"]):
for qob in set(df["QOB"]):
df.loc[(df["YOB"] == yob) & (df["QOB"] == qob), f"MV_AVG_{ev}"] = mean_ev.loc[
(yob, qob), "MV_AVG"
]
df.loc[
(df["YOB"] == yob) & (df["QOB"] == qob), f"MV_AVG_{ev}"
] = mean_ev.loc[(yob, qob), "MV_AVG"]

df[f"DTRND_{ev}"] = df[ev] - df[f"MV_AVG_{ev}"]

Expand Down
8 changes: 6 additions & 2 deletions projects/Angrist_Krueger_1991/auxiliary/figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def plot_educ_against_yob(df):

points = np.array(list(zip(x_values, y_values)))[i::4]

ax.scatter(points[:, 0], points[:, 1], marker="s", s=34, color=colors[i], label=i + 1)
ax.scatter(
points[:, 0], points[:, 1], marker="s", s=34, color=colors[i], label=i + 1
)

ax.set_xlabel("Year of Birth")
ax.set_ylabel("Years Of Completed Education")
Expand Down Expand Up @@ -68,7 +70,9 @@ def plot_log_wkly_earnings_by_qob(df):
for i in range(4):

points = np.array(list(zip(x_values, y_values)))[i::4]
ax.scatter(points[:, 0], points[:, 1], marker="s", s=34, color=colors[i], label=i + 1)
ax.scatter(
points[:, 0], points[:, 1], marker="s", s=34, color=colors[i], label=i + 1
)

ax.set_xlabel("Year of Birth")
ax.set_ylabel("Log Weekly Earnings")
Expand Down
85 changes: 62 additions & 23 deletions projects/Angrist_Krueger_1991/auxiliary/regressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def get_regression_results_educational_variables(educ_vars, cohorts):
"cohort": chrt_name,
"mean": chrt[ev].mean(),
"ols": smf.ols(
formula=f"DTRND_{ev} ~ DUMMY_QOB_1 + DUMMY_QOB_2 + DUMMY_QOB_3", data=chrt
formula=f"DTRND_{ev} ~ DUMMY_QOB_1 + DUMMY_QOB_2 + DUMMY_QOB_3",
data=chrt,
).fit(),
}
)
Expand Down Expand Up @@ -268,7 +269,9 @@ def get_regression_results_ols_tsls(df, state_of_birth_dummies=False, race=True)

if state_of_birth_dummies:
formula_2nd_stage_8 += " + "
formula_2nd_stage_8 += " + ".join([f"DUMMY_STATE_{i}" for i in set(df["STATE"])])
formula_2nd_stage_8 += " + ".join(
[f"DUMMY_STATE_{i}" for i in set(df["STATE"])]
)

tsls_8 = smf.ols(formula=formula_2nd_stage_8, data=df).fit()

Expand All @@ -294,7 +297,9 @@ def __init__(self, regressionResult):
self.params = regressionResult.params
self.bse = regressionResult.bse if hasattr(regressionResult, "bse") else None
self.std_errors = (
regressionResult.std_errors if hasattr(regressionResult, "std_errors") else None
regressionResult.std_errors
if hasattr(regressionResult, "std_errors")
else None
)


Expand All @@ -304,7 +309,9 @@ def IV2SLS_wrapper(dependent, exog, endog, instruments, small_rslt=False):
# try to run the IV2SLS method without mocking the validation
try:
if small_rslt:
rslt = SmallRegressionResult(IV2SLS(dependent, exog, endog, instruments).fit())
rslt = SmallRegressionResult(
IV2SLS(dependent, exog, endog, instruments).fit()
)
else:
rslt = IV2SLS(dependent, exog, endog, instruments).fit()
except ValueError as e:
Expand All @@ -313,7 +320,9 @@ def IV2SLS_wrapper(dependent, exog, endog, instruments, small_rslt=False):
# run the IV2SLS method while mocking the validation
with mock.patch("linearmodels.iv.model._IVModelBase._validate_inputs"):
if small_rslt:
rslt = SmallRegressionResult(IV2SLS(dependent, exog, endog, instruments).fit())
rslt = SmallRegressionResult(
IV2SLS(dependent, exog, endog, instruments).fit()
)
else:
rslt = IV2SLS(dependent, exog, endog, instruments).fit()

Expand All @@ -326,7 +335,9 @@ def IV2SLS_using_ols(dependent, exog, endog, instruments, small_rslt=False):
if endog is not None and instruments is not None:
# predict the endog, using the results from first stage
endog_pred = pd.Series(
data=OLS(endog=endog, exog=pd.concat((exog, instruments), axis=1)).fit().predict(),
data=OLS(endog=endog, exog=pd.concat((exog, instruments), axis=1))
.fit()
.predict(),
name=f"{endog.columns[0]}",
)
# run the second stage, effect of the predicted endog on dependent controlling for exog
Expand All @@ -335,7 +346,9 @@ def IV2SLS_using_ols(dependent, exog, endog, instruments, small_rslt=False):
OLS(endog=dependent, exog=pd.concat((exog, endog_pred), axis=1)).fit()
)
else:
rslt = OLS(endog=dependent, exog=pd.concat((exog, endog_pred), axis=1)).fit()
rslt = OLS(
endog=dependent, exog=pd.concat((exog, endog_pred), axis=1)
).fit()

else:
if small_rslt:
Expand All @@ -350,15 +363,19 @@ def IVLIML_wrapper(dependent, exog, endog, instruments, small_rslt=False):

try:
if small_rslt:
rslt = SmallRegressionResult(IVLIML(dependent, exog, endog, instruments).fit())
rslt = SmallRegressionResult(
IVLIML(dependent, exog, endog, instruments).fit()
)
else:
rslt = IVLIML(dependent, exog, endog, instruments).fit()
except ValueError as e:
print(str(e))

with mock.patch("linearmodels.iv.model._IVModelBase._validate_inputs"):
if small_rslt:
rslt = SmallRegressionResult(IVLIML(dependent, exog, endog, instruments).fit())
rslt = SmallRegressionResult(
IVLIML(dependent, exog, endog, instruments).fit()
)
else:
rslt = IVLIML(dependent, exog, endog, instruments)

Expand All @@ -375,7 +392,9 @@ def run_specification_iv2sls(df, specification, small_rslt=True):

if endg and instr:
try:
rslt = IV2SLS_using_ols(df[dpnd], df[exg], df[endg], df[instr], small_rslt)
rslt = IV2SLS_using_ols(
df[dpnd], df[exg], df[endg], df[instr], small_rslt
)
except MemoryError as e:
print(str(e))
break
Expand All @@ -401,7 +420,9 @@ def run_specification_ivliml(df, specification, small_rslt=True):

if endg and instr:
try:
rslt = IVLIML_wrapper(df[dpnd], df[exg], df[endg], df[instr], small_rslt)
rslt = IVLIML_wrapper(
df[dpnd], df[exg], df[endg], df[instr], small_rslt
)
except MemoryError as e:
print(str(e))
break
Expand Down Expand Up @@ -491,7 +512,9 @@ def get_specification_table_4_5_6():

# regression (1)
dependent.append(get_log_weekly_wage_name())
exog.append(get_constant_name() + get_education_name() + get_year_of_birth_dummy_names())
exog.append(
get_constant_name() + get_education_name() + get_year_of_birth_dummy_names()
)
endog.append(None)
instruments.append(None)
# regression (2)
Expand All @@ -511,7 +534,9 @@ def get_specification_table_4_5_6():
instruments.append(None)
# regression (4)
dependent.append(get_log_weekly_wage_name())
exog.append(get_constant_name() + get_year_of_birth_dummy_names() + get_age_control_names())
exog.append(
get_constant_name() + get_year_of_birth_dummy_names() + get_age_control_names()
)
endog.append(get_education_name())
instruments.append(get_qob_yob_interaction_names())
# regression (5)
Expand Down Expand Up @@ -585,7 +610,8 @@ def get_specification_table_7_8(state_list, race=True):
)
endog.append(get_education_name())
instruments.append(
get_qob_yob_interaction_names() + get_qob_state_of_birth_interaction_names(state_list)
get_qob_yob_interaction_names()
+ get_qob_state_of_birth_interaction_names(state_list)
)
# regression (3)
dependent.append(get_log_weekly_wage_name())
Expand All @@ -608,7 +634,8 @@ def get_specification_table_7_8(state_list, race=True):
)
endog.append(get_education_name())
instruments.append(
get_qob_yob_interaction_names() + get_qob_state_of_birth_interaction_names(state_list)
get_qob_yob_interaction_names()
+ get_qob_state_of_birth_interaction_names(state_list)
)
# regression (5)
dependent.append(get_log_weekly_wage_name())
Expand All @@ -633,7 +660,8 @@ def get_specification_table_7_8(state_list, race=True):
)
endog.append(get_education_name())
instruments.append(
get_qob_yob_interaction_names() + get_qob_state_of_birth_interaction_names(state_list)
get_qob_yob_interaction_names()
+ get_qob_state_of_birth_interaction_names(state_list)
)
# regression (7)
dependent.append(get_log_weekly_wage_name())
Expand All @@ -660,7 +688,8 @@ def get_specification_table_7_8(state_list, race=True):
)
endog.append(get_education_name())
instruments.append(
get_qob_yob_interaction_names() + get_qob_state_of_birth_interaction_names(state_list)
get_qob_yob_interaction_names()
+ get_qob_state_of_birth_interaction_names(state_list)
)

return dependent, exog, endog, instruments
Expand Down Expand Up @@ -711,7 +740,9 @@ def get_specification_weak_instruments_table_1():
+ get_region_of_residence_dummies()
)
endog.append(get_education_name())
instruments.append(get_quarter_of_birth_dummy_names() + get_qob_yob_interaction_names())
instruments.append(
get_quarter_of_birth_dummy_names() + get_qob_yob_interaction_names()
)
# regression (5)
dependent.append(get_log_weekly_wage_name())
exog.append(
Expand All @@ -734,7 +765,9 @@ def get_specification_weak_instruments_table_1():
+ get_region_of_residence_dummies()
)
endog.append(get_education_name())
instruments.append(get_quarter_of_birth_dummy_names() + get_qob_yob_interaction_names())
instruments.append(
get_quarter_of_birth_dummy_names() + get_qob_yob_interaction_names()
)

return dependent, exog, endog, instruments

Expand Down Expand Up @@ -814,7 +847,9 @@ def get_specification_mstly_hrmlss_ecnmtrcs_table_4_6_2(state_list):
instruments.append(get_quarter_of_birth_dummy_names())
# regression (2)
dependent.append(get_log_weekly_wage_name())
exog.append(get_constant_name() + get_year_of_birth_dummy_names() + get_age_control_names())
exog.append(
get_constant_name() + get_year_of_birth_dummy_names() + get_age_control_names()
)
endog.append(get_education_name())
instruments.append(get_quarter_of_birth_dummy_names())
# regression (3)
Expand All @@ -824,7 +859,9 @@ def get_specification_mstly_hrmlss_ecnmtrcs_table_4_6_2(state_list):
instruments.append(get_qob_yob_interaction_names())
# regression (4)
dependent.append(get_log_weekly_wage_name())
exog.append(get_constant_name() + get_year_of_birth_dummy_names() + get_age_control_names())
exog.append(
get_constant_name() + get_year_of_birth_dummy_names() + get_age_control_names()
)
endog.append(get_education_name())
instruments.append(get_qob_yob_interaction_names())
# regression (5)
Expand All @@ -836,7 +873,8 @@ def get_specification_mstly_hrmlss_ecnmtrcs_table_4_6_2(state_list):
)
endog.append(get_education_name())
instruments.append(
get_qob_yob_interaction_names() + get_qob_state_of_birth_interaction_names(state_list)
get_qob_yob_interaction_names()
+ get_qob_state_of_birth_interaction_names(state_list)
)
# regression (6)
dependent.append(get_log_weekly_wage_name())
Expand All @@ -848,7 +886,8 @@ def get_specification_mstly_hrmlss_ecnmtrcs_table_4_6_2(state_list):
)
endog.append(get_education_name())
instruments.append(
get_qob_yob_interaction_names() + get_qob_state_of_birth_interaction_names(state_list)
get_qob_yob_interaction_names()
+ get_qob_state_of_birth_interaction_names(state_list)
)

return dependent, exog, endog, instruments

0 comments on commit 9818d15

Please sign in to comment.