# Imports and Setups

In [1]:
import plotly.express as px
import plotly.io as pio

#pio.renderers.default = "iframe_connected"
pio.renderers.default = "vscode"


In [2]:
from pathlib import Path
import sys

# Go two levels up: from notebooks/ -> parenthood_europe/ -> CodingProjects/
project_root = Path().resolve().parents[1]
sys.path.insert(0, str(project_root))
sys.path.append (str(project_root)+"/libs")


In [3]:
list(Path().resolve().parents)


[PosixPath('/Users/Paula_1/CodingProjects/parenthood/parenthood_europe/notebooks'),
 PosixPath('/Users/Paula_1/CodingProjects/parenthood/parenthood_europe'),
 PosixPath('/Users/Paula_1/CodingProjects/parenthood'),
 PosixPath('/Users/Paula_1/CodingProjects'),
 PosixPath('/Users/Paula_1'),
 PosixPath('/Users'),
 PosixPath('/')]

In [4]:
(str(project_root)+"/libs")


'/Users/Paula_1/CodingProjects/parenthood/parenthood_europe/libs'

In [5]:
from scripts.parse_survey_data import load_survey_data, load_survey_data_and_meta
from libs.questions.numeric import NumericQuestion
from libs.questions.single_choice import SingleChoiceQuestion
from libs.questions.multiple_choice import MultipleChoiceQuestion
from libs.questions.matrix import MatrixQuestion

In [6]:
from pathlib import Path
print("Current working directory:", Path().resolve())


Current working directory: /Users/Paula_1/CodingProjects/parenthood/parenthood_europe/notebooks/testing


# General Data

In [7]:
import pandas as pd
df_academia = pd.read_excel("../../data/original_raw_data/Parenthood in Academia_November 7, 2024_15.52.xlsx")
df_academia.head()


Workbook contains no default style, apply openpyxl's default



Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,LA2,LA3,LA4a,LA4b,C1,C1_3_TEXT,C1_4_TEXT,C1_5_TEXT,C2,gender
0,Start Date,End Date,Response Type,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Distribution Channel,User Language,...,How would you estimate the percentage of women...,Have you considered leaving academia?,Please briefly elaborate on your reasons for c...,Please briefly elaborate on your reasons for l...,How did you hear about this survey? - Selected...,How did you hear about this survey? - Social m...,How did you hear about this survey? - Mailing ...,How did you hear about this survey? - Other (p...,Do you have any other comments related to this...,gender
1,2024-03-25 15:45:50,2024-03-25 15:58:19,0,100,748,1,2024-03-25 15:58:20.398000,R_2jpMi7QsRScengv,email,EN,...,,,,,,,,,,Woman
2,2024-03-26 17:00:29,2024-03-26 17:17:43,0,100,1034,1,2024-03-26 17:17:44.908000,R_4ScOupmxBzBk7dL,email,EN,...,,,,,1,,,,,Man
3,2024-03-29 16:09:27,2024-03-29 16:39:04,0,100,1777,1,2024-03-29 16:39:07.534000,R_77h3lBm8zYTS5a1,email,EN,...,3,3,"The path demands significant sacrifices, espec...",,1,,,,The questions about # of papers expected or pr...,Woman
4,2024-04-16 12:45:38,2024-04-16 12:55:21,0,100,583,1,2024-04-16 12:55:22.733000,R_2V4kjf4gOQNH6EL,anonymous,EN,...,4,4,,,1,,,,,Woman


In [8]:
df_academia.iloc[[0, 1, 2]]

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,LA2,LA3,LA4a,LA4b,C1,C1_3_TEXT,C1_4_TEXT,C1_5_TEXT,C2,gender
0,Start Date,End Date,Response Type,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Distribution Channel,User Language,...,How would you estimate the percentage of women...,Have you considered leaving academia?,Please briefly elaborate on your reasons for c...,Please briefly elaborate on your reasons for l...,How did you hear about this survey? - Selected...,How did you hear about this survey? - Social m...,How did you hear about this survey? - Mailing ...,How did you hear about this survey? - Other (p...,Do you have any other comments related to this...,gender
1,2024-03-25 15:45:50,2024-03-25 15:58:19,0,100,748,1,2024-03-25 15:58:20.398000,R_2jpMi7QsRScengv,email,EN,...,,,,,,,,,,Woman
2,2024-03-26 17:00:29,2024-03-26 17:17:43,0,100,1034,1,2024-03-26 17:17:44.908000,R_4ScOupmxBzBk7dL,email,EN,...,,,,,1,,,,,Man


In [9]:
print("Earliest StartDate:", df_academia["StartDate"].iloc[1:].min())
print("Latest EndDate:", df_academia["EndDate"].iloc[1:].max())

Earliest StartDate: 2024-03-25 15:45:50
Latest EndDate: 2024-08-20 14:14:17


In [10]:
df_academia.iloc[[0]]

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,LA2,LA3,LA4a,LA4b,C1,C1_3_TEXT,C1_4_TEXT,C1_5_TEXT,C2,gender
0,Start Date,End Date,Response Type,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Distribution Channel,User Language,...,How would you estimate the percentage of women...,Have you considered leaving academia?,Please briefly elaborate on your reasons for c...,Please briefly elaborate on your reasons for l...,How did you hear about this survey? - Selected...,How did you hear about this survey? - Social m...,How did you hear about this survey? - Mailing ...,How did you hear about this survey? - Other (p...,Do you have any other comments related to this...,gender


The dataframe seems to store the same content he in the first data row iloc[0] as in the col name row, which contains the question ID in the col name row and the Question posed (text). We must therefore begin to count the respondent ID from row index iloc[1].

In [11]:
unique_ids = df_academia.loc[1:, "ResponseId"].nunique()
print("Number of unique respondent IDs (from row 1):", unique_ids)

Number of unique respondent IDs (from row 1): 10071


# Numeric Child Question 

In [14]:
# df only containing questions that were categorized as numeric to improove speed
df_numeric, meta_numeric = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QNumeric.xlsx")
print(meta_numeric["DE1"])


In what year were you born?


In [15]:
print(set(type(col) for col in df_numeric.columns))
print(set(type(col) for col in meta_numeric))


{<class 'str'>}
{<class 'str'>}


In [16]:
print(type(df_numeric.loc[3000, "DE1"]))
print(type(df_numeric.loc[2222, "DE1"]))
print(type(df_numeric.loc[222, "DE1"]))




<class 'int'>
<class 'int'>
<class 'float'>


In [17]:
print("Number of rows in df:", len(df_numeric))


Number of rows in df: 10071


In [18]:
#df_numeric, meta_numeric = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QNumeric.xlsx")

q1 = NumericQuestion("DE1", df_numeric, meta_numeric)
fig1 = q1.distribution(display=False)
fig1.show()

#pio.write_html(fig1, "DE1.html", include_plotlyjs='cdn', full_html=False)

In [19]:
q10 = NumericQuestion("DE10", df_numeric, meta_numeric)
fig10 = q10.distribution(display=False)
fig10.show()

#pio.write_html(fig10, "DE10.html", include_plotlyjs='cdn', full_html=False)

In [21]:
print([col for col in df_numeric.columns if col.startswith("DE10")])


['DE10_1', 'DE10_3', 'DE10_4', 'DE10_5', 'DE10_6', 'DE10_7', 'DE10_9']


In [22]:
numeric_question_ids = ["DE1", "DE10", "DE11", "DE22"]
for qid in numeric_question_ids:
    q = NumericQuestion(qid, df_numeric, meta_numeric)
    fig = q.distribution(display=False)
    if fig is not None:
        fig.show()


# Single Choice Question

In [23]:
df_single, meta_single = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QSingle.xlsx")


In [24]:
single_choice_question_ids = ["DE2", "DE4", "DE5", "DE6", "DE8", "DE12", "DE17", "DE18", "DE19", "DE20", "DE21", "PL3", "PL5", "PL8", "PL10", "CS2"]
figs = []
for qid in single_choice_question_ids:
    q = SingleChoiceQuestion(qid, df_single, meta_single)
    fig = q.distribution(display=False)
    figs.append(fig)

for fig in figs:
    if fig is not None:
        fig.show()



In [25]:
print("Column DE2 type:", df_single["DE2"].dtype)
print("Unique values in DE2:", df_single["DE2"].unique())


Column DE2 type: object
Unique values in DE2: [1 2 nan 3 4]


In [26]:
value = df_single["DE2"].iloc[3]  # 3 means the 4th row (0-based indexing)
print("Value at row 3, col 'DE2':", value)
print("Type of that value:", type(value))


Value at row 3, col 'DE2': 1
Type of that value: <class 'int'>


# Multiple Choice Question

In [27]:
df_multiple, meta_multiple = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QMultiple.xlsx")
print(meta_multiple["DE24"])

Could you please share the primary reason for not having children? - Selected Choice


In [29]:
value = df_multiple["DE3"].iloc[3]  
print("Value at row 3, col 'DE3':", value)
print("Type of that value:", type(value))


Value at row 3, col 'DE3': 11
Type of that value: <class 'str'>


In [30]:
# MultipleChoice Question

#df_multiple, meta_multiple = load_survey_data_and_meta(file_path="../../data/data_classified_by_question_type/parenthood_test_QMultiple.xlsx") #TODO: how to plot multiple choice questions?

multiple_choice_question_ids = ["DE3", "DE7", "DE9", "DE24"] 

for qid in multiple_choice_question_ids:
    q = MultipleChoiceQuestion(qid, df_multiple, meta_multiple)
    fig = q.distribution(display=False)
    #if qid in {"DE3"}: fig.write_html(f"{qid}_plot.html")
    if fig is not None:
        fig.show()


# Matrix Question 

## PL2

In [12]:
df_matrix, meta_matrix = load_survey_data_and_meta(file_path="../../data/data_classified_by_question_type/parenthood_test_QMatrix.xlsx")

display(df_matrix.head(5))
display(meta_matrix)

Unnamed: 0,ResponseId,DE13c,DE13c_1_TEXT,DE13c_2_TEXT,DE13c_3_TEXT,DE13c_4_TEXT,DE13c_5_TEXT,DE13c_6_TEXT,DE13c_7_TEXT,DE14_1,...,LA2,LA3,LA4a,LA4b,C1,C1_3_TEXT,C1_4_TEXT,C1_5_TEXT,C2,gender
0,R_2jpMi7QsRScengv,,,,,,,,,1.0,...,,,,,,,,,,Woman
1,R_4ScOupmxBzBk7dL,,,,,,,,,2.0,...,,,,,1.0,,,,,Man
2,R_77h3lBm8zYTS5a1,,,,,,,,,1.0,...,3.0,3.0,"The path demands significant sacrifices, espec...",,1.0,,,,The questions about # of papers expected or pr...,Woman
3,R_2V4kjf4gOQNH6EL,1234.0,15.0,224.0,9.0,8.0,,,,1.0,...,4.0,4.0,,,1.0,,,,,Woman
4,R_2tWf358NIjM0gSd,,,,,,,,,,...,,,,,,,,,,


{'ResponseId': 'Response ID',
 'DE13c': 'We acknowledge your privacy considerations. Would you be willing to share\nat least the following metrics regarding your academic advancement? This\nallows us to gain insights into your academic standing and accomplishments\nwithout directly connecting them to your personal information. You may use\nthe statistics shown in any of your academic profiles, e.g., Google Scholar, Research Gate, Web of Science, Scopus, ORCID, or OpenAlex.\n\nThank you. - Selected Choice',
 'DE13c_1_TEXT': 'We acknowledge your privacy considerations. Would you be willing to share\nat least the following metrics regarding your academic advancement? This\nallows us to gain insights into your academic standing and accomplishments\nwithout directly connecting them to your personal information. You may use\nthe statistics shown in any of your academic profiles, e.g., Google Scholar, Research Gate, Web of Science, Scopus, ORCID, or OpenAlex.\n\nThank you. - Total number of p

Let us remove the useless first two rows in df_raw

In [13]:
columns_to_check = ["PL2_1_4", "PL2_2_4", "PL2_3_4", "PL2_4_4", "PL2_5_4"]
contains_99 = df_matrix[columns_to_check].isin([99]).any().any()
print("Does any of the specified columns contain the value 99?", contains_99)

Does any of the specified columns contain the value 99? True


In [14]:
rows_with_99 = df_matrix[df_matrix[columns_to_check].isin([99]).any(axis=1)]
rows_with_99_filtered = rows_with_99[columns_to_check]
display(rows_with_99_filtered)

Unnamed: 0,PL2_1_4,PL2_2_4,PL2_3_4,PL2_4_4,PL2_5_4
2530,99,99,99,99,99


This value doesn't make sense, so we replace it with 0

In [15]:
df_matrix[columns_to_check] = df_matrix[columns_to_check].replace(99, 0)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [None]:
# MatrixChoice Question
# true identification of the parent (their gender or identity) only exists because anchored with DE14. We assume in further questions they will keep the order to answer to the gender to parent 1 and gender of parent 2.

df_matrix, meta_matrix = load_survey_data(file_path="../../data/data_classified_by_question_type/parenthood_test_QMatrix.xlsx")

gender_lookup = {}

multiple_choice_question_ids = ["DE14", "DE15", "DE16", "DE23"]# , "PL1","PL2"]#, "PL4", "PL6", "PL7", "PL9", "CS1", "CS3", "CC1", "CC2", "GB4", "GB5", ] #DE23?

for qid in multiple_choice_question_ids:
    q = MatrixQuestion(qid, df_matrix, meta_matrix, gender_lookup=gender_lookup)
    fig = q.distribution(display=False)
    if fig is not None:
        fig.show()


In [None]:
gender_questions = {"DE14", "DE15", "DE16", "DE23"}
matrix_question_ids = ["DE14", "DE15", "DE16", "DE23", "PL1", "PL2"]#, "PL4", "PL6", "PL7", "PL9", "CS1", "CS3", "CC1", "CC2", "GB4", "GB5", ] #DE23?
#gender_lookup = {}


for qid in matrix_question_ids:
    anchor_type = "parent_gender" if qid in gender_questions else "none"
    q = MatrixQuestion(qid, df_matrix, meta_matrix, anchor_type=anchor_type)
    fig = q.distribution(display=False)
    if fig is not None:
        fig.show()


In [None]:
matrix_question_ids_tmp = ["PL2"]  # Compare DE23 with PL2.  Basically, the groups should be the regions
gender_lookup_tmp = {}


for qid in matrix_question_ids_tmp:
    q = MatrixQuestion(qid, df_matrix, meta_matrix, gender_lookup=gender_lookup_tmp, anchor_type="none")
    fig = q.distribution(display=False)
    if fig is not None:
        fig.show()

# DE23

In [43]:
#TODO: THIS WORKS.  Correct Serbia and Montenegro

matrix_question_ids_tmp = ["DE23"]  # Compare DE23 with PL2.  Basically, the groups should be the regions
gender_lookup_tmp = {}


for qid in matrix_question_ids_tmp:
    q = MatrixQuestion(qid, df_matrix, meta_matrix, gender_lookup=gender_lookup_tmp, anchor_type="none")
    fig = q.distribution(display=False)
    if fig is not None:
        fig.show()

In [None]:
# Questions about Beliefs about gender (in)equality: (all numeric) IN1 (window in percent),  IN2(in percent), IN3 (in years)
# Descriptive and injunctive social norms on academic productivity: AP1 (matrix styled: scholarly works in numbers), AP2 (singlechoice), AP3 (singlechoice)
# Network Satisfaction GB1a (singlechoice), GB1b (singlechoice), GB1c (singlechoice), GB2a (singlechoice), GB2b (singlechoice), GB3 (matrix) 
# Leaving Academia: LA1, 2, 3 (singlechoice), LA4a, b (Text Input window)
# Final Questions C1 singlechoice, C2 (text input window)

# Refactor checks/ tests

Def as_frame to get quickly variables to merge for hypothesis testing

In [7]:
df_numeric, meta_numeric = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QNumeric.xlsx")
df_single, meta_single = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QSingle.xlsx")
df_multiple, meta_multiple = load_survey_data_and_meta(file_path = "../../data/data_classified_by_question_type/parenthood_test_QMultiple.xlsx")
df_matrix, meta_matrix = load_survey_data_and_meta(file_path="../../data/data_classified_by_question_type/parenthood_test_QMatrix.xlsx")

In [8]:
gender = SingleChoiceQuestion("DE2", df_single, meta_single).as_frame()       # gender identity
leave  = MatrixQuestion("PL2", df_matrix, meta_matrix).as_frame()          # leave length (months)

merged = gender.merge(leave, on="ResponseId", how="inner", suffixes=("_gender", "_leave"))
print(merged.head())


Empty DataFrame
Columns: [ResponseId, value_gender, value_leave]
Index: []


Migrate Plotting logic into its own plotting.py file

In [9]:
from libs.plotting import bar, hist, grouped_bar
import pandas as pd

# Mini test for bar
fig = bar(["A", "B", "C"], [40, 35, 25], title="Dummy distribution")
fig.show()


In [10]:
# restart kernel or reload modules first
from libs.questions.single_choice import SingleChoiceQuestion
fig = SingleChoiceQuestion("DE2", df_single, meta_single).distribution()


In [11]:
NumericQuestion("DE1", df_numeric, meta_numeric).distribution()

In [11]:
MultipleChoiceQuestion("DE7", df_multiple, meta_multiple).distribution()

In [8]:
from importlib import reload
import libs.questions.matrix
reload(libs.questions.matrix)

MatrixQuestion("PL2", df_matrix, meta_matrix).distribution()


In [9]:
from importlib import reload
import libs.questions.matrix; reload(libs.questions.matrix)

MatrixQuestion("DE23", df_matrix, meta_matrix).distribution()


In [10]:
# Parent‑gender grouped DE14 chart
MatrixQuestion("DE14", df_matrix, meta_matrix, anchor_type="parent_gender").distribution()


In [12]:
# --- peek at the raw sheet -------------------------------------------------
cols = [c for c in df_matrix.columns if c.startswith(("DE23_", "PL2_"))][:20]   # 20 cols max
display(df_matrix.loc[:2, cols])            # rows 0-2 only


Unnamed: 0,DE23_1_1,DE23_2_1,DE23_3_1,DE23_4_1,DE23_5_1,DE23_6_1,DE23_7_1,DE23_8_1,DE23_9_1,DE23_10_1,DE23_1_2,DE23_2_2,DE23_3_2,DE23_4_2,DE23_5_2,DE23_6_2,DE23_7_2,DE23_8_2,DE23_9_2,DE23_10_2
0,8.0,4.0,,,,,,,,,90.0,90.0,,,,,,,,
1,13.0,9.0,,,,,,,,,188.0,188.0,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,


In [13]:
# --- peek at the raw sheet -------------------------------------------------
cols = [c for c in df_matrix.columns if c.startswith(("PL2_"))][:20]   # 20 cols max
display(df_matrix.loc[:2, cols])            # rows 0-2 only


Unnamed: 0,PL2_1_1,PL2_1_2,PL2_1_3,PL2_1_4,PL2_2_1,PL2_2_2,PL2_2_3,PL2_2_4,PL2_3_1,PL2_3_2,PL2_3_3,PL2_3_4,PL2_4_1,PL2_4_2,PL2_4_3,PL2_4_4,PL2_5_1,PL2_5_2,PL2_5_3,PL2_5_4
0,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,


In [14]:
import pandas as pd
#  quick check of the very first row under each prefix
def first_non_nan(series):
    for v in series:
        if pd.notna(v): return v
    return None

for qid in ["DE14"]:
    first_row = df_matrix.iloc[0][df_matrix.columns.str.startswith(qid)]
    print(qid, first_non_nan(first_row))


DE14 1


In [7]:
df_matrix, meta_matrix = load_survey_data_and_meta("../../data/data_classified_by_question_type/parenthood_test_QMatrix.xlsx")


In [24]:
print(df_matrix.loc[:2, "DE23_1_1"])

0      8
1     13
2    NaN
Name: DE23_1_1, dtype: object


In [25]:
print(meta_matrix.loc[:2, "DE23_1_1"])

AttributeError: 'dict' object has no attribute 'loc'

In [26]:
meta_matrix

{'ResponseId': 'Response ID',
 'DE13c': 'We acknowledge your privacy considerations. Would you be willing to share\nat least the following metrics regarding your academic advancement? This\nallows us to gain insights into your academic standing and accomplishments\nwithout directly connecting them to your personal information. You may use\nthe statistics shown in any of your academic profiles, e.g., Google Scholar, Research Gate, Web of Science, Scopus, ORCID, or OpenAlex.\n\nThank you. - Selected Choice',
 'DE13c_1_TEXT': 'We acknowledge your privacy considerations. Would you be willing to share\nat least the following metrics regarding your academic advancement? This\nallows us to gain insights into your academic standing and accomplishments\nwithout directly connecting them to your personal information. You may use\nthe statistics shown in any of your academic profiles, e.g., Google Scholar, Research Gate, Web of Science, Scopus, ORCID, or OpenAlex.\n\nThank you. - Total number of p

In [27]:
df_matrix

Unnamed: 0,ResponseId,DE13c,DE13c_1_TEXT,DE13c_2_TEXT,DE13c_3_TEXT,DE13c_4_TEXT,DE13c_5_TEXT,DE13c_6_TEXT,DE13c_7_TEXT,DE14_1,...,LA2,LA3,LA4a,LA4b,C1,C1_3_TEXT,C1_4_TEXT,C1_5_TEXT,C2,gender
0,R_2jpMi7QsRScengv,,,,,,,,,1,...,,,,,,,,,,Woman
1,R_4ScOupmxBzBk7dL,,,,,,,,,2,...,,,,,1,,,,,Man
2,R_77h3lBm8zYTS5a1,,,,,,,,,1,...,3,3,"The path demands significant sacrifices, espec...",,1,,,,The questions about # of papers expected or pr...,Woman
3,R_2V4kjf4gOQNH6EL,1234,15,224,9,8,,,,1,...,4,4,,,1,,,,,Woman
4,R_2tWf358NIjM0gSd,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10066,R_2qkKCpL949Bd0co,,,,,,,,,,...,,,,,,,,,,
10067,R_2IQbjf8O2JcA6bD,1234567,19,121,6,4,20,102.5,,1,...,,,,,,,,,,
10068,R_81i2sSiDUow0OOT,,,,,,,,,,...,,,,,,,,,,
10069,R_21ToiBGSNZUy2yl,,,,,,,,,,...,,,,,,,,,,


In [11]:
MatrixQuestion("PL4", df_matrix, meta_matrix).distribution() #"PL7", "PL9", "CS1", "CS3", "CC1", "CC2", "GB4", "GB5", ]

In [12]:
MatrixQuestion("PL6", df_matrix, meta_matrix).distribution()

In [15]:
MatrixQuestion("PL7", df_matrix, meta_matrix).distribution()

In [16]:
MatrixQuestion("PL9", df_matrix, meta_matrix).distribution()

KeyError: 'Group'