# Imports and Setup

In [23]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Alternative Plotly renderer, kept for reference:
#pio.renderers.default = "iframe_connected"
# Use the "vscode" renderer as the default for every fig.show() in this notebook.
pio.renderers.default = "vscode"


In [24]:
from pathlib import Path
import sys

# Make the project importable from this notebook. The notebook runs from
# notebooks/testing/, so parents[1] of the working directory is the repository
# root (parenthood_europe/); the `scripts.*` and `libs.*` imports below rely on it.
project_root = Path().resolve().parents[1]

# Register the root first and libs/ last, skipping entries that are already
# present so that re-running this cell does not keep growing sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
libs_dir = str(project_root / "libs")
if libs_dir not in sys.path:
    sys.path.append(libs_dir)


In [25]:
# Every ancestor directory of the current working directory, nearest first.
[*Path().resolve().parents]


[PosixPath('/Users/Paula_1/CodingProjects/parenthood/parenthood_europe/notebooks'),
 PosixPath('/Users/Paula_1/CodingProjects/parenthood/parenthood_europe'),
 PosixPath('/Users/Paula_1/CodingProjects/parenthood'),
 PosixPath('/Users/Paula_1/CodingProjects'),
 PosixPath('/Users/Paula_1'),
 PosixPath('/Users'),
 PosixPath('/')]

In [26]:
# Echo the libs/ directory that was appended to sys.path.
str(project_root) + "/libs"


'/Users/Paula_1/CodingProjects/parenthood/parenthood_europe/libs'

In [27]:
from scripts.parse_survey_data import load_survey_data, load_survey_data_and_meta
from libs.questions.numeric import NumericQuestion
from libs.questions.single_choice import SingleChoiceQuestion
from libs.questions.multiple_choice import MultipleChoiceQuestion
from libs.questions.matrix import MatrixQuestion

In [28]:
from pathlib import Path
# Show where the notebook is running from; the relative paths used in this
# notebook are resolved against this directory.
cwd = Path().resolve()
print("Current working directory:", cwd)


Current working directory: /Users/Paula_1/CodingProjects/parenthood/parenthood_europe/notebooks/testing


# Data

In [21]:
# Load the matrix-question test workbook: `df` holds the answers, `meta` the
# per-column question text (path is relative to the notebook's working directory).
survey_file = "../../data/data_classified_by_question_type/parenthood_test_QMatrix.xlsx"
df, meta = load_survey_data_and_meta(file_path=survey_file)

### want to check PL2

In [9]:
# Preview the first five answer rows, then the metadata mapping.
display(df.head(), meta)

Unnamed: 0,ResponseId,DE13c,DE13c_1_TEXT,DE13c_2_TEXT,DE13c_3_TEXT,DE13c_4_TEXT,DE13c_5_TEXT,DE13c_6_TEXT,DE13c_7_TEXT,DE14_1,...,LA2,LA3,LA4a,LA4b,C1,C1_3_TEXT,C1_4_TEXT,C1_5_TEXT,C2,gender
0,R_2jpMi7QsRScengv,,,,,,,,,1.0,...,,,,,,,,,,Woman
1,R_4ScOupmxBzBk7dL,,,,,,,,,2.0,...,,,,,1.0,,,,,Man
2,R_77h3lBm8zYTS5a1,,,,,,,,,1.0,...,3.0,3.0,"The path demands significant sacrifices, espec...",,1.0,,,,The questions about # of papers expected or pr...,Woman
3,R_2V4kjf4gOQNH6EL,1234.0,15.0,224.0,9.0,8.0,,,,1.0,...,4.0,4.0,,,1.0,,,,,Woman
4,R_2tWf358NIjM0gSd,,,,,,,,,,...,,,,,,,,,,


{'ResponseId': 'Response ID',
 'DE13c': 'We acknowledge your privacy considerations. Would you be willing to share\nat least the following metrics regarding your academic advancement? This\nallows us to gain insights into your academic standing and accomplishments\nwithout directly connecting them to your personal information. You may use\nthe statistics shown in any of your academic profiles, e.g., Google Scholar, Research Gate, Web of Science, Scopus, ORCID, or OpenAlex.\n\nThank you. - Selected Choice',
 'DE13c_1_TEXT': 'We acknowledge your privacy considerations. Would you be willing to share\nat least the following metrics regarding your academic advancement? This\nallows us to gain insights into your academic standing and accomplishments\nwithout directly connecting them to your personal information. You may use\nthe statistics shown in any of your academic profiles, e.g., Google Scholar, Research Gate, Web of Science, Scopus, ORCID, or OpenAlex.\n\nThank you. - Total number of p

Let us remove the useless first two rows in df_raw

In [10]:
# PL2 columns to inspect for the value 99.
columns_to_check = [
    "PL2_1_4",
    "PL2_2_4",
    "PL2_3_4",
    "PL2_4_4",
    "PL2_5_4",
]

# Cell-wise membership test, then reduce over both axes to a single flag.
matches_99 = df[columns_to_check].isin([99])
contains_99 = matches_99.any(axis=None)
print("Does any of the specified columns contain the value 99?", contains_99)

Does any of the specified columns contain the value 99? True


In [11]:
# Keep only the rows where at least one of the checked columns equals 99,
# then narrow the view to those columns for display.
row_has_99 = df[columns_to_check].isin([99]).any(axis=1)
rows_with_99 = df.loc[row_has_99]
rows_with_99_filtered = rows_with_99.loc[:, columns_to_check]
display(rows_with_99_filtered)

Unnamed: 0,PL2_1_4,PL2_2_4,PL2_3_4,PL2_4_4,PL2_5_4
2530,99,99,99,99,99


The value 99 doesn't make sense in these columns, so we replace it with 0.

In [12]:
# 99 is not a meaningful answer in these columns, so it is overwritten with 0.
# NOTE(review): after this step the placeholder can no longer be told apart from
# a genuine 0 answer — confirm that 0 (rather than a missing value) is intended.
#
# pandas >= 2.2 warns that `replace` will stop silently downcasting its result.
# Opt in to the future behaviour inside the context manager (no warning) and
# call infer_objects() explicitly so the resulting dtypes stay what they were.
with pd.option_context("future.no_silent_downcasting", True):
    df[columns_to_check] = (
        df[columns_to_check].replace(99, 0).infer_objects(copy=False)
    )


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [13]:
# Matrix questions.
# A parent's gender/identity is only known through DE14, which acts as the
# anchor. We assume respondents keep the same parent order in later questions,
# i.e. their answers for parent 1 and parent 2 line up with the genders given
# in DE14.

# Shared across all questions in this cell; handed to every MatrixQuestion.
gender_lookup = {}

# Not yet included: "PL2", "PL4", "PL6", "PL7", "PL9", "CS1", "CS3", "CC1",
# "CC2", "GB4", "GB5". Open question from the original note: DE23?
multiple_choice_question_ids = ["DE14", "DE15", "DE16", "DE23", "PL1"]

for qid in multiple_choice_question_ids:
    q = MatrixQuestion(qid, df, meta, gender_lookup=gender_lookup)
    fig = q.distribution(display=False)
    # distribution() may hand back None; only real figures are shown.
    if fig is None:
        continue
    fig.show()


In [14]:
# Questions plotted with the "parent_gender" anchor; all others use "none".
gender_questions = {"DE14", "DE15", "DE16", "DE23"}

# Not yet included: "PL4", "PL6", "PL7", "PL9", "CS1", "CS3", "CC1", "CC2",
# "GB4", "GB5". Open question from the original note: DE23?
matrix_question_ids = ["DE14", "DE15", "DE16", "DE23", "PL1", "PL2"]

# Fresh lookup for this run; handed to every MatrixQuestion below.
gender_lookup = {}

for qid in matrix_question_ids:
    if qid in gender_questions:
        anchor_type = "parent_gender"
    else:
        anchor_type = "none"
    q = MatrixQuestion(qid, df, meta, gender_lookup=gender_lookup, anchor_type=anchor_type)
    fig = q.distribution(display=False)
    # distribution() may hand back None; only real figures are shown.
    if fig is None:
        continue
    fig.show()


Group: Faculty (tenure-track), Count sum: 3135
Group: Faculty (tenured), Count sum: 3329
Group: Faculty (untenured), Count sum: 2998
Group: PhD students, Count sum: 2745
Group: Postdocs, Count sum: 2931


Group: Faculty (tenure-track), Count sum: 909
Group: Faculty (untenured), Count sum: 187
Group: PhD students, Count sum: 3077
Group: Postdocs, Count sum: 5434








In [15]:
# PL2 on its own, to compare against DE23. Basically, the groups should be
# the regions.
matrix_question_ids_tmp = ["PL2"]
gender_lookup_tmp = {}

for qid in matrix_question_ids_tmp:
    q = MatrixQuestion(qid, df, meta, gender_lookup=gender_lookup_tmp, anchor_type="none")
    fig = q.distribution(display=False)
    # distribution() may hand back None; only real figures are shown.
    if fig is None:
        continue
    fig.show()

Group: Faculty (tenure-track), Count sum: 909
Group: Faculty (untenured), Count sum: 187
Group: PhD students, Count sum: 3077
Group: Postdocs, Count sum: 5434








# DE23

In [16]:
# TODO: this works. Correct Serbia and Montenegro.

# DE23 on its own, to compare against PL2. Basically, the groups should be
# the regions.
matrix_question_ids_tmp = ["DE23"]
gender_lookup_tmp = {}

for qid in matrix_question_ids_tmp:
    q = MatrixQuestion(qid, df, meta, gender_lookup=gender_lookup_tmp, anchor_type="none")
    fig = q.distribution(display=False)
    # distribution() may hand back None; only real figures are shown.
    if fig is None:
        continue
    fig.show()

In [17]:
# Inspect one cell of DE14_1 (position 3, i.e. the 4th row with 0-based
# indexing) together with the Python type it is stored as.
value = df["DE14_1"].iat[3]
value_type = type(value)
print("Value at row 3, col 'DE14_1':", value)
print("Type of that value:", value_type)

Value at row 3, col 'DE14_1': 1
Type of that value: <class 'int'>


In [30]:
import pandas as pd 

class Respondent:
    """A single survey respondent: one row of the answers table plus the metadata.

    Parameters
    ----------
    respondent_id
        Index label of the respondent's row. It is looked up with ``df.loc``,
        so a label that is not in ``df.index`` raises ``KeyError``.
    df
        Answers table whose columns are named ``<question_id>_<parent_number>``.
    meta
        Survey metadata, stored unchanged on ``self.metadata``.
    """

    def __init__(self, respondent_id, df, meta):
        self.id = respondent_id
        # Label-based (not positional) row lookup.
        self.row = df.loc[respondent_id]
        self.metadata = meta

    def get_parent_gender(self, parent_number):
        """Return the gender label recorded in ``DE14_<parent_number>``.

        The cell value is converted to ``int`` and translated through
        ``self.metadata["DE14"]["value_map"]``.

        Returns
        -------
        The mapped label, or ``None`` when the column does not exist, the
        answer is missing, the answer cannot be converted to ``int``, or the
        code has no entry in the value map. When the metadata has no
        ``["DE14"]["value_map"]`` entry at all, a warning is emitted before
        ``None`` is returned, so that a metadata problem is not mistaken for a
        missing answer.
        """
        import warnings  # local import: keeps the class usable on its own

        col = f"DE14_{parent_number}"
        try:
            code = self.row[col]
            if pd.isna(code):
                return None
            code = int(code)
        except (KeyError, ValueError, TypeError):
            # Unknown column, non-scalar cell, or an answer that is not a number.
            return None

        try:
            value_map = self.metadata["DE14"]["value_map"]
        except (KeyError, ValueError, TypeError):
            # The answer exists but cannot be decoded: report it instead of
            # failing silently, while still returning None as before.
            warnings.warn(
                "metadata has no ['DE14']['value_map'] entry; cannot decode "
                f"{col} code {code!r} into a gender label",
                stacklevel=2,
            )
            return None
        return value_map.get(code)

    def get_answer(self, question_id, parent_number):
        """Return the raw cell in ``<question_id>_<parent_number>``, or ``None`` if that column does not exist."""
        col = f"{question_id}_{parent_number}"
        try:
            return self.row[col]
        except KeyError:
            return None


In [31]:
# Build the respondent whose row has index label 42.
respondent = Respondent(42, df, meta)

# Gender label of parent 1 (None when it cannot be determined).
parent_1_gender = respondent.get_parent_gender("1")
print(parent_1_gender)

# Raw answer stored in column DE15_1.
parent_1_de15 = respondent.get_answer("DE15", "1")
print(parent_1_de15)


None
7


In [None]:
# Questions about Beliefs about gender (in)equality: (all numeric) IN1 (window in percent),  IN2(in percent), IN3 (in years)
# Descriptive and injunctive social norms on academic productivity: AP1 (matrix styled: scholarly works in numbers), AP2 (singlechoice), AP3 (singlechoice)
# Network Satisfaction GB1a (singlechoice), GB1b (singlechoice), GB1c (singlechoice), GB2a (singlechoice), GB2b (singlechoice), GB3 (matrix) 
# Leaving Academia: LA1, 2, 3 (singlechoice), LA4a, b (Text Input window)
# Final Questions C1 singlechoice, C2 (text input window)