In [22]:
import pandas as pd
import re 
import numpy as np
import os
import copy
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
import stata_setup
stata_setup.config('/Applications/Stata 17/', 'se')
from pystata import stata

In [2]:
data_dir = r"/Desktop/Paper - parental and career"

In [3]:
#parental_df = pd.read_csv(os.path.join(data_dir,"parental_df.csv"), sep='\t')
parental_df = pd.read_csv(os.path.join(data_dir, "parental_df.csv"), sep=None, engine="python")
parental_df

Unnamed: 0,A,SurveyTableID,StartDate,EndDate,Status,IPAddress,Progress,Duration,Finished,RecordedDate,...,category_ega_attitude,RPauTen_inter,RSche_inter,RLeave_inter,RCdcare_inter,G_Leave_inter,G_PauTen_inter,G_Sche_inter,G_Cdcare_inter,ER_inter
0,1635,1962,20sep2019 10:50:00,20sep2019 11:04:59,IP Address,129.97.178.248,100,871,1,20sep2019 11:04:59,...,3,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.00
1,115,131,06sep2019 18:08:00,06sep2019 18:27:00,IP Address,206.166.196.57,100,1131,1,06sep2019 18:27:00,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.25
2,5863,7471,08oct2019 07:19:59,08oct2019 07:31:00,IP Address,147.9.2.224,100,655,1,08oct2019 07:31:00,...,3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.00
3,3121,3950,27sep2019 21:43:59,27sep2019 22:02:00,IP Address,99.32.23.136,100,1061,1,27sep2019 22:02:00,...,3,,,,,,,,,
4,1708,2048,20sep2019 11:10:00,20sep2019 11:26:59,IP Address,74.217.93.204,100,1027,1,20sep2019 11:26:59,...,2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7731,734,925,18sep2019 21:08:00,18sep2019 21:22:00,IP Address,66.241.87.10,100,825,1,18sep2019 21:22:00,...,2,,,,,,,,,
7732,4335,5518,04oct2019 17:12:00,04oct2019 17:21:00,IP Address,128.95.25.248,100,542,1,04oct2019 17:21:00,...,2,,,,,,,,,
7733,3529,4433,29sep2019 01:52:00,29sep2019 01:57:00,IP Address,217.85.169.133,100,296,1,29sep2019 01:57:00,...,2,,,,,,,,,
7734,6908,8887,12oct2019 15:27:59,12oct2019 15:33:59,IP Address,50.199.192.37,100,349,1,12oct2019 15:33:59,...,2,,,,,,,,,


In [6]:
# Filter the DataFrame where married_child == 3
filtered_df = parental_df[(parental_df['married_child'] == 1) | 
                          (parental_df['married_child'] == 3)]

# Columns for which you want to calculate mean and sd
columns_to_analyze = ['rear_load', 'res_ach', 'car_dev', 
                      'contrib', 'rel_npap', 'rel_cite', 'uni_cola', 'childsuffer_ega', 'mo_relation_ega', 'moth_to_work', 'wocutwork_ega', 'ega_attitude']

# Group by 'gender_W'
grouped = filtered_df.groupby('gender_W')

df_gender_count= grouped[columns_to_analyze].agg(['mean', 'std']).stack(level=0).reset_index()
df_gender_count.columns = ['gender', 'var', 'mean', 'std']
# replace gender values
df_gender_count['gender'] = df_gender_count['gender'].map({0:'Men', 1:'Women'})
# replace var names with the name_map
name_map = {'car_dev': 'Career satisfaction','contrib': 'Community recognition', 
            'rear_load': 'Childcare responsibilities', 'rel_cite':'ARC', 'rel_npap':'ARP', 'res_ach':'Research satisfaction', 'uni_cola':'ARCo',
            'childsuffer_ega': 'Child suffering', 'mo_relation_ega': 'Mother relationship', 'moth_to_work': 'Mother to work', 'wocutwork_ega': 'Women to cut work', 'ega_attitude': 'Egalitarian beliefs'}

df_gender_count['var']= df_gender_count['var'].map(name_map)

# Define the desired order
order = ['Childcare responsibilities', 'Egalitarian beliefs', 'ARCo', 'ARC', 'ARP', 
    'Community recognition','Career satisfaction', 'Research satisfaction' ,
    'Child suffering', 'Mother relationship', 'Mother to work', 'Women to cut work'
]

# Convert 'var' to a categorical type with the defined order
df_gender_count['var'] = pd.Categorical(df_gender_count['var'], categories=order, ordered=True)

# Sort the DataFrame by 'var'
df_gender_count = df_gender_count.sort_values('var')

df_gender_count





Unnamed: 0,gender,var,mean,std
18,Women,Childcare responsibilities,2.129263,1.154497
6,Men,Childcare responsibilities,1.974328,1.177702
3,Men,Egalitarian beliefs,0.466858,0.965398
15,Women,Egalitarian beliefs,1.298512,0.877616
10,Men,ARCo,1.083716,1.389866
22,Women,ARCo,0.981357,1.1545
19,Women,ARC,2.141185,3.139571
7,Men,ARC,2.277524,3.81489
8,Men,ARP,2.271846,3.030543
20,Women,ARP,1.813911,1.801113


In [7]:
sample_size = grouped.size().reset_index(name='n')

# Map numeric gender values to string representations in sample_size BEFORE renaming columns
gender_map = {'0': 'Men', '1': 'Women'}

sample_size['gender_W'] = sample_size['gender_W'].astype(str)  # Convert to string
sample_size['gender_W'] = sample_size['gender_W'].map(gender_map)  # Apply mapping

# Rename the 'gender_W' column to 'gender'
sample_size.rename(columns={'gender_W': 'gender'}, inplace=True)

# Perform the merge operation
# Ensure merging on the correct column names and that the data types match
df_gender_count = df_gender_count.merge(sample_size, on='gender', how='left')


Z = 1.96  # Z-score for 95% confidence
df_gender_count['margin_of_error'] = Z * (df_gender_count['std'] / np.sqrt(df_gender_count['n'])) 

# Calculate confidence intervals
df_gender_count['ci_lower'] = df_gender_count['mean'] - df_gender_count['margin_of_error']
df_gender_count['ci_upper'] = df_gender_count['mean'] + df_gender_count['margin_of_error']
df_gender_count

Unnamed: 0,gender,var,mean,std,n,margin_of_error,ci_lower,ci_upper
0,Women,Childcare responsibilities,2.129263,1.154497,3160,0.040254,2.089009,2.169516
1,Men,Childcare responsibilities,1.974328,1.177702,2540,0.045801,1.928527,2.020129
2,Men,Egalitarian beliefs,0.466858,0.965398,2540,0.037544,0.429314,0.504403
3,Women,Egalitarian beliefs,1.298512,0.877616,3160,0.0306,1.267912,1.329112
4,Men,ARCo,1.083716,1.389866,2540,0.054052,1.029664,1.137768
5,Women,ARCo,0.981357,1.1545,3160,0.040254,0.941103,1.021611
6,Women,ARC,2.141185,3.139571,3160,0.109467,2.031718,2.250652
7,Men,ARC,2.277524,3.81489,2540,0.148362,2.129163,2.425886
8,Men,ARP,2.271846,3.030543,2540,0.117858,2.153988,2.389704
9,Women,ARP,1.813911,1.801113,3160,0.062799,1.751112,1.87671


In [12]:
vars = ['Child suffering', 'Mother relationship', 'Mother to work', 'Women to cut work', 'Egalitarian beliefs', 'Childcare responsibilities']
gender_colors = {"Women":'#ffcc00', "Men":'#1f77b4'}

# initialize the figure
fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.08,
    horizontal_spacing=0.1
)

for var in vars:
    for gender in ['Women', 'Men']:
        
        # Create a DataFrame for the current variable
        df = df_gender_count.query("var == @var and gender == @gender")
        y_positions = [0.5] * len(df)  # Place bars at y=0.5 for centering
        fig.add_trace(
            go.Bar(
                x=df['mean'],
                y=y_positions,
                orientation='h',
                marker=dict(color=gender_colors[gender]),
            ),
            row=vars.index(var)+1, col=1
        )
        
        if vars.index(var)+1 != len(vars):
            fig.update_xaxes(title=None, showticklabels=False, ticklen=0, showline=False, row=vars.index(var)+1, col=1)
            
            
    # add subplot title
    fig.add_annotation(
        xref="x domain", yref="paper",
        x=-0.2, y=1.5,
        xanchor="left", yanchor="bottom",
        showarrow=False,
        text="<b>"+var,
        font=dict(size=14, family="Arial"),
        row=vars.index(var)+1, col=1
    )

# if odd number of subplots, add a gray square as background
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1
        
fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1, # gap between bars of the same location coordinate.
    width=400,
    height=450
)

fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=0.5)
fig.update_yaxes(
    range=[-0.5, 1.5],   # Adjust range to fit centered bars
    showticklabels=False,  # Hide y-axis tick labels
    showgrid=False,        # Hide gridlines
    ticks="",              # Remove tick marks
    zeroline=False         # Remove the zero line
)
fig.update_xaxes(title=dict(text="Mean", standoff=0), row=len(vars), col=1)
fig.update_yaxes(title=dict(text=None, standoff=0))

fig.show()
                

In [15]:
vars = ['Research satisfaction', 'Career satisfaction', 'Community recognition', 'ARP', 'ARC', 'ARCo']
gender_colors = {"Women":'#ffcc00', "Men":'#1f77b4'}

# initialize the figure
fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.08,
    horizontal_spacing=0.1
)

for var in vars:
    for gender in ['Women', 'Men']:
        
        # Create a DataFrame for the current variable
        df = df_gender_count.query("var == @var and gender == @gender")
        y_positions = [0.5] * len(df)  # Place bars at y=0.5 for centering
        fig.add_trace(
            go.Bar(
                x=df['mean'],
                y=y_positions,
                orientation='h',
                marker=dict(color=gender_colors[gender]),
            ),
            row=vars.index(var)+1, col=1
        )
        
        if vars.index(var)+1 != len(vars):
            fig.update_xaxes(title=None, showticklabels=False, ticklen=0, showline=False, row=vars.index(var)+1, col=1)
            
            
    # add subplot title
    fig.add_annotation(
        xref="x domain", yref="paper",
        x=-0.2, y=1.5,
        xanchor="left", yanchor="bottom",
        showarrow=False,
        text="<b>"+var,
        font=dict(size=14, family="Arial"),
        row=vars.index(var)+1, col=1
    )

# if odd number of subplots, add a gray square as background
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1
        
fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1, # gap between bars of the same location coordinate.
    width=400,
    height=450
)

fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=0.5)
fig.update_yaxes(
    range=[-0.5, 1.5],   # Adjust range to fit centered bars
    showticklabels=False,  # Hide y-axis tick labels
    showgrid=False,        # Hide gridlines
    ticks="",              # Remove tick marks
    zeroline=False         # Remove the zero line
)
fig.update_xaxes(title=dict(text="Mean", standoff=0), row=len(vars), col=1)
fig.update_yaxes(title=dict(text=None, standoff=0))

fig.show()
                
                

In [17]:
df_coeffi = pd.read_excel(os.path.join(data_dir,"parental descriptive_sig.xlsx"))
df_coeffi

Unnamed: 0,var,group,coeffi,low_ci,high_ci,P_val
0,childsuffer_ega,nonparent,0.786855,0.622826,0.950885,0.0
1,childsuffer_ega,parent,1.075313,0.979334,1.171292,0.0
2,mo_relation_ega,nonparent,0.577945,0.457807,0.698084,0.0
3,mo_relation_ega,parent,0.735649,0.656656,0.814641,0.0
4,moth_to_work,nonparent,1.044357,0.856349,1.232365,0.0
5,moth_to_work,parent,0.98049,0.876128,1.084852,0.0
6,wocutwork_ega,nonparent,0.177826,0.007805,0.347847,0.04
7,wocutwork_ega,parent,0.375709,0.28528,0.466139,0.0
8,ega_attitude,nonparent,0.649906,0.560733,0.739078,0.0
9,ega_attitude,parent,0.791021,0.736008,0.846033,0.0


In [18]:
# Variables and labels for the subplots
vars = ['childsuffer_ega', 'mo_relation_ega', 'moth_to_work', 'wocutwork_ega', 'rear_load', 'ega_attitude']
labels = ['Child suffering', 'Mother relationship', 'Mother to work', 'Women to cut work', 'Childcare responsibilities','Egalitarian beliefs']
group_colors = {"nonparent": '#1f77b4', "parent": '#ffcc00'}

# Initialize the figure with subplots
fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, vertical_spacing=0.08
)
for var in vars:
    group = 'parent' 
    df = df_coeffi.query("var == @var and group == @group")
    # Calculate center positions for the traces
    y_positions = [0.5] * len(df)  # Place traces at y=0.5 for visual centering
    fig.add_trace(
        go.Scatter(
            x=df['coeffi'],  # Centered at the coefficient value
            y=y_positions,   # Centered at a fixed y-position
            mode='markers',
            # maker color be black
            marker=dict(color = 'black', size=6),
            error_x=dict(
                type='data',
                symmetric=False,
                array=df['high_ci'] - df['coeffi'],
                arrayminus=df['coeffi'] - df['low_ci'],
                thickness=1.5,
                width=3
            ),
            name=f"{group}"
        ),
        row=vars.index(var) + 1, col=1
    )

    # Remove x-axis ticks except for the last subplot
    if vars.index(var)+1 != len(vars):
        fig.update_xaxes(title=None, showticklabels=False, ticklen=0, showline=False, row=vars.index(var)+1, col=1) #showlines=False means the axis line is not shown

# Add gray background for odd subplots
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1

fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    width=400,
    height=450
)

# Customize x-axis and y-axis
fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=0.25) #showgrid is True means the gridlines are shown
fig.update_yaxes(range=[-0.5, 1.5])
fig.update_xaxes(title=dict(text="Coefficient (women)", standoff=0), row=len(vars), col=1)
fig.update_yaxes(title=dict(text=None, standoff=0), tickvals=[0, 1], ticktext=['', ''], ticklen=0)

fig.show()



In [19]:
# Variables and labels for the subplots
vars = ['res_ach', 'car_dev', 'contrib', 'rel_npap', 'rel_cite', 'uni_cola']
labels = ['Research achievement', 'Career development', 'Community contribution', 'ARP', 'ARC', 'ARCo']
group_colors = {"nonparent": '#1f77b4', "parent": '#ffcc00'}

fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, vertical_spacing=0.08
)

for var in vars:
    # Filter for "parent" group only
    group = "parent"
    df = df_coeffi.query("var == @var and group == @group")
    
    # Calculate center positions for the traces
    y_positions = [0.5] * len(df)  # Place traces at y=0.5 for visual centering
    
    fig.add_trace(
        go.Scatter(
            x=df['coeffi'],  # Centered at the coefficient value
            y=y_positions,   # Centered at a fixed y-position
            mode='markers',
            marker=dict(color='black', size=6),
            error_x=dict(
                type='data',
                symmetric=False,
                array=df['high_ci'] - df['coeffi'],
                arrayminus=df['coeffi'] - df['low_ci'],
                thickness=1.5,
                width=3
            ),
            name=f"{group}"
        ),
        row=vars.index(var) + 1, col=1
    )

    # Remove x-axis ticks except for the last subplot
    if vars.index(var) + 1 != len(vars):
        fig.update_xaxes(title=None, showticklabels=False, ticklen=0, showline=False, row=vars.index(var)+1, col=1)

# Add gray background for odd subplots
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1

fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    width=400,
    height=450
)

# Customize x-axis and y-axis
fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=0.25)
fig.update_yaxes(range=[-0.5, 1.5])
fig.update_xaxes(title=dict(text="Coefficient (women)", standoff=0), row=len(vars), col=1)
fig.update_yaxes(title=dict(text=None, standoff=0), tickvals=[0, 1], ticktext=['', ''], ticklen=0)

fig.show()


In [20]:
odds = pd.read_excel(os.path.join(data_dir,"Odds.xlsx"))
odds

Unnamed: 0,var,Odds,Lower_interval,Upper_interval
0,uni_cola,0.929657,0.831384,1.039547
1,rel_cite,0.935327,0.834543,1.048282
2,rel_npap,0.765614,0.676581,0.866362
3,contrib,0.745081,0.651421,0.852208
4,car_dev,0.850745,0.750026,0.96499
5,res_ach,0.738622,0.652067,0.836666
6,rear_load,1.486256,1.312789,1.682645
7,childsuffer_ega,3.17967,2.805765,3.603404
8,mo_relation_ega,3.109426,2.722332,3.551561
9,moth_to_work,2.773764,2.449056,3.141524


In [21]:
# Variables and labels for the subplots
vars = ['res_ach', 'car_dev', 'contrib', 'rel_npap', 'rel_cite', 'uni_cola']
labels = ['Research achievement', 'Career development', 'Community contribution', 'ARP', 'ARC', 'ARCo']

# Initialize the figure
fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, vertical_spacing=0.08
)

for var in vars:
    # Filter for the current variable
    df = odds.query("var == @var")
    
    # Calculate center positions for the traces
    y_positions = [0.5] * len(df)  # Place traces at y=0.5 for visual centering
    
    fig.add_trace(
        go.Scatter(
            x=df['Odds'],  # Use the Odds column for x-axis values
            y=y_positions,  # Centered at a fixed y-position
            mode='markers',
            marker=dict(color='black', size=6),
            error_x=dict(
                type='data',
                symmetric=False,
                array=df['Upper_interval'] - df['Odds'],  # Error above the marker
                arrayminus=df['Odds'] - df['Lower_interval'],  # Error below the marker
                thickness=1.5,
                width=3
            ),
            name=var
        ),
        row=vars.index(var) + 1, col=1
    )

    # Remove x-axis ticks except for the last subplot
    if vars.index(var) + 1 != len(vars):
        fig.update_xaxes(
            title=None, showticklabels=False, ticklen=0, showline=False,
            row=vars.index(var) + 1, col=1
        )

# Add gray background for odd subplots
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1

fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    width=400,
    height=450
)

# Customize x-axis and y-axis
fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=0.25)
fig.update_yaxes(range=[-0.5, 1.5])
fig.update_xaxes(title=dict(text="Odds (women)", standoff=0), row=len(vars), col=1)
fig.update_yaxes(title=dict(text=None, standoff=0), tickvals=[0, 1], ticktext=['', ''], ticklen=0)

fig.show()


In [22]:
# Variables and labels for the subplots
vars = ['res_ach', 'car_dev', 'contrib', 'rel_npap', 'rel_cite', 'rear_load']
labels = ['Research achievement', 'Career development', 'Community contribution', 'ARP', 'ARC', 'Childcare responsibilities' ]

# Initialize the figure
fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, vertical_spacing=0.08
)

for var in vars:
    # Filter for the current variable
    df = odds.query("var == @var")
    
    # Calculate center positions for the traces
    y_positions = [0.5] * len(df)  # Place traces at y=0.5 for visual centering
    
    fig.add_trace(
        go.Scatter(
            x=df['Odds'],  # Use the Odds column for x-axis values
            y=y_positions,  # Centered at a fixed y-position
            mode='markers',
            marker=dict(color='black', size=6),
            error_x=dict(
                type='data',
                symmetric=False,
                array=df['Upper_interval'] - df['Odds'],  # Error above the marker
                arrayminus=df['Odds'] - df['Lower_interval'],  # Error below the marker
                thickness=1.5,
                width=3
            ),
            name=var
        ),
        row=vars.index(var) + 1, col=1
    )

    # Remove x-axis ticks except for the last subplot
    if vars.index(var) + 1 != len(vars):
        fig.update_xaxes(
            title=None, showticklabels=False, ticklen=0, showline=False,
            row=vars.index(var) + 1, col=1
        )

# Add gray background for odd subplots
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1

fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    width=400,
    height=450
)

# Customize x-axis and y-axis
fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=0.25)
fig.update_yaxes(range=[-0.5, 1.5])
fig.update_xaxes(title=dict(text="Odds (women)", standoff=0), row=len(vars), col=1)
fig.update_yaxes(title=dict(text=None, standoff=0), tickvals=[0, 1], ticktext=['', ''], ticklen=0)

fig.show()


In [23]:
# Variables and labels for the subplots
vars = ['childsuffer_ega', 'mo_relation_ega', 'moth_to_work', 'wocutwork_ega', 'uni_cola', 'ega_attitude']
labels = ['Child suffering', 'Mother relationship', 'Mother to work', 'Women to cut work', 'uni_cola', 'Egalitarian beliefs']

# Initialize the figure with subplots
fig = make_subplots(
    rows=len(vars), cols=1,
    shared_xaxes=True, vertical_spacing=0.08
)

for var in vars:
    # Filter for the current variable
    df = odds.query("var == @var")
    
    # Calculate center positions for the traces
    y_positions = [0.5] * len(df)  # Place traces at y=0.5 for visual centering
    
    fig.add_trace(
        go.Scatter(
            x=df['Odds'],  # Use the Odds column for x-axis values
            y=y_positions,  # Centered at a fixed y-position
            mode='markers',
            marker=dict(color='black', size=6),  # Marker color set to black
            error_x=dict(
                type='data',
                symmetric=False,
                array=df['Upper_interval'] - df['Odds'],  # Upper error
                arrayminus=df['Odds'] - df['Lower_interval'],  # Lower error
                thickness=1.5,
                width=3
            ),
            name="Parent"
        ),
        row=vars.index(var) + 1, col=1
    )

    # Remove x-axis ticks except for the last subplot
    if vars.index(var) + 1 != len(vars):
        fig.update_xaxes(
            title=None, showticklabels=False, ticklen=0, showline=False,
            row=vars.index(var) + 1, col=1
        )

# Add gray background for odd subplots
subplot_index = 0
for var in vars:
    if subplot_index % 2 == 0:
        fig.add_shape(
            type="rect",
            xref="paper", yref=f"y{subplot_index+1}",
            x0=0, y0=-0.5, x1=1, y1=1.5,
            fillcolor='lightgrey', opacity=0.5,
            layer="below",
            line=dict(width=0),
        )
    subplot_index += 1

fig.update_layout(
    template="simple_white",
    font=dict(size=11, family="Arial"),
    margin=dict(l=50, r=100, b=50, t=50),
    width=400,
    height=450
)

# Customize x-axis and y-axis
fig.update_xaxes(
    showgrid=True, gridwidth=0.5, gridcolor='lightgrey', dtick=2.5
)
fig.update_yaxes(
    range=[-0.5, 1.5], showticklabels=False, ticklen=0, zeroline=False
)
fig.update_xaxes(
    title=dict(text="Odds (Women)", standoff=0), row=len(vars), col=1
)

fig.show()


In [43]:
df_ega_sup = pd.read_excel(os.path.join(data_dir,"ega_support.xlsx"))
df_ega_sup

Unnamed: 0,var,group,gender,Coeff,Lo_CI,Up_CI,P_val
0,egalitarian_beliefs,Total,men,-0.037,-0.082,0.009,0.118
1,egalitarian_beliefs,partner_work,men,-0.032,-0.086,0.023,0.252
2,egalitarian_beliefs,partner_Self_Stu,men,-0.108,-0.236,0.021,0.101
3,egalitarian_beliefs,partner_no_work,men,0.083,-0.047,0.213,0.207
4,egalitarian_beliefs,partner_academic,men,-0.074,-0.159,0.012,0.091
5,egalitarian_beliefs,partner_no_academic,men,-0.021,-0.078,0.036,0.47
6,egalitarian_beliefs,Total,women,-0.149,-0.196,-0.103,0.0
7,egalitarian_beliefs,partner_work,women,-0.117,-0.172,-0.062,0.0
8,egalitarian_beliefs,partner_Self_Stu,women,-0.278,-0.401,-0.155,0.0
9,egalitarian_beliefs,partner_no_work,women,-0.242,-0.541,0.058,0.112


In [35]:
genders = ['men', 'women']
colors = ['#1f77b4', '#ffcc00']
xlabels = ['Total', 'Employed', 'Self-employed/student', 'Unemployed', 'Research-related job', 'Non-research job']

for i, gender in enumerate(genders):
    fig = go.Figure()
    df = df_ega_sup.query("gender == @gender")
    fig.add_trace(
        go.Scatter(
            x=df['group'],
            y=df['Coeff'],
            mode='markers',
            marker=dict(color=colors[i]),
            name=genders[i].capitalize(),
            opacity=0.8,
            showlegend=True,  # Force the legend to be displayed
            error_y=dict(
                type='data',
                symmetric=False,
                array=df['Up_CI'] - df['Coeff'],
                arrayminus=df['Coeff'] - df['Lo_CI'],
                thickness=1.5,
                width=5
            ),
        ))
        
    fig.update_layout(
        template="simple_white",
        font=dict(size=11, family="Arial"),
        margin=dict(l=50, r=100, b=50, t=50),
        width=300,
        height=300,
        legend=dict(
            orientation="h",  # Horizontal legend
            yanchor="bottom",  # Align legend to the bottom of the plot area
            y=1.05,  # Position the legend just above the plot
            xanchor="center",  # Center the legend horizontally
            x=0.5  # Horizontal center
        )
    )

    fig.update_xaxes(title=dict(text="Egalitarian beliefs", standoff=0), tickvals=np.arange(6), ticktext=xlabels,)
    fig.update_yaxes(title=dict(text="Change in child-rearing load", standoff=0), zeroline=True, zerolinewidth=0.5, zerolinecolor='lightgrey')
    fig.show()

In [10]:
#var lists
outcomes = ["res_ach_7", "car_dev_7", "contrib_7", "rel_npap", "rel_cite", "uni_cola"]
supports = ["Cdcare", "MatLeave", "PauTen", "FlexSche"]
genders = [0,1]
partner_jobsta =[1,2,3]
partner_academi =[1,2]
conditions = ["partner_jobsta == 1", "partner_jobsta == 2", "partner_jobsta == 3", "partner_academi == 1", "partner_academi == 2"]

## The moderating effect of egalitarian gender role beliefs

In [18]:
# subsample by partners' job status and academic type
ts = []
margin_results = []
for condition in conditions:
    print(condition)
    t = parental_df.query(condition)
    if condition in ['partner_jobsta == 1', 'partner_jobsta == 2', 'partner_jobsta == 3']:
        control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_academi2 partner_academi3"
    else:
        control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_jobsta2 partner_jobsta3 partner_jobsta4"
    command = f"reg rear_load ega_attitude gender_W c.gender_W#c.ega_attitude  {control} if (married_child == 3 | married_child == 1), vce(cluster mail_school)"
    stata.run("clear")
    stata.pdataframe_to_data(t)
    stata.run(command)
    
    # collect results
    results = stata.get_return()["r(table)"].T
    results = pd.DataFrame(results).iloc[0:3, :6]
    results.columns = ["coef", "se", "t", "p", "ci_low", "ci_high"]
    results["condition"] = condition
    results['var'] = ['ega_attitude', 'gender', 'c.gender#c.ega_attitude']
    ts.append(results)
    
    command = f"margins, at(ega_attitude = (-.077886 .9285294 1.934945)) dydx(gender_W)"
    stata.run(command)
    results = stata.get_return()["r(table)"].T
    results = pd.DataFrame(results).iloc[0:3, :6]
    results.columns = ["dy/dx", "se", "t", "p", "ci_low", "ci_high"]
    results["condition"] = condition
    results['var'] = ["Mean-1sd", "Mean", "Mean+1sd"]
    margin_results.append(results)
    print(results)
ts = pd.concat(ts)
margin_results = pd.concat(margin_results)

partner_jobsta == 1

Linear regression                               Number of obs     =      4,075
                                                F(14, 535)        =       4.90
                                                Prob > F          =     0.0000
                                                R-squared         =     0.0147
                                                Root MSE          =     1.1402

                          (Std. err. adjusted for 536 clusters in mail_school)
------------------------------------------------------------------------------
             |               Robust
   rear_load | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
ega_attitude |  -.0258393   .0286535    -0.90   0.368    -.0821264    .0304478
    gender_W |   .2793326   .0614341     4.55   0.000      .158651    .4000142
             |
  c.gender_W#|
          c. |
ega_attitude |  -.0931549   

In [19]:
#put column "var" in the first place
cols = ts.columns.tolist()
cols = cols[-1:] + cols[:-1]
ts = ts[cols]

#Generate a new column
ts['group_id'] = ts.groupby(['condition']).ngroup() + 1

# Ensure group_id is sequential by sorting
ts = ts.sort_values(by=['group_id']).reset_index(drop=False)

#resort the sequence of rows by index in each group
ts = ts.sort_values(by=['group_id', 'index']).reset_index(drop=True)
ts.to_excel(os.path.join(data_dir,"Q2_reg_subsample.xlsx"), index=False)

In [20]:
margin_results
margin_results.to_excel(os.path.join(data_dir,"Q2_margin_subsample.xlsx"), index=False)

## The moderating effect of institutional parental support 

In [13]:
# subsample by partners' job status and academic type
ts = []
margin_results = []

for gender in genders:
    for condition in conditions:
        for outcome in outcomes:
            for support in supports:
                print(gender, condition, outcome, support)
                t = parental_df.query("gender_W == @gender").query(condition)
                
                if condition in ['partner_jobsta == 1', 'partner_jobsta == 2', 'partner_jobsta == 3']:
                    control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_academi2 partner_academi3"
                else:
                    control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_jobsta2 partner_jobsta3 partner_jobsta4"
                    

                command = f"reg {outcome} rear_load {support} c.rear_load#c.{support}  {control} if (married_child == 3 | married_child == 1), vce(cluster mail_school)"

                stata.run("clear")
                stata.pdataframe_to_data(t)
                stata.run(command)
                
                # collect results
                results = stata.get_return()["r(table)"].T
                results = pd.DataFrame(results).iloc[0:3, :6]
                results.columns = ["coef", "se", "t", "p", "ci_low", "ci_high"]
                results["gender"] = gender
                results["condition"] = condition
                results["outcome"] = outcome
                results["support"] = support
                results['var'] = ['rear_load', support, f'rear_load#{support}']
                ts.append(results)
                
                command = f"margins, dydx(rear_load) at({support}=(0 1)) post"
                stata.run(command)
                results = stata.get_return()["r(table)"].T
                results = pd.DataFrame(results).iloc[0:2, :6]
                results.columns = ["dy/dx", "se", "t", "p", "ci_low", "ci_high"]
                results["gender"] = gender
                results["condition"] = condition
                results["outcome"] = outcome
                results["support"] = support
                results['var'] = [0, 1]
                margin_results.append(results)
                print(results)
ts = pd.concat(ts)
margin_results = pd.concat(margin_results)

0 partner_jobsta == 1 res_ach_7 Cdcare

Linear regression                               Number of obs     =      1,612
                                                F(14, 384)        =       9.36
                                                Prob > F          =     0.0000
                                                R-squared         =     0.0753
                                                Root MSE          =      1.611

                          (Std. err. adjusted for 385 clusters in mail_school)
------------------------------------------------------------------------------
             |               Robust
   res_ach_7 | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
   rear_load |  -.0186846   .0387666    -0.48   0.630    -.0949061    .0575368
      Cdcare |   .3022453    .218014     1.39   0.166    -.1264054    .7308959
             |
 c.rear_load#|
    c.Cdcare |  -.122591

In [33]:
ts
#put column "var" in the first place
cols = ts.columns.tolist()
cols = cols[-1:] + cols[:-1]
ts = ts[cols]

#Generate a new column
ts['group_id'] = ts.groupby(['gender', 'condition', 'outcome', 'support']).ngroup() + 1

# Ensure group_id is sequential by sorting
ts = ts.sort_values(by=['group_id']).reset_index(drop=False)

#resort the sequence of rows by index in each group
ts = ts.sort_values(by=['group_id', 'index']).reset_index(drop=True)
ts.to_excel(os.path.join(data_dir,"Q3_reg_subsample.xlsx"), index=False)

In [32]:
margin_results
margin_results.to_excel(os.path.join(data_dir,"Q3_margin_subsample.xlsx"), index=False)

In [36]:
# partner_jobsta = 1 if "Employed for wages" | "Military"
# partner_jobsta = 2 if "Self-employed" | "Student) | "Out of work and looking for work"
# partner_jobsta = 3 if "Out of work but not looking for work" | "Retired"
# partner_academi = 1 if partner is in research job
# partner_academi = 2 if No
# Filter rows where `index` is 1 or 2 and `p` is less than 0.05
matching_groups = ts.loc[
    (ts['index']==2) & (ts['p'] < 0.05),
    ['gender', 'condition', 'outcome', 'support']
].drop_duplicates()

# Filter the original DataFrame to include all rows from these groups
filter_inter = ts.merge(matching_groups, on=['gender', 'condition', 'outcome', 'support'])

# Show the result
filter_inter

# to_excel
filter_inter.to_excel(os.path.join(data_dir,"Q3_filter_inter_subsample_new.xlsx"), index=False)

In [41]:
# Total sample

result_CE = []
result_Margin = []

for gender in genders:
    for outcome in outcomes:
        for support in supports:
            print(gender, outcome, support)
            t = parental_df.query("gender_W == @gender")
            control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_academi2 partner_academi3 partner_jobsta2 partner_jobsta3 partner_jobsta4"
            command = f"reg {outcome} rear_load {support} c.rear_load#c.{support}  {control} if (married_child == 3 | married_child == 1), vce(cluster mail_school)"

            stata.run("clear")
            stata.pdataframe_to_data(t)
            stata.run(command)
            
            # collect results
            results = stata.get_return()["r(table)"].T
            results = pd.DataFrame(results).iloc[0:3, :6]
            results.columns = ["coef", "se", "t", "p", "ci_low", "ci_high"]
            results["gender"] = gender
            results["outcome"] = outcome
            results["support"] = support
            results['var'] = ['rear_load', support, f'rear_load#{support}']
            result_CE.append(results)
            
            command = f"margins, dydx(rear_load) at({support}=(0 1)) post"
            stata.run(command)
            results = stata.get_return()["r(table)"].T
            results = pd.DataFrame(results).iloc[0:2, :6]
            results.columns = ["dy/dx", "se", "t", "p", "ci_low", "ci_high"]
            results["gender"] = gender
            results["outcome"] = outcome
            results["support"] = support
            results['var'] = [0, 1]
            result_Margin.append(results)
            print(results)
result_CE = pd.concat(result_CE)
result_Margin = pd.concat(result_Margin)

0 res_ach_7 Cdcare

Linear regression                               Number of obs     =      2,366
                                                F(17, 441)        =       9.56
                                                Prob > F          =     0.0000
                                                R-squared         =     0.0772
                                                Root MSE          =     1.6186

                          (Std. err. adjusted for 442 clusters in mail_school)
------------------------------------------------------------------------------
             |               Robust
   res_ach_7 | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
   rear_load |  -.0372154   .0308279    -1.21   0.228    -.0978033    .0233725
      Cdcare |   .3034484   .1833436     1.66   0.099    -.0568874    .6637841
             |
 c.rear_load#|
    c.Cdcare |  -.1420594   .0879119    -1.6

In [43]:
#put column "var" in the first place
cols = result_CE.columns.tolist()
cols = cols[-1:] + cols[:-1]
result_CE = result_CE[cols]

#Generate a new column
result_CE['group_id'] = result_CE.groupby(['gender', 'outcome', 'support']).ngroup() + 1

# Ensure group_id is sequential by sorting
result_CE = result_CE.sort_values(by=['group_id']).reset_index(drop=False)

#resort the sequence of rows by index in each group
result_CE = result_CE.sort_values(by=['group_id', 'index']).reset_index(drop=True)
result_CE.to_excel(os.path.join(data_dir,"Q3_reg_total_sample.xlsx"), index=False)

In [42]:
result_Margin.to_excel(os.path.join(data_dir,"Q3_margin_total_sample.xlsx"), index=False)

In [47]:
# read excel
df = pd.read_excel(os.path.join(data_dir,"Q3_margin_total_sample.xlsx"))
df

Unnamed: 0,dy/dx,se,t,p,ci_low,ci_high,gender,outcome,support,var
0,-0.037215,0.030828,-1.207197,0.228003,-0.097803,0.023373,0,res_ach_7,Cdcare,0
1,-0.179275,0.080916,-2.215573,0.027231,-0.338303,-0.020246,0,res_ach_7,Cdcare,1
2,-0.036838,0.031143,-1.182876,0.237496,-0.098044,0.024369,0,res_ach_7,MatLeave,0
3,-0.179462,0.070924,-2.530321,0.011743,-0.318853,-0.040070,0,res_ach_7,MatLeave,1
4,-0.059493,0.032330,-1.840197,0.066411,-0.123032,0.004046,0,res_ach_7,PauTen,0
...,...,...,...,...,...,...,...,...,...,...
91,-0.051569,0.029764,-1.732593,0.083877,-0.110069,0.006930,1,uni_cola,MatLeave,1
92,-0.021589,0.026920,-0.801946,0.423022,-0.074498,0.031321,1,uni_cola,PauTen,0
93,-0.050046,0.033885,-1.476957,0.140411,-0.116644,0.016552,1,uni_cola,PauTen,1
94,-0.018463,0.027212,-0.678491,0.497821,-0.071946,0.035020,1,uni_cola,FlexSche,0


## The mediating effect of childcare responsibility on gender gaps in academic achievements

In [17]:
result_SEM = []
result_Media = []
var_list = ["gender_W", "rear_load", "gender_W"]
for outcome in outcomes:
    print(outcome)
    control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_academi2 partner_academi3 partner_jobsta2 partner_jobsta3 partner_jobsta4"
    command = f"bootstrap, reps(5000) seed(1234): sem (gender_W {control} -> rear_load,) (rear_load gender_W {control} -> {outcome},) if  (married_child == 3 | married_child == 1) & valid_sample == 1, vce(cluster mail_school) nocapslatent"
    stata.run("clear")
    stata.pdataframe_to_data(parental_df)
    stata.run(command)
    # collect results
    results = stata.get_return()["r(table)"].T
    results = pd.DataFrame(results).iloc[[0, 16, 17], :6]
    results.columns = ["coef", "se", "z", "p", "ci_low", "ci_high"]
    results["outcome"] = outcome
    results["var"] = var_list
    result_SEM.append(results)
    
    command = f"nlcom (_b[rear_load:gender_W])*(_b[{outcome}:rear_load])"
    stata.run(command)
  
result_SEM = pd.concat(result_SEM)

res_ach_7
(running sem on estimation sample)

Bootstrap replications (5,000)
----+--- 1 ---+--- 2 ---+--- 3 ---+--- 4 ---+--- 5 
..................................................    50
..................................................   100
..................................................   150
..................................................   200
..................................................   250
..................................................   300
..................................................   350
..................................................   400
..................................................   450
..................................................   500
..................................................   550
..................................................   600
..................................................   650
..................................................   700
..................................................   750
................

In [None]:
    command = f"nlcom (_b[rear_load:gender_W])*(_b[{outcome}:rear_load])"
    stata.run(command)
    results = stata.get_return()["e(table)"].T
    results = pd.DataFrame(results).iloc[0, :6]
    results.columns = ["Coef", "se", "z", "p", "ci_low", "ci_high"]
    results["outcome"] = outcome
    results["var"] = ["mediating effect"]
    result_Media.append(results)
    print(results)
    break

In [18]:
result_SEM

Unnamed: 0,coef,se,z,p,ci_low,ci_high,outcome,var
0,0.121416,0.033055,3.673085,0.0002396401,0.056628,0.186203,res_ach_7,gender_W
16,-0.083147,0.021334,-3.897416,9.722459e-05,-0.12496,-0.041333,res_ach_7,rear_load
17,-0.25277,0.051026,-4.953708,7.281263e-07,-0.352779,-0.15276,res_ach_7,gender_W
0,0.121416,0.033055,3.673085,0.0002396401,0.056628,0.186203,car_dev_7,gender_W
16,-0.103592,0.019992,-5.181742,2.198226e-07,-0.142775,-0.064409,car_dev_7,rear_load
17,-0.065743,0.046791,-1.405036,0.1600106,-0.157452,0.025966,car_dev_7,gender_W
0,0.121416,0.033055,3.673085,0.0002396401,0.056628,0.186203,contrib_7,gender_W
16,-0.083395,0.017758,-4.696264,2.649635e-06,-0.1182,-0.048591,contrib_7,rear_load
17,-0.168895,0.042276,-3.995052,6.468006e-05,-0.251754,-0.086035,contrib_7,gender_W
0,0.121416,0.033055,3.673085,0.0002396401,0.056628,0.186203,rel_npap,gender_W


In [19]:
result_SEM.to_excel(os.path.join(data_dir,"Q3_SEM_total_sample.xlsx"), index=False)

In [9]:
# SEM for different partner job status
condition1 = ['partner_jobsta == 1', 'partner_jobsta == 2', 'partner_jobsta == 3']
result_SEM_status = []
var_list = ["gender_W", "rear_load", "gender_W"]
for outcome in outcomes:
    for condition in condition1:
        print(condition, outcome)
        t = parental_df.query(condition)
        control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_academi2 partner_academi3"
        command = f"bootstrap, reps(5000) seed(1234): sem (gender_W {control} -> rear_load,) (rear_load gender_W {control} -> {outcome},) if  (married_child == 3 | married_child == 1) & valid_sample == 1, vce(cluster mail_school) nocapslatent"
        stata.run("clear")
        stata.pdataframe_to_data(t)
        stata.run(command)
        # collect results
        results = stata.get_return()["r(table)"].T
        results = pd.DataFrame(results).iloc[[0, 13, 14], :6]
        results.columns = ["coef", "se", "z", "p", "ci_low", "ci_high"]
        results["outcome"] = outcome
        results["condition"] = condition
        results["var"] = var_list
        result_SEM_status.append(results)

        command = f"nlcom (_b[rear_load:gender_W])*(_b[{outcome}:rear_load])"
        stata.run(command)

result_SEM_status = pd.concat(result_SEM_status)


partner_jobsta == 1 res_ach_7
(running sem on estimation sample)

Bootstrap replications (5,000)
----+--- 1 ---+--- 2 ---+--- 3 ---+--- 4 ---+--- 5 
..................................................    50
..................................................   100
..................................................   150
..................................................   200
..................................................   250
..................................................   300
..................................................   350
..................................................   400
..................................................   450
..................................................   500
..................................................   550
..................................................   600
..................................................   650
..................................................   700
..................................................   

In [16]:
# to excel
result_SEM_status.to_excel(os.path.join(data_dir,"Q3_SEM_status.xlsx"), index=False)

In [17]:
# SEM for different partner academic type
condition2 = ['partner_academi == 1', 'partner_academi == 2']
result_SEM_type = []
var_list = ["gender_W", "rear_load", "gender_W"]
for outcome in outcomes:
    for condition in condition2:
        print(condition, outcome)
        t = parental_df.query(condition)
        control = "Art Med Soc Interdisc rank_Ear rank_Mid rank_Lat is_white child_num partner_jobsta2 partner_jobsta3 partner_jobsta4"
        command = f"bootstrap, reps(5000) seed(1234): sem (gender_W {control} -> rear_load,) (rear_load gender_W {control} -> {outcome},) if  (married_child == 3 | married_child == 1) & valid_sample == 1, vce(cluster mail_school) nocapslatent"
        stata.run("clear")
        stata.pdataframe_to_data(t)
        stata.run(command)
        # collect results
        results = stata.get_return()["r(table)"].T
        results = pd.DataFrame(results).iloc[[0, 14, 15], :6]
        results.columns = ["coef", "se", "z", "p", "ci_low", "ci_high"]
        results["outcome"] = outcome
        results["condition"] = condition
        results["var"] = var_list
        result_SEM_type.append(results)

        command = f"nlcom (_b[rear_load:gender_W])*(_b[{outcome}:rear_load])"
        stata.run(command)

result_SEM_type = pd.concat(result_SEM_type)

partner_academi == 1 res_ach_7
(running sem on estimation sample)

Bootstrap replications (5,000)
----+--- 1 ---+--- 2 ---+--- 3 ---+--- 4 ---+--- 5 
..................................................    50
..................................................   100
..................................................   150
..................................................   200
..................................................   250
..................................................   300
..................................................   350
..................................................   400
..................................................   450
..................................................   500
..................................................   550
..................................................   600
..................................................   650
..................................................   700
..................................................  

In [18]:
# to excel
result_SEM_type.to_excel(os.path.join(data_dir,"Q3_SEM_type.xlsx"), index=False)

In [48]:
# Create a subplot figure
fig = sp.make_subplots(rows=6, cols=4, shared_yaxes=True,
                       vertical_spacing=0.05, horizontal_spacing=0.05,
                       column_titles=["Childcare Support", "Paternity Leave", "Paused Tenure", "Flexible Schedule"])

# Define support categories for subplots
support_categories = ["Cdcare", "MatLeave", "PauTen", "FlexSche"]
outcome_categories = ["res_ach_7", "car_dev_7", "contrib_7", "rel_npap", "rel_cite", "uni_cola"]
outcome_text = ["Research<br>Satisfaction", "Career<br>Satisfaction", "Community<br>Recognition", "ARP", "ARC", "ARCo"]


# Add traces for each subplot
for o, outcome in enumerate(outcome_categories):
    for i, support in enumerate(support_categories):
        for gender in [0, 1]:
            # Filter data by support and gender
            plot_data = df[(df['support'] == support) & (df['gender'] == gender) & (df['outcome'] == outcome)]
            label = "Men" if gender == 0 else "Women"
            color = "#1f77b4" if gender == 0 else "#ffcc00"
            
            # Add line with markers
            fig.add_trace(go.Scatter(
                x=["No", "Yes"],  # Replace var 0 and 1
                y=plot_data["dy/dx"],
                mode='lines+markers',
                showlegend=False,  # Do not show in legend
                name=f"{label} ({support})",
                line=dict(width=2, color=color),
                marker=dict(size=8, color=color)
            ), row=o + 1, col=i + 1)
            
            # Add confidence interval as error bars
            fig.add_trace(go.Scatter(
                x=["No", "Yes"],
                y=plot_data["dy/dx"],
                mode='markers',
                error_y=dict(
                    type='data',
                    symmetric=False,
                    array=plot_data["ci_high"] - plot_data["dy/dx"],
                    arrayminus=plot_data["dy/dx"] - plot_data["ci_low"],
                    thickness=1.5,
                    width=3,
                    color=color
                ),
                showlegend=False,  # Do not show in legend
                name=f"{label} CI ({support})",
                marker=dict(size=8, opacity=0)  # Hide markers for CI
            ), row=o + 1, col=i + 1)
            
            if i == 0:
                fig.update_yaxes(title=dict(text=outcome_text[o]), row=o + 1, col=1)
                
            fig.update_yaxes(ticklen=0, zeroline=True, zerolinecolor="lightgray", row=o + 1, col=i + 1)
            
# Add dummy traces for legend
fig.add_trace(go.Scatter(
    x=[None],
    y=[None],
    mode='lines',
    name="Men",
    line=dict(width=2, color="#1f77b4")
))

fig.add_trace(go.Scatter(
    x=[None],
    y=[None],
    mode='lines',
    name="Women",
    line=dict(width=2, color="#ffcc00")
))

# Update layout
fig.update_layout(
    font=dict(size=11, family="Arial"),
    xaxis_title="",
    template="simple_white",
    showlegend=True,
    height=800,
    width=700
)
    
fig.show()


In [None]:
# Define readable mappings
gender_map = {0: "Men", 1: "Women"}
area_map = {1: "Arts & Humanities", 2: "Medical Sciences", 3: "Natural Science & Engineering", 4: "Social Sciences", 5: "Interdisciplinary"}

# Apply filter for specific married_child categories
filtered_df = parental_df[parental_df["married_child"].isin([1, 3])].copy()

# Map gender and discipline names
filtered_df["Gender"] = filtered_df["gender_W"].map(gender_map)
filtered_df["Discipline"] = filtered_df["area_new"].map(area_map)

# Ensure Discipline follows the given order
discipline_order = list(area_map.values())
filtered_df["Discipline"] = pd.Categorical(filtered_df["Discipline"], categories=discipline_order, ordered=True)

# Define the correct rank order
rank_order = ["Trainee", "Early Career", "Middle Career", "Late Career"]
filtered_df["rank_n"] = pd.Categorical(filtered_df["rank_n"], categories=rank_order, ordered=True)

# Create a pivot table for absolute counts
count_table = filtered_df.pivot_table(index=["Discipline", "Gender"], 
                                      columns="rank_n", 
                                      aggfunc="size", 
                                      fill_value=0)

# Add a total count per Discipline × Gender
count_table["Total"] = count_table.sum(axis=1)

# Calculate the percentage distribution per (Discipline, rank_n)
percentage_table = count_table.div(count_table.groupby("Discipline").transform("sum")) * 100

# Convert counts and percentages to string format
formatted_table = count_table.astype(str)  
percentage_table = percentage_table.round(1).astype(str)

# Merge both tables for a structured display
for col in rank_order + ["Total"]:  # Ensure percentages align with count columns
    formatted_table[col + " (%)"] = percentage_table[col]

# **Reorder columns to follow the desired sequence**
ordered_columns = []
for col in rank_order:
    ordered_columns.append(col)
    ordered_columns.append(col + " (%)")

# Append "Total" at the end
ordered_columns.append("Total")
ordered_columns.append("Total (%)")

# Reorder the DataFrame columns
formatted_table = formatted_table[ordered_columns]

In [50]:
formatted_table

Unnamed: 0_level_0,rank_n,Trainee,Trainee (%),Early Career,Early Career (%),Middle Career,Middle Career (%),Late Career,Late Career (%),Total,Total (%)
Discipline,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Arts & Humanities,Men,2,50.0,10,27.8,41,29.7,98,49.0,151,39.9
Arts & Humanities,Women,2,50.0,26,72.2,97,70.3,102,51.0,227,60.1
Medical Sciences,Men,16,29.1,84,30.9,119,31.2,221,50.6,440,38.4
Medical Sciences,Women,39,70.9,188,69.1,262,68.8,216,49.4,705,61.6
Natural Science & Engineering,Men,66,49.6,111,53.1,317,56.2,527,65.5,1021,59.7
Natural Science & Engineering,Women,67,50.4,98,46.9,247,43.8,278,34.5,690,40.3
Social Sciences,Men,16,20.3,101,30.2,221,30.9,430,49.3,768,38.4
Social Sciences,Women,63,79.7,233,69.8,495,69.1,442,50.7,1233,61.6
Interdisciplinary,Men,19,33.3,30,26.5,49,34.0,61,40.7,159,34.3
Interdisciplinary,Women,38,66.7,83,73.5,95,66.0,89,59.3,305,65.7


In [71]:
# Define readable mappings
gender_map = {0: "Men", 1: "Women"}

# Apply filter for specific married_child categories
filtered_df = parental_df[parental_df["married_child"].isin([1, 3])].copy()

# Map gender names
filtered_df["Gender"] = filtered_df["gender_W"].map(gender_map)

# Create a pivot table with `partner_jobsta` as index and gender counts as columns
count_table = filtered_df.pivot_table(index="partner_jobsta", 
                                      columns="Gender", 
                                      aggfunc="size", 
                                      fill_value=0)

# Add a total count per `partner_jobsta`
count_table["Total"] = count_table.sum(axis=1)


In [74]:
# Define readable mappings
gender_map = {0: "Men", 1: "Women"}

# Apply filter for specific married_child categories
filtered_df = parental_df[parental_df["married_child"].isin([1, 3])].copy()

# Map gender names
filtered_df["Gender"] = filtered_df["gender_W"].map(gender_map)

# Create a pivot table with `partner_jobsta` as index and gender counts as columns
count_table = filtered_df.pivot_table(index="partner_academi", 
                                      columns="Gender", 
                                      aggfunc="size", 
                                      fill_value=0)

# Add a total count per `partner_jobsta`
count_table["Total"] = count_table.sum(axis=1)

In [75]:
count_table

Gender,Men,Women,Total
partner_academi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,789,1104,1893
2.0,1717,2005,3722
3.0,28,22,50
