# Inputs & outputs
- **Inputs:** `final_data/raw_data_encrypted_final.csv.zip` and `final_data/AI_010_data_uq.parquet` with user demographics and AI usage.
- **Outputs:** gender and experience comparisons (tables and plots) that feed the corresponding figure panels.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pyfixest as pf

In [None]:



# Matplotlib rcParams overrides used for the figures in this notebook.
# Each theme is kept in its own small mapping and merged below, in this order.
_font_sizes = {
    "axes.labelsize": 25,
    "axes.titlesize": 20,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
}

_lines_and_markers = {
    "lines.linewidth": 3,
    "lines.markersize": 8,
    "lines.color": "black",
    "errorbar.capsize": 5,
}

_axes_and_spines = {
    "axes.edgecolor": "black",
    "axes.linewidth": 2,
}

_ticks = {
    "xtick.color": "black",
    "ytick.color": "black",
    "xtick.major.width": 1.2,
    "ytick.major.width": 1.2,
}

_grid = {
    "axes.grid": True,
    "grid.color": "gray",
    "grid.linewidth": 0.7,
    "grid.linestyle": "--",
    "grid.alpha": 0.6,
}

_figure = {
    "figure.figsize": (12, 8),
    "figure.dpi": 300,
    "figure.facecolor": "white",
}

custom_style = {
    **_font_sizes,
    **_lines_and_markers,
    **_axes_and_spines,
    **_ticks,
    **_grid,
    **_figure,
}


# Directory holding the input files, given relative to the working directory.
path_to_data = "./final_data"

# Install the custom style into matplotlib's global rcParams.
plt.rcParams.update(custom_style)

In [None]:


# =============================================================================
# Load data
# =============================================================================
raw_data_file = os.path.join(path_to_data, "raw_data_encrypted_final.csv.zip")
gender = pd.read_csv(raw_data_file)
gender.head()  # peek at the first rows; the result is not used further


# =============================================================================
# Filter sample
# =============================================================================
# Parse the quarter labels as quarterly periods so they can be compared in time.
q = pd.PeriodIndex(gender["quarter"], freq="Q")

# A row is kept only when all three conditions hold.
after_2024q1 = q > pd.Period("2024Q1")        # strictly later than 2024Q1
gender_known = gender["gender"].notna()       # gender is not missing
ai_share_known = gender["ai_share"].notna()   # outcome is not missing
mask = after_2024q1 & gender_known & ai_share_known

# Copy so the filtered frame is independent of the frame read from disk.
gender = gender.loc[mask].copy()


# =============================================================================
# Collapse to one row per user
# =============================================================================
def _modal_gender(values):
    """Return the most frequent entry of *values*, or its first entry if no mode exists.

    When several entries tie, the first one returned by ``Series.mode()`` is used.
    """
    modes = values.mode()
    if modes.empty:
        return values.iloc[0]
    return modes.iat[0]


# One row per `user_hashed`: the mean of `ai_share` and a single gender value.
user_level = (
    gender
    .groupby("user_hashed")
    .agg(
        ai_share_mean=pd.NamedAgg(column="ai_share", aggfunc="mean"),
        gender=pd.NamedAgg(column="gender", aggfunc=_modal_gender),
    )
    .reset_index()
)


# =============================================================================
# Variable preparation
# =============================================================================
# Store gender as a pandas categorical.
user_level = user_level.assign(gender=user_level["gender"].astype("category"))


# =============================================================================
# Regression: user-level averages
# =============================================================================
# Regress each user's mean AI share on the gender categories.
gender_formula = "ai_share_mean ~ C(gender)"
model = pf.feols(gender_formula, user_level)

# Request HC1 standard errors for the fitted model.
model_rob = model.vcov("HC1")


# =============================================================================
# Regression table
# =============================================================================
# NOTE(review): confirm that the installed pyfixest version accepts `title`
# as a keyword of `etable`.
table_options = {
    "title": "Effect of Gender on Mean AI Share (user-level)",
    "notes": "Outcome averaged per user. HC1 robust standard errors.",
}
pf.etable(model_rob, **table_options)

In [None]:
users_file = os.path.join(path_to_data, "AI_010_data_uq.parquet")
users = pd.read_parquet(users_file)
users.head()  # peek at the first rows; the result is not used further


# =============================================================================
# Restrict to users present in AI_010 dataset
# =============================================================================
# Distinct user hashes found in the AI_010 file.
keep = users["IDu_hash"].unique().tolist()

# Keep only the rows whose user hash appears in that list.
in_ai_010 = gender["user_hashed"].isin(keep)
gender = gender[in_ai_010]


# =============================================================================
# Imports (none needed here: os, pandas and pyfixest are imported in the first cell)
# =============================================================================


# =============================================================================
# Collapse to one row per user
# =============================================================================
def _modal_gender(values):
    """Return the most frequent entry of *values*, or its first entry if no mode exists.

    When several entries tie, the first one returned by ``Series.mode()`` is used.
    """
    modes = values.mode()
    if modes.empty:
        return values.iloc[0]
    return modes.iat[0]


# One row per `user_hashed`: the mean of `ai_share` and a single gender value.
user_level = (
    gender
    .groupby("user_hashed")
    .agg(
        ai_share_mean=pd.NamedAgg(column="ai_share", aggfunc="mean"),
        gender=pd.NamedAgg(column="gender", aggfunc=_modal_gender),
    )
    .reset_index()
)


# =============================================================================
# Variable preparation
# =============================================================================
# Store gender as a pandas categorical.
user_level = user_level.assign(gender=user_level["gender"].astype("category"))


# =============================================================================
# Regression: user-level averages
# =============================================================================
# Regress each user's mean AI share on the gender categories.
gender_formula = "ai_share_mean ~ C(gender)"
model = pf.feols(gender_formula, user_level)

# Request HC1 standard errors for the fitted model.
model_rob = model.vcov("HC1")


# =============================================================================
# Regression table
# =============================================================================
# NOTE(review): confirm that the installed pyfixest version accepts `title`
# as a keyword of `etable`.
table_options = {
    "title": "Effect of Gender on Mean AI Share (user-level)",
    "notes": "Outcome averaged per user. HC1 robust standard errors.",
}
pf.etable(model_rob, **table_options)

# Experience

In [None]:



# Matplotlib rcParams overrides used for the figures in this notebook.
# Each theme is kept in its own small mapping and merged below, in this order.
_font_sizes = {
    "axes.labelsize": 25,
    "axes.titlesize": 20,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
}

_lines_and_markers = {
    "lines.linewidth": 3,
    "lines.markersize": 8,
    "lines.color": "black",
    "errorbar.capsize": 5,
}

_axes_and_spines = {
    "axes.edgecolor": "black",
    "axes.linewidth": 2,
}

_ticks = {
    "xtick.color": "black",
    "ytick.color": "black",
    "xtick.major.width": 1.2,
    "ytick.major.width": 1.2,
}

_grid = {
    "axes.grid": True,
    "grid.color": "gray",
    "grid.linewidth": 0.7,
    "grid.linestyle": "--",
    "grid.alpha": 0.6,
}

_figure = {
    "figure.figsize": (12, 8),
    "figure.dpi": 300,
    "figure.facecolor": "white",
}

custom_style = {
    **_font_sizes,
    **_lines_and_markers,
    **_axes_and_spines,
    **_ticks,
    **_grid,
    **_figure,
}


# Install the custom style into matplotlib's global rcParams.
plt.rcParams.update(custom_style)

In [None]:

# Read the raw data file again, this time into `df`, for the experience analysis.
raw_panel_path = os.path.join(path_to_data, "raw_data_encrypted_final.csv.zip")
df = pd.read_csv(raw_panel_path)

# The first four characters of the quarter label are taken as the calendar year.
df["year"] = df["quarter"].str.slice(stop=4).astype(int)

In [None]:
# Distinct years present in the data, in ascending order.
years = sorted(df["year"].unique().tolist())
years

In [None]:
users_path = os.path.join(path_to_data, "AI_010_data_uq.parquet")
users = pd.read_parquet(users_path)
users.head()  # peek at the first rows; the result is not used further

# Distinct user hashes found in the AI_010 file.
keep = users["IDu_hash"].unique().tolist()

# Compare the number of AI_010 users with the number of users in the raw data.
len(keep), df["user_hashed"].nunique()

In [None]:
# Keep only the rows of users that appear in the AI_010 user list (`keep`).
# The explicit copy makes `df` an independent frame, so assigning columns to it
# afterwards cannot trigger pandas' SettingWithCopyWarning. This follows the
# `.loc[mask].copy()` pattern already used for the gender sample.
df = df[df['user_hashed'].isin(keep)].copy()

In [None]:
# Derive the calendar year from the first four characters of the quarter label.
df["year"] = df["quarter"].str.slice(stop=4).astype(int)



In [None]:


# =============================================================================
# Plot styling
# =============================================================================
plt.rcParams.update(custom_style)


# =============================================================================
# Parameters
# =============================================================================
# Constant subtracted from every user's mean AI share before estimation.
# NOTE(review): the table title and notes call this the 2019 baseline, but the
# value is hard-coded here rather than computed from the data -- confirm.
const = 0.1


# =============================================================================
# Loop over years and center by the 2019 baseline
# =============================================================================
# "-1" removes the intercept, so each experience bin gets its own coefficient.
experience_formula = "ai_share_centered ~ -1 + C(experience)"

# Regression table for each year, keyed by the year.
etables_by_year = {}

for year in years:
    print(f"\n--- Processing Year {year} ---")

    # Rows belonging to the current year.
    in_year = df["year"].eq(year)
    df_y = df.loc[in_year].copy()

    # One row per user: mean AI share and the first experience value of the year.
    per_user = df_y.groupby("user_hashed")
    user_means = per_user.agg(
        ai_share=pd.NamedAgg(column="ai_share", aggfunc="mean"),
        experience=pd.NamedAgg(column="experience", aggfunc="first"),
    ).reset_index()

    # Bin experience in steps of two; whole numbers are rounded up to the next
    # even value (1, 2 -> 2; 3, 4 -> 4).
    user_means["experience"] = user_means["experience"].add(1).floordiv(2).mul(2)

    # Shift the outcome by the constant defined above.
    user_means["ai_share_centered"] = user_means["ai_share"].sub(const)

    model = pf.feols(experience_formula, user_means)
    model_vcov = model.vcov("HC1")

    # NOTE(review): confirm that the installed pyfixest version accepts the
    # `title` and `return_format` keywords of `etable`.
    etables_by_year[year] = pf.etable(
        model_vcov,
        title=f"Effect of Experience on AI Share (Centered at 2019 Baseline) - {year}",
        notes="Outcome centered so 2019 baseline equals zero.",
        return_format="dataframe",
    )

In [None]:
# Show the regression table stored for 2024 (KeyError if 2024 is not among `years`).
etables_by_year[2024]

In [None]:
# Show the regression table stored for 2019 (KeyError if 2019 is not among `years`).
etables_by_year[2019]