In [4]:
import pandas as pd
import networkx as nx
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Construction Specific Analysis

In this notebook, we will be doing the analysis to find construction - specific effects. As a reminder, this consists of three main parts:

1. Crafting our transfer matrix
2. Network Analysis
3. Asymmetry Analysis/Tiering

First, however, let us load in the data we are interested in. For now this will just be the single-clause single-construction generalization data. However, we eventually want to expand to the multi-clause setting too.

In [5]:
embedded = False
size = "1.4b"

In [6]:
data = pd.read_parquet(f"../results/generalization/embedded_clause_single_construction_classic_{size}.parquet") if embedded else \
    pd.read_parquet(f"../results/generalization/single_clause_single_construction_classic_{size}.parquet")

## 1. Crafting the Transfer Matrix

In this section, we will craft the transfer matrix. This is a $(n+2) * (n+2)$ matrix, in which the rows and columns represent the $n$ constructions and $2$ controls we are evaluating the transfer into. We construct this matrix below.

### Step One: Filter by cases where the animacy conditions are the same.

In [7]:
if embedded:
    data = data[(data["animacy_condition"] != "control") | 
                (data["animacy_condition"] != "control_lexical")]
else:
    data = data[(data["animacy_condition"] != "embedded_control") | 
                (data["animacy_condition"] != "embedded_control_lexical")]

### Step Two: Average the values by parent construction.
This means that the item representing, for example, *Embedded Finite Wh - Questions* → *Embedded Non-Finite Wh - Questions* will be equal to the average value of :
1. Animate Embedded Finite Wh - Questions → Animate Embedded Non-Finite Wh - Questions 
2. Inanimate Embedded Finite Wh - Questions → inanimate Embedded Non-Finite Wh - Questions

In [8]:
data.head()

Unnamed: 0,pos,from,to,model,seed,leave_out,single_double,max_avg,animate_from,animate_to,parent_construction_from,parent_construction_to,animacy_condition,normal
0,{filler},cleft_animate,cleft_animate,pythia-1.4b,41,False,False,3.988794,True,True,cleft,cleft,SameDataset_SameAnimacy,1.0
1,{filler},cleft_animate,cleft_inanimate,pythia-1.4b,41,False,False,3.781297,True,False,cleft,cleft,SameDataset_DiffAnimacy,0.94798
2,{filler},cleft_animate,control,pythia-1.4b,41,False,False,0.138282,True,True,cleft,control,control,0.034668
3,{filler},cleft_animate,control_lexical,pythia-1.4b,41,False,False,0.040439,True,True,cleft,control_lexical,control_lexical,0.010138
4,{filler},cleft_animate,embedded_wh_finite_animate,pythia-1.4b,41,False,False,0.799119,True,True,cleft,embedded_wh_finite,DiffDataset_SameAnimacy,0.200341


In [9]:
from_parents = pd.unique(data["parent_construction_from"])
to_parents = pd.unique(data["parent_construction_to"])
unique_pos = pd.unique(data["pos"])

transfer_data = []

for pos in unique_pos:
    for f_parent in from_parents:
        for t_parent in to_parents:
            filtered_data = data[(data["pos"] == pos) & 
                                 (data["parent_construction_from"] == f_parent) & 
                                 (data["parent_construction_to"] == t_parent)]
            
            if not filtered_data.empty:
                normal_val = filtered_data["normal"].mean()
                normal_std = filtered_data["normal"].std()
                
                transfer_data.append({
                    "pos": pos,
                    "parent_from": f_parent,
                    "parent_to": t_parent,
                    "mean_normal": normal_val,
                    "std_normal": normal_std,
                    "data_points": filtered_data["normal"].tolist(),
                    "n_samples": len(filtered_data)
                })

transfer_df = pd.DataFrame(transfer_data)
            


Now transfer_df is our transfer matrix! Let's take a look at it.

In [10]:
transfer_df

Unnamed: 0,pos,parent_from,parent_to,mean_normal,std_normal,data_points,n_samples
0,{filler},cleft,cleft,0.945206,0.078820,"[1.0, 0.9479801325466016, 0.8328432313463736, ...",4
1,{filler},cleft,control,0.046139,0.016223,"[0.03466758061428266, 0.057610572646662375]",2
2,{filler},cleft,control_lexical,0.009132,0.001423,"[0.010138166441087537, 0.008125406540241074]",2
3,{filler},cleft,embedded_wh_finite,0.089693,0.078533,"[0.2003410966874174, 0.03012053132554714, 0.09...",4
4,{filler},cleft,embedded_wh_nonfinite,0.057269,0.068639,"[0.15624271390433356, 0.007943775647185933, 0....",4
...,...,...,...,...,...,...,...
247,{verb},wh_question,embedded_wh_nonfinite,0.211237,0.110127,"[0.2557628494508001, 0.06806425463981211, 0.19...",4
248,{verb},wh_question,pseudo_cleft,0.841505,0.244043,"[1.0384959650777246, 0.6163913357317051, 0.644...",4
249,{verb},wh_question,restricted_rc,0.224899,0.137247,"[0.23430738971416312, 0.06704753584215735, 0.1...",4
250,{verb},wh_question,topicalization,0.744543,0.252308,"[0.9495505102545027, 0.5333496301381966, 0.519...",4


## 2: Network Analysis

Now, we want to do our network analysis. The transfer matrix serves as an adjacency matrix, and now we want to calculate a host of metrics at different thresholds.

In [11]:
def calculate_centrality(adj_matrix):
    G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
    G.remove_edges_from(list(nx.selfloop_edges(G)))
    
    # Calculate centrality measures
    out_centrality = nx.out_degree_centrality(G)
    in_centrality = nx.in_degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    
    return {
        "out_centrality": out_centrality,
        "in_centrality": in_centrality,
        "betweenness_centrality": betweenness_centrality
    }

In [12]:
def get_control_threshold(df, parent_from, type = "both"):
    relevant_controls = df[(df["parent_from"] == parent_from) &
                           (df["parent_to"] == "control")]
    relevant_controls_lexical = df[(df["parent_from"] == parent_from) &
                           (df["parent_to"] == "control_lexical")]
    
    if type == "lexical":
        return relevant_controls_lexical["mean_normal"].max()
    elif type == "normal":
        return relevant_controls["mean_normal"].max()
    else:
        return max(relevant_controls["mean_normal"].max(), 
               relevant_controls_lexical["mean_normal"].max())

In [13]:
def create_adjacency_matrix(df, threshold = None):
    adj_matrix = pd.DataFrame(0, index=df["parent_from"].unique(), columns=df["parent_to"].unique())
    adj_matrix = adj_matrix.astype(float)

    parents_from = pd.unique(df["parent_from"])
    parents_to = pd.unique(df["parent_to"])

    for pf in parents_from:
        
        max_control = get_control_threshold(df, pf)
        
        threshold = max_control if threshold is None else threshold

        for pt in parents_to:
            filtered_data = df[(df["parent_from"] == pf) & 
                               (df["parent_to"] == pt)]
            adj_matrix.loc[pf, pt] = float(filtered_data["mean_normal"].mean()) if filtered_data["mean_normal"].mean() > threshold else float(0.0)
    
    return adj_matrix

In [14]:
def collect_centrality_across_thresholds(pos_df, thresholds, from_parents):
    
    thresholds_list = []
    out_centrality_values = {parent: [] for parent in from_parents}
    in_centrality_values = {parent: [] for parent in from_parents}
    betweenness_centrality_values = {parent: [] for parent in from_parents}

    for threshold in thresholds:
        thresholds_list.append(threshold)
        adj_matrix = create_adjacency_matrix(pos_df, threshold)
        centrality_measures = calculate_centrality(adj_matrix)
        for parent in from_parents:
            out_centrality_values[parent].append(
                centrality_measures['out_centrality'].get(parent, 0)
            )
            in_centrality_values[parent].append(
                centrality_measures['in_centrality'].get(parent, 0)
            )
            betweenness_centrality_values[parent].append(
                centrality_measures['betweenness_centrality'].get(parent, 0)
            )
    
    return {
        "thresholds_list": thresholds_list,
        "out_centrality_values": out_centrality_values,
        "in_centrality_values": in_centrality_values,
        "betweenness_centrality_values": betweenness_centrality_values
    }

In [15]:
def plot_single_centrality_measure(
    thresholds_list, 
    centrality_values, 
    from_parents, 
    control_threshold_normal, 
    control_threshold_lexical,
    title,
    ylabel
):
    for parent in from_parents:
        plt.plot(thresholds_list, centrality_values[parent], marker='o', label=parent)
        # Mark control threshold values
        plt.axvline(x=control_threshold_normal[parent], color='gray', linestyle='--', alpha=0.5)
        plt.axvline(x=control_threshold_lexical[parent], color='lightgray', linestyle=':', alpha=0.5)
    
    plt.title(title)
    plt.xlabel('Threshold')
    plt.ylabel(ylabel)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, linestyle='--', alpha=0.7)

In [16]:
def plot_centrality_measures(
    thresholds_list, 
    out_centrality_values, 
    in_centrality_values, 
    betweenness_centrality_values,
    control_threshold_normal,
    control_threshold_lexical,
    from_parents,
    pos
):
    # Create a figure with multiple subplots for each centrality measure
    plt.figure(figsize=(18, 15))
    
    # Plot out-degree centrality
    plt.subplot(3, 1, 1)
    plot_single_centrality_measure(
        thresholds_list, 
        out_centrality_values, 
        from_parents, 
        control_threshold_normal, 
        control_threshold_lexical,
        'Out-degree Centrality vs Threshold',
        'Out-degree Centrality'
    )
    
    # Plot in-degree centrality
    plt.subplot(3, 1, 2)
    plot_single_centrality_measure(
        thresholds_list, 
        in_centrality_values, 
        from_parents, 
        control_threshold_normal, 
        control_threshold_lexical,
        'In-degree Centrality vs Threshold',
        'In-degree Centrality'
    )
    
    # Plot betweenness centrality
    plt.subplot(3, 1, 3)
    plot_single_centrality_measure(
        thresholds_list, 
        betweenness_centrality_values, 
        from_parents, 
        control_threshold_normal, 
        control_threshold_lexical,
        'Betweenness Centrality vs Threshold',
        'Betweenness Centrality'
    )
    
    plt.suptitle(f'Centrality Measures for {pos}', fontsize=16)
    plt.tight_layout()
    plt.show()

In [17]:
def create_centrality_plot_for_pos(pos, transfer_df):
    pos_df = transfer_df[transfer_df["pos"] == pos]
    
    thresholds = np.linspace(0, 1, 101)

    control_threshold_lexical = {
        parent_from: get_control_threshold(pos_df, parent_from, "lexical") for parent_from in from_parents
    }
    control_threshold_normal = {
        parent_from: get_control_threshold(pos_df, parent_from, "normal") for parent_from in from_parents
    }
    

    centrality_data = collect_centrality_across_thresholds(pos_df, thresholds, from_parents)
    
    plot_centrality_measures(
        centrality_data["thresholds_list"],
        centrality_data["out_centrality_values"],
        centrality_data["in_centrality_values"],
        centrality_data["betweenness_centrality_values"],
        control_threshold_normal,
        control_threshold_lexical,
        from_parents,
        pos
    )

In [18]:
def get_auc(centrality_values, thresholds, control_threshold = False):
    # Calculate the area under the curve (AUC) for each centrality measure
    auc_values = {}
    for parent, values in centrality_values.items():
        control_threshold = 0 if not control_threshold else get_control_threshold(transfer_df[transfer_df["pos"] == pos], parent, "both")
        valid_indices = [i for i, t in enumerate(thresholds) if t >= control_threshold]
        
        if valid_indices:
            # Get the valid thresholds and corresponding values
            valid_thresholds = [thresholds[i] for i in valid_indices]
            valid_values = [values[i] for i in valid_indices]
            
            # Calculate AUC for the valid portion
            auc = np.trapz(valid_values, valid_thresholds)
        else:
            auc = 0.0
        auc_values[parent] = auc
    return auc_values

def plot_auc(auc_values, title):
    plt.figure(figsize=(10, 6))
    for parent, auc in auc_values.items():
        plt.bar(parent, auc)
    
    plt.title(title)
    plt.xlabel('Parent Construction')
    plt.ylabel('AUC')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

Now that we have set up our helper functions, we can get our AUC's, and save them as csv's.

In [21]:
if embedded:
    num_panels = 10
else:
    num_panels = 5

# CHANGE THIS TO TRUE IF YOU WANT TO USE CONTROL THRESHOLD (I.E. only keep values above the control)
control_threshold = False

# # Export CSVs with AUC data for each centrality type
def save_auc_csv(centrality_key, filename):
    records = []
    for pos in unique_pos[:num_panels-1]:
        data = collect_centrality_across_thresholds(
            transfer_df[transfer_df["pos"] == pos],
            np.linspace(0, 2, 101),
            from_parents
        )
        auc_vals = get_auc(data[f"{centrality_key}_values"], data["thresholds_list"], control_threshold)
        max_v = max(auc_vals.values()) if auc_vals else 0
        for parent, auc in auc_vals.items():
            norm = (auc / max_v) if max_v > 0 else auc
            records.append({
                "position": pos,
                "parent": parent,
                "auc": auc,
                "normalized_auc": norm
            })
    pd.DataFrame(records).to_csv(filename, index=False)
    print(f"Saved {filename}")

# Generate CSVs
label = "" if not embedded else "embedded_"

save_auc_csv("out_centrality", f"../results/generalization/{label}out_degree_auc_{size}.csv")
save_auc_csv("in_centrality", f"../results/generalization/{label}in_degree_auc_{size}.csv")


  auc = np.trapz(valid_values, valid_thresholds)
  auc = np.trapz(valid_values, valid_thresholds)
  auc = np.trapz(valid_values, valid_thresholds)
  auc = np.trapz(valid_values, valid_thresholds)


Saved ../results/generalization/out_degree_auc_1.4b.csv


  auc = np.trapz(valid_values, valid_thresholds)
  auc = np.trapz(valid_values, valid_thresholds)
  auc = np.trapz(valid_values, valid_thresholds)


Saved ../results/generalization/in_degree_auc_1.4b.csv


  auc = np.trapz(valid_values, valid_thresholds)
