In [None]:
import pandas as pd
import numpy as np
import os
from datetime import date
import time
import json
today = date.today()
path = os.getcwd()
print(f'üíö Today is {today}')

üíö Today is 2025-06-11


# Trying the network graphs with rearranged categories

In [47]:
## Rearranging the methods and data types
re_methods = {"Multigenerational Measures": "Others_Measure",
              "Non‚Äêparametric Approaches": "Others_Measure",
              "Others": "Others_Measure",}

re_dttype = {"Opportunity Atlas": "Linked Administrative Data",
             "International Panel Data": "Panel/Longitudinal Surveys",
             "University/Institution Data": "Others_DataType",
             "Pseudo-Panel/Household Budget Survey": "National Survey Data",
             "Archival/Historical Data": "Administrative/Registry Data",
             "Big Data": "Others_DataType",
             "Others": "Others_DataType"}

re_rqtype = {"Others": "Others_RqType"}
# *------For Research Question Types, it was too difficult to rearrange.

In [None]:
from neo4j import GraphDatabase
import pandas as pd

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "your_password"))

data_type_query = """
MATCH (d:DataType)
WHERE d.name IS NOT NULL
SET d.rearranged = 
  CASE d.name
    WHEN "Opportunity Atlas" THEN "Linked Administrative Data"
    WHEN "International Panel Data" THEN "Panel/Longitudinal Surveys"
    WHEN "University/Institution Data" THEN "Others_DataType"
    WHEN "Pseudo-Panel/Household Budget Survey" THEN "National Survey Data"
    WHEN "Archival/Historical Data" THEN "Administrative/Registry Data"
    WHEN "Big Data" THEN "Others_DataType"
    ELSE d.name
  END
"""

measure_query = """
MATCH (m:Measure)
WHERE m.name IS NOT NULL
SET m.rearranged = 
  CASE m.name
    WHEN "Multigenerational Measures" THEN "Others_Measure"
    WHEN "Non‚Äêparametric Approaches" THEN "Others_Measure"
    ELSE m.name
  END
"""

# rq_type_query = """
# MATCH (r:RqType)
# WHERE r.name IS NOT NULL
# SET r.rearranged =
#     CASE r.name
#         WHEN "Others" THEN "Others_RqType"
#         ELSE r.name
#     END;
# """

with driver.session() as session:
    session.run(data_type_query)
    session.run(measure_query)
    # session.run(rq_type_query)

driver.close()


# Graph Viz

In [68]:
# Graph Data:
df_feature_edges = pd.read_csv(f'{path}/results/weights_properties_citations_noTypeYear.csv')
# df_feature_edges = pd.read_csv(f'{path}/results/weights_properties_citations_noTypeYear_rearranged-values.csv')
# With the new values with the titles in the prompt:
# df_feature_edges = pd.read_csv(f'{path}/results/weights_properties_citations_noTypeYear_titled-values.csv')

In [69]:
def scale_weight(w, min_w, max_w, min_thick=1, max_thick=8):
    if max_w == min_w:
        return (min_thick + max_thick) / 2
    return min_thick + (w - min_w) / (max_w - min_w) * (max_thick - min_thick)



color_labels = {
                # Measure Categories
                "Regression‚Äêbased Measures" : "#B5DBA5",
                "Rank‚Äêbased Measures": "#B5DBA5",
                "Transition Matrix / Probability Measures": "#B5DBA5",
                "Absolute Mobility Measures": "#B5DBA5",
                "Multigenerational Measures": "#B5DBA5",
                "Decomposition / Structural Approaches": "#B5DBA5",
                "Non‚Äêparametric Approaches": "#B5DBA5",
                "Others_Measure": "#B5DBA5",
                
                # Data Categories
                "Panel/Longitudinal Surveys": "#339999",
                "Administrative/Registry Data": "#339999",
                "National Survey Data": "#339999",
                "Opportunity Atlas": "#339999",
                "Natural/Experimental Data": "#339999",
                "Linked Administrative Data": "#339999",
                "International Panel Data": "#339999",
                "Rich List Data": "#339999",
                "University/Institution Data": "#339999",
                "Pseudo-Panel/Household Budget Survey": "#339999",
                "Archival/Historical Data": "#339999",
                "Big Data": "#339999",
                "No dataset": "#339999",
                "Others_DataType": "#339999",
                
                # RQ Categories
                "Measurement and Methodological Advances": "#F4A988",
                "Empirical Estimates and Determinants": "#F4A988",
                "Policy, Institutional, and Geographic Impacts": "#F4A988",
                "Intergenerational Wealth Mobility and Inheritance": "#F4A988",
                "Demographic Differences in Mobility (Race, Gender, etc.)" : "#F4A988",
                "Mobility and Non-Income Outcomes (Health, Wellbeing, etc.)": "#F4A988",
                "Theoretical and Structural Models": "#F4A988",
                "Perceptions of Mobility and Attitudes": "#F4A988",
                "Others_RqType": "#F4A988"}

In [None]:
import plotly.graph_objects as go
import networkx as nx
import math

# Build the graph
G = nx.DiGraph()
for _, row in df_feature_edges.iterrows():
    G.add_edge(row['from_val'], row['to_val'], weight=row['weight'])

# Layout
pos = nx.spring_layout(G, seed=42, k=5 / math.sqrt(G.order()))

# --- CENTRALITY METRICS ---
betweenness = nx.betweenness_centrality(G)
degree = dict(G.degree())

# How many top central nodes to show
top_k = 7 # total = 22

# Top-k nodes by betweenness centrality
top_betweenness_nodes = sorted(betweenness, key=betweenness.get, reverse=True)[:top_k]
# Top-k nodes by degree centrality
top_degree_nodes = sorted(degree, key=degree.get, reverse=True)[:top_k]

# --- GEOMETRIC CENTER ZONE ---
center_radius = 0.115
geometric_center_nodes = [node for node, (x, y) in pos.items() if abs(x) <= center_radius and abs(y) <= center_radius]

# --- CREATE SUBGRAPH FUNCTION ---
def make_subgraph_plot(subgraph_nodes_ordered, centrality_dict, title):
    G_sub = G.subgraph(subgraph_nodes_ordered).copy()
    pos_sub = {n: pos[n] for n in G_sub.nodes()}
    
    edge_weights = [G_sub[u][v]['weight'] for u, v in G_sub.edges()]
    min_w, max_w = min(edge_weights, default=1), max(edge_weights, default=1)
    scaled_widths = [scale_weight(w, min_w, max_w) for w in edge_weights]
    
    # Edge traces
    edge_traces = []
    for (u, v), width in zip(G_sub.edges(), scaled_widths):
        edge_traces.append(go.Scatter(
            x=[pos_sub[u][0], pos_sub[v][0], None],
            y=[pos_sub[u][1], pos_sub[v][1], None],
            mode='lines',
            line=dict(width=width, color='gray'),
            showlegend=False,
        ))

    # Node traces with correct rank labels
    node_traces = []
    for i, node in enumerate(subgraph_nodes_ordered):
        x, y = pos_sub[node]
        rank_label = str(i + 1)  # ranked order (1 = highest centrality, or closest to center)
        node_traces.append(go.Scatter(
            x=[x],
            y=[y],
            mode='markers+text',
            text=[rank_label],
            textposition='middle center',
            textfont=dict(size=24, color='white'),
            marker=dict(
                size=30,
                color=color_labels.get(node, '#cccccc'),
                line=dict(width=2, color=color_labels.get(node, '#cccccc')),            ),
            hoverinfo='text',
            name=node,
            showlegend=True
        ))

    return go.Figure(data=edge_traces + node_traces, layout=go.Layout(
        title=title,
        showlegend=True,
        hovermode='closest',
        plot_bgcolor='rgba(255,255,255,0)',
        margin=dict(b=20, l=5, r=5, t=40),
        font=dict(size=16, color='black'),
        # font_family="Times New Roman",
        font_family = "Avenir",
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False)
    ))



# --- DISPLAY THREE PLOTS ---
# Sort geometric center nodes by distance to origin
geom_sorted = sorted(geometric_center_nodes, key=lambda n: pos[n][0]**2 + pos[n][1]**2)

fig_geom = make_subgraph_plot(geom_sorted, {}, 'Geometric Center Nodes')
fig_betw = make_subgraph_plot(top_betweenness_nodes, betweenness, 'Top Betweenness Centrality Nodes')
fig_deg = make_subgraph_plot(top_degree_nodes, degree, 'Top Degree Centrality Nodes')

fig_geom.show()
fig_betw.show()
fig_deg.show()




In [125]:
timestamp = time.strftime("%Y-%m-%d_%H%M%S")
fig_geom.write_image(f'{path}/results/Paper/img/feature_geom_center_graph_{timestamp}.svg')
fig_betw.write_image(f'{path}/results/Paper/img/feature_top_betweenness_graph_{timestamp}.svg')
fig_deg.write_image(f'{path}/results/Paper/img/feature_top_degree_graph_{timestamp}.svg')

In [115]:
# Build the graph
G = nx.DiGraph()
for _, row in df_feature_edges.iterrows():
    G.add_edge(row['from_val'], row['to_val'], weight=row['weight'])

# Layout
pos = nx.spring_layout(G, seed=42, k=5 / math.sqrt(G.order()))

# Edge thickness based on weight
edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
min_w, max_w = min(edge_weights), max(edge_weights)

scaled_widths = [scale_weight(w, min_w, max_w) for w in edge_weights]


edge_traces = []
for (u, v), width in zip(G.edges(), scaled_widths):
    edge_traces.append(go.Scatter(
        x=[pos[u][0], pos[v][0], None],
        y=[pos[u][1], pos[v][1], None],
        mode='lines',
        line=dict(width=width, color='gray'),
        showlegend=False,
        name=f'{u} ‚Üí {v}',
        hoverinfo='text',
        text=[f'{u} ‚Üí {v}<br>Weight: {G[u][v]["weight"]}']
    ))

# Map node labels to color
node_labels = list(G.nodes())
x_pos = [pos[node][0] for node in node_labels]
y_pos = [pos[node][1] for node in node_labels]


# Create individual node traces with legends
node_traces = []
for i, label in enumerate(node_labels):
    node_traces.append(go.Scatter(
        x=[x_pos[i]],
        y=[y_pos[i]],
        mode='markers',
        name=label,
        marker=dict(
            size=30,
            color=color_labels[label],
            line=dict(width=2, color=color_labels[label])
        ),
        hoverinfo='text',
        text=[label],
        showlegend=True
    ))

# Final plot
fig = go.Figure(data=edge_traces + node_traces, layout=go.Layout(
    showlegend=False,
    hovermode='closest',
    # paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(b=20, l=5, r=5, t=40),
    font=dict(size=16, color='black'),
    font_family="Times New Roman",
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False)
))

fig.show()


In [None]:
# fig.write_image(f'{path}/results/Paper/img/feature_citation_graph_{timestamp}.svg')

In [29]:
citations_617 = pd.read_csv(f'{path}/results/citations_count_617only_with_ranks.csv')
citations_617['re_category_1'] = citations_617['category_1'].replace(re_methods)
citations_617['re_category_2'] = citations_617['category_2'].replace(re_methods)
citations_617['re_data_cat'] = citations_617['data_cat'].replace(re_dttype)
citations_617['re_rq_cat'] = citations_617['rq_cat'].replace(re_rqtype)

In [30]:
citations_617.head()

Unnamed: 0,id,citations_count,title,data_cat,rq_cat,category_1,category_2,citation_rank,cited_by_count,citation_rank_617,re_category_1,re_category_2,re_data_cat,re_rq_cat
0,https://openalex.org/W2105584013,169,Where is the land of Opportunity? The Geograph...,Linked Administrative Data,Empirical Estimates and Determinants,Rank‚Äêbased Measures,Transition Matrix / Probability Measures,1.0,2341.0,1.0,Rank‚Äêbased Measures,Transition Matrix / Probability Measures,Linked Administrative Data,Empirical Estimates and Determinants
1,https://openalex.org/W2022256544,150,An Equilibrium Theory of the Distribution of I...,No dataset,Theoretical and Structural Models,Decomposition / Structural Approaches,Decomposition / Structural Approaches,2.0,2152.0,2.0,Decomposition / Structural Approaches,Decomposition / Structural Approaches,No dataset,Theoretical and Structural Models
2,https://openalex.org/W2122216841,131,"Income Inequality, Equality of Opportunity, an...",International Panel Data,Empirical Estimates and Determinants,Regression‚Äêbased Measures,,3.0,1529.0,3.0,Regression‚Äêbased Measures,,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
3,https://openalex.org/W2091916425,112,Fortunate Sons: New Estimates of Intergenerati...,Administrative/Registry Data,Empirical Estimates and Determinants,Regression‚Äêbased Measures,,6.0,654.0,4.0,Regression‚Äêbased Measures,,Administrative/Registry Data,Empirical Estimates and Determinants
4,https://openalex.org/W3093752946,98,Trends in Intergenerational Income Mobility,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,Regression‚Äêbased Measures,12.0,445.0,5.0,Regression‚Äêbased Measures,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants


In [32]:
import plotly.graph_objects as go
import pandas as pd

# Define the variables you want dropdown control for
variables = ['re_category_1', 're_category_2', 're_data_cat', 're_rq_cat']

# Bin citation rank
citations_617['percentile_bin'] = pd.qcut(
    citations_617['citation_rank_617'],
    q=10,
    labels=[f"P{i*10}-{(i+1)*10}" for i in range(10)],
    duplicates='drop'
)

# Create color scale
custom_colorscale = [
    [0.0, 'lightgray'],
    [0.000001, 'rgb(68,1,84)'],
    [1.0, 'rgb(253,231,37)']
]

# Store all heatmap traces
traces = []
buttons = []

for i, variable in enumerate(variables):
    pivot_table = (
        citations_617
        .groupby([variable, 'percentile_bin'], observed=True)['citations_count']
        .sum()
        .unstack(fill_value=0)
    )
    pivot_table = pivot_table / pivot_table.values.sum()

    trace = go.Heatmap(
        z=pivot_table.values,
        x=pivot_table.columns.astype(str),
        y=pivot_table.index,
        colorscale=custom_colorscale,
        zmin=0.00001,
        zmax=0.05,  # or pivot_table.values.max() if you want adaptive range
        xgap=2,
        ygap=2,
        visible=(i == 0),  # Only first is visible initially
        colorbar=dict(title="Normalized Citations")
    )
    traces.append(trace)

    # Add dropdown button
    button = dict(
        label=variable,
        method='update',
        args=[
            {'visible': [j == i for j in range(len(variables))]},
            {'title': f'Total Citations by {variable} and Citation Rank Percentile',
             'yaxis': {'title': variable}}
        ]
    )
    buttons.append(button)

# Create the figure with all traces and the dropdown
fig = go.Figure(data=traces)

fig.update_layout(
    updatemenus=[
        dict(
            type='dropdown',
            showactive=True,
            buttons=buttons,
            x=1.05,
            xanchor='left',
            y=1.1,
            yanchor='top'
        )
    ],
    title=f"Total Citations by {variables[0]} and Citation Rank Percentile",
    xaxis_title="Citation Rank Percentile",
    yaxis_title=variables[0]
)

fig.show()


# GPT: Reassigning the categories (Consistency Check)

In [3]:
from openai import OpenAI
from dotenv import load_dotenv
# OpenAI
load_dotenv() 
openai_API_KEY = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_API_KEY)
gpt_model = "o3-mini" #o3-mini-2025-01-31

In [2]:
import re

def clean_abs(abs_text):
    abs_text = re.sub(r'[^\x00-\x7F]+', '', abs_text)  # Remove non-ASCII characters (like Korean)
    abs_text = re.sub(r'\bp\.\s*\d+\b', '', abs_text)  # Remove page numbers
    abs_text = re.sub(r'[A-Z][a-z]+(?:\s+(?:and|&)\s+[A-Z][a-z]+)*(?:,\s+[A-Z][a-z]+)*\s*\(\d{4}\)', '', abs_text)  # Remove parenthesis citation
    abs_text = re.sub(r'[A-Z][a-z]+(?:\s+(?:and|&)\s+[A-Z][a-z]+)*(?:,\s+[A-Z][a-z]+)*(?:,\s*\d{4})', '', abs_text)  # Remove text citation
    
    abs_text = re.sub(r'<[^>]+>', '', abs_text)  # Remove HTML tags
    abs_text = re.sub(r'\b(?:ABSTRACT|Abstract)\b', '', abs_text) # Remove the term 'ABSTRACT' or 'Abstract' (case-sensitive, preserving other variations)
    abs_text = re.sub(r'https?://\S+', '', abs_text)
    abs_text = re.sub(r'www\.\S+', '', abs_text)
    abs_text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', '', abs_text)
    abs_text = re.sub('intergenerational elasticity', 'IGE', abs_text, flags=re.IGNORECASE)
    # Normalize white spaces
    abs_text = re.sub(r'(?<!\w)[.,]+(?!\w)', '', abs_text)  # Remove isolated periods and commas
    abs_text = re.sub(r'\s+', ' ', abs_text).strip()
    return abs_text

In [6]:
from pydantic import BaseModel

class GetCat(BaseModel):
    category: int
    data_type: str

In [2]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')
df.shape

(617, 56)

In [8]:
def estimate_cost(prompt_toks, completion_toks, model="o3-mini"):
    pricing = {
        "o3-mini":{"prompt": 0.0011, "completion": 0.0044},  # pricing per 1K tokens
    }
    rate = pricing[model]
    return (prompt_toks / 1000) * rate["prompt"] + (completion_toks / 1000) * rate["completion"]

## Methods

In [None]:
import logging

cat = {}
estimated_costs = {}

for i, row in df.iterrows():
# for i, row in df[~df.index.isin(cat.keys())].iterrows():  # Only process rows without those already in 'cat'
    abs = row['abs'].strip()
    abs_clean = clean_abs(abs)
    print(f'Processing {row.id}')
    start_time = time.time()
    completion = openai_client.beta.chat.completions.parse(
                model= gpt_model,
                messages=[
                    { 
                    "role": "assistant",
                    "content": "You are a research assistant in the Social Sciences domain. I, the researcher, have an abstract of a paper that studied inter/multi generational wealth/income/earning mobility. I want you to categorize the measure(s) that the paper used based on the abstract."
                    
                    },
                    {
                        "role": "user", 
                        "content":f"""
                                ```Title: {row['title']}
                                                        
                                ```Abstracts
                                {abs_clean}
                                ```
                                First, recognize what measure(s) the papers used based on the abstracts. Here, 'measures' mean an estimate from a model or an equation that quantifies the inter/multi generational wealth/income/earning mobility.
                                Does it belong to any of the following categories?
                                1. Regression‚Äêbased Measures 
                                2. Rank‚Äêbased Measures
                                3. Transition Matrix / Probability Measures
                                4. Absolute Mobility Measures
                                5. Multigenerational Measures
                                6. Decomposition / Structural Approaches
                                7. Non‚Äêparametric Approaches
                                8. Others
                                
                                Return the category that the measure(s) belong to as a number, and the exact name of the measure(s) if possible. For example, if it belongs to '1', return also the name of the measure(s) such as 'Intergenerational Elasticity (IGE)'. 
                                Don't repeat the name of category here like 'Regression‚Äêbased Measures - IGE'.
                                If the measure(s) do not belong to any category, return '8' and the exact name of the measure.
                                """
                    }
                ],
                response_format=GetCat,
            )
    results = completion.choices[0].message.content
    tokens_prompt = completion.usage.prompt_tokens
    tokens_completion = completion.usage.completion_tokens
    cost = estimate_cost(tokens_prompt, tokens_completion, model=gpt_model)
    estimated_costs[i] = cost
    cat[i] = results   
        
logging.info('‚úÖ Done! The process time is:', time.time()-start_time)

Processing https://openalex.org/W3176622639
Processing https://openalex.org/W3176921257
Processing https://openalex.org/W3178385122
Processing https://openalex.org/W3181553951
Processing https://openalex.org/W3181973673
Processing https://openalex.org/W3182550406
Processing https://openalex.org/W3184422859
Processing https://openalex.org/W3185155858
Processing https://openalex.org/W3186457560
Processing https://openalex.org/W3189747191
Processing https://openalex.org/W3190097825
Processing https://openalex.org/W3190102834
Processing https://openalex.org/W3192903419
Processing https://openalex.org/W3195123941
Processing https://openalex.org/W3197901707
Processing https://openalex.org/W3198616739
Processing https://openalex.org/W3202153609
Processing https://openalex.org/W3202616165
Processing https://openalex.org/W3202722289
Processing https://openalex.org/W3204672706
Processing https://openalex.org/W3204735205
Processing https://openalex.org/W3207067564
Processing https://openalex.org/

In [None]:
cat_df = pd.DataFrame(cat.items(), columns=['id', 'result'])
cat_df[['method_cat', 'method_type']] = cat_df['result'].apply(lambda x: pd.Series(json.loads(x)))
cat_df = cat_df.drop(columns=['result'])
# cat_df.to_csv(f'{path}/results/methods_categories_617_withTitle.csv', index=False)

In [None]:
estimated_costs_df = pd.DataFrame(estimated_costs.items(), columns=['id', 'cost'])
# estimated_costs_df.to_csv(f'{path}/results/methods_estimated_costs_617_withTitle.csv', index=False)

In [51]:
print(estimated_costs_df['cost'].sum().round(3), 'Euros')

2.288 Euros


## Data Type

In [9]:
import logging

cat = {}
estimated_costs = {}

for i, row in df.iterrows(): 
# for i, row in df[~df.index.isin(cat.keys())].iterrows():  # Only process rows without those already in 'cat'
    abs = row['abs'].strip()
    abs_clean = clean_abs(abs)
    print(f'Processing {row.id}')
    start_time = time.time()
    completion = openai_client.beta.chat.completions.parse(
            model= gpt_model,
            messages=[
                { 
                "role": "assistant",
                "content": "You are a research assistant in the Social Sciences domain. I, the researcher, have an abstract of a paper that studied inter/multi generational wealth/income/earning mobility. I want you to categorize the datasets that the paper used based on the abstract."
                
                },
                {
                    "role": "user", 
                    "content":f""" 
                            ```Title: {row['title']}    
                                               
                            ```Abstracts
                            {abs_clean}
                            ```
                            First, recognize the types of datasets the papers used based on the abstracts. 
                            Does it belong to any of the following categories?
                            1.	Panel/Longitudinal Surveys
                            2.	Administrative/Registry Data
                            3.	National Survey Data
                            4.	Opportunity Atlas
                            5.	Natural/Experimental Data
                            6.	Linked Administrative Data
                            7.	International Panel Data
                            8.	Rich List Data
                            9.	University/Institution Data
                            10.	Pseudo-Panel/Household Budget Survey
                            11.	Archival/Historical Data
                            12.	Big Data
                            13.	No dataset
                            14.	Others
                            
                            Return the category that the type(s) belong to as a number, and the exact dataset name if possible. For example, if it belongs to '1', return also the name such as '2015 UK Longitudinal Survey on Wealth Mobility'. 
                            Don't repeat the name of category here like 'Panel/Longitudinal Surveys - 2015 UK Longitudinal Survey on Wealth Mobility'.
                            If the dataset does not belong to any category, return '14' and the exact dataset name.
                            If the paper does not use any dataset, return '13' and N/A as the exact dataset name.
                            """
                }
            ],
            response_format=GetCat,
        )
    results = completion.choices[0].message.content
    tokens_prompt = completion.usage.prompt_tokens
    tokens_completion = completion.usage.completion_tokens
    cost = estimate_cost(tokens_prompt, tokens_completion, model=gpt_model)
    estimated_costs[i] = cost
    cat[i] = results   
        
logging.info('‚úÖ Done! The process time is:', time.time()-start_time)

Processing https://openalex.org/W2564827629
Processing https://openalex.org/W2117407660
Processing https://openalex.org/W1515121029
Processing https://openalex.org/W1966505070
Processing https://openalex.org/W3041900243
Processing https://openalex.org/W2508091509
Processing https://openalex.org/W4393244952
Processing https://openalex.org/W3125905928
Processing https://openalex.org/W3208235548
Processing https://openalex.org/W2411230938
Processing https://openalex.org/W2912277864
Processing https://openalex.org/W3122728969
Processing https://openalex.org/W3166017557
Processing https://openalex.org/W2049884882
Processing https://openalex.org/W4289222991
Processing https://openalex.org/W1994031345
Processing https://openalex.org/W2460486083
Processing https://openalex.org/W3201233520
Processing https://openalex.org/W2164455384
Processing https://openalex.org/W1994583502
Processing https://openalex.org/W2148814768
Processing https://openalex.org/W2470832101
Processing https://openalex.org/

In [12]:
cat_df = pd.DataFrame(cat.items(), columns=['id', 'result'])
cat_df[['data_cat', 'data_type']] = cat_df['result'].apply(lambda x: pd.Series(json.loads(x)))
cat_df = cat_df.drop(columns=['result'])
cat_df.to_csv(f'{path}/results/datatypes_categories_617_withTitle.csv', index=False)

In [14]:
estimated_costs_df = pd.DataFrame(estimated_costs.items(), columns=['id', 'cost'])
estimated_costs_df.to_csv(f'{path}/results/datatypes_estimated_costs_617_withTitle.csv', index=False)

In [16]:
print(estimated_costs_df['cost'].sum().round(3), 'Euros')

2.14 Euros


## RQ Type

In [17]:
import logging

cat = {}
estimated_costs = {}

for i, row in df.iterrows(): 
# for i, row in df[~df.index.isin(cat.keys())].iterrows():  # Only process rows without those already in 'cat'
    abs = row['abs'].strip()
    abs_clean = clean_abs(abs)
    print(f'Processing {row.id}')
    start_time = time.time()
    completion = openai_client.beta.chat.completions.parse(
            model= gpt_model,
            messages=[
                { 
                "role": "assistant",
                "content": "You are a research assistant in the Social Sciences domain. I, the researcher, have an abstract of a paper that studied inter/multi generational wealth/income/earning mobility. I want you to categorize the research questions that the paper aimed to answer based on the abstract."
                
                },
                {
                    "role": "user", 
                    "content":f"""
                            ```Title: {row['title']}     
                                               
                            ```Abstracts
                            {abs_clean}
                            ```
                            First, recognize the types of research questions the papers aimed to answer based on the abstracts. 
                            Does it belong to any of the following categories?
                            1.	Measurement and Methodological Advances
                            2.	Empirical Estimates and Determinants
                            3.	Policy, Institutional, and Geographic Impacts
                            4.	Intergenerational Wealth Mobility and Inheritance
                            5.	Demographic Differences in Mobility (Race, Gender, etc.)
                            6.	Mobility and Non-Income Outcomes (Health, Wellbeing, etc.)
                            7.	Theoretical and Structural Models
                            8.	Perceptions of Mobility and Attitudes
                            9.	Others
                            
                            Return the category that the research questions belong to as a number, and the exact research question if possible. 
                            If the dataset does not belong to any category, return '9' and the exact research question.                            """
                }
            ],
            response_format=GetCat,
        )
    results = completion.choices[0].message.content
    tokens_prompt = completion.usage.prompt_tokens
    tokens_completion = completion.usage.completion_tokens
    cost = estimate_cost(tokens_prompt, tokens_completion, model=gpt_model)
    estimated_costs[i] = cost
    cat[i] = results   
        
logging.info('‚úÖ Done! The process time is:', time.time()-start_time)

Processing https://openalex.org/W2564827629
Processing https://openalex.org/W2117407660
Processing https://openalex.org/W1515121029
Processing https://openalex.org/W1966505070
Processing https://openalex.org/W3041900243
Processing https://openalex.org/W2508091509
Processing https://openalex.org/W4393244952
Processing https://openalex.org/W3125905928
Processing https://openalex.org/W3208235548
Processing https://openalex.org/W2411230938
Processing https://openalex.org/W2912277864
Processing https://openalex.org/W3122728969
Processing https://openalex.org/W3166017557
Processing https://openalex.org/W2049884882
Processing https://openalex.org/W4289222991
Processing https://openalex.org/W1994031345
Processing https://openalex.org/W2460486083
Processing https://openalex.org/W3201233520
Processing https://openalex.org/W2164455384
Processing https://openalex.org/W1994583502
Processing https://openalex.org/W2148814768
Processing https://openalex.org/W2470832101
Processing https://openalex.org/

In [20]:
cat_df = pd.DataFrame(cat.items(), columns=['id', 'result'])
cat_df[['rq_cat', 'rq_type']] = cat_df['result'].apply(lambda x: pd.Series(json.loads(x)))
cat_df = cat_df.drop(columns=['result'])
cat_df.to_csv(f'{path}/results/rqtypes_categories_617_withTitle.csv', index=False)

In [22]:
estimated_costs_df = pd.DataFrame(estimated_costs.items(), columns=['id', 'cost'])
estimated_costs_df.to_csv(f'{path}/results/datatypes_estimated_costs_617_withTitle.csv', index=False)

In [21]:
print(estimated_costs_df['cost'].sum().round(3), 'Euros')

2.14 Euros


# Difference check (Cohen's Kappa) 

In [19]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')

data_cat_num = {
    "Panel/Longitudinal Surveys": 1,
    "Administrative/Registry Data": 2, 
    "National Survey Data": 3,
    "Opportunity Atlas": 4,
    "Natural/Experimental Data": 5,
    "Linked Administrative Data": 6,
    "International Panel Data": 7,
    "Rich List Data": 8,
    "University/Institution Data": 9,
    "Pseudo-Panel/Household Budget Survey": 10,
    "Archival/Historical Data": 11,
    "Big Data": 12,
    "No dataset": 13,
    "Others": 14
}

rq_cat_num = {
    "Measurement and Methodological Advances": 1,
    "Empirical Estimates and Determinants": 2,
    "Policy, Institutional, and Geographic Impacts": 3,
    "Intergenerational Wealth Mobility and Inheritance": 4,
    "Demographic Differences in Mobility (Race, Gender, etc.)": 5,
    "Mobility and Non-Income Outcomes (Health, Wellbeing, etc.)": 6,
    "Theoretical and Structural Models": 7,
    "Perceptions of Mobility and Attitudes": 8,
    "Others": 9
}
    
    
    
df['data_cat_num'] = df['data_cat'].replace(data_cat_num).astype(int)
df['rq_cat_num'] = df['rq_cat'].replace(rq_cat_num).astype(int)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [76]:
from sklearn.metrics import cohen_kappa_score
import plotly.graph_objects as go

df2 = pd.read_csv(f'{path}/results/methods_categories_617_withTitle.csv')
print(f"‚ö†Ô∏è For index {df2[df2['method_cat'] == 12].index.values}, its value of {df2[df2['method_cat'] == 12]['method_cat'].values} will be changed to 1. \nGiven its description: {df2[df2['method_cat'] == 12]['method_type'].values}")
df2['method_cat'] = df2['method_cat'].replace({12:1}) 

rater1 = df['category_n1']
rater2 = df2['method_cat']
kappa = cohen_kappa_score(rater1, rater2)

fig = go.Figure()
fig.add_trace(go.Histogram(x=rater1, name='Rater 1', opacity=0.5))
fig.add_trace(go.Histogram(x=rater2, name='Rater 2', opacity=0.3))
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
    title=f'Histogram of Method Categories (Cohen\'s Kappa: {kappa:.2f})',
    xaxis_title='Method Category',
    yaxis_title='Count',
    barmode='overlay',
    bargap=0.1,
    bargroupgap=0.1,
    font=dict(size=16, color='black'),
    font_family="Times New Roman"
)


‚ö†Ô∏è For index [392], its value of [12] will be changed to 1. 
Given its description: ['Intergenerational Elasticity (IGE) (a regression‚Äêbased measure, Category 1) and rank‚Äêrank slope (a rank‚Äêbased measure, Category 2)']


In [None]:
# fig.write_image(f'{path}/results/Paper/img/methods_categories_CohenKapp_{timestamp}.svg')

In [130]:
df2 = pd.read_csv(f'{path}/results/datatypes_categories_617_withTitle.csv')
unusual = df2[df2['data_cat'] == 0]
print(f"These are unusual categories (IDs: {unusual['id'].values}): \n{unusual} \nWe will take the first values in the description.")

df2.loc[df2['id'] == 537, 'data_cat'] = 1
df2.loc[df2['id'] == 557, 'data_cat'] = 2

rater1 = df['data_cat_num']
rater2 = df2['data_cat']
kappa = cohen_kappa_score(rater1, rater2)

fig = go.Figure()
fig.add_trace(go.Histogram(x=rater1, name='Rater 1', opacity=0.5))
fig.add_trace(go.Histogram(x=rater2, name='Rater 2', opacity=0.3))
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    title=f'Histogram of DataType Categories (Cohen\'s Kappa: {kappa:.2f})',
    xaxis_title='DataType Category',
    yaxis_title='Count',
    barmode='overlay',
    bargap=0.1,
    bargroupgap=0.1,
    font=dict(size=16, color='black'),
    font_family="Times New Roman"
)

These are unusual categories (IDs: [537 557]): 
      id  data_cat                                          data_type
537  537         0  The paper uses two types of datasets. First, i...
557  557         0  2: rich administrative data; 1: Panel Study of... 
We will take the first values in the description.


In [None]:
# fig.write_image(f'{path}/results/Paper/img/datatype_categories_CohenKapp_{timestamp}.svg')

In [131]:
rater1 = df['rq_cat_num']
rater2 = pd.read_csv(f'{path}/results/rqtypes_categories_617_withTitle.csv')['rq_cat']
kappa = cohen_kappa_score(rater1, rater2)

fig = go.Figure()
fig.add_trace(go.Histogram(x=rater1, name='Rater 1', opacity=0.5))
fig.add_trace(go.Histogram(x=rater2, name='Rater 2', opacity=0.3))
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    title=f'Histogram of RQType Categories (Cohen\'s Kappa: {kappa:.2f})',
    xaxis_title='RQType Category',
    yaxis_title='Count',
    barmode='overlay',
    bargap=0.1,
    bargroupgap=0.1,
    font=dict(size=16, color='black'),
    font_family="Times New Roman"
)

In [None]:
# fig.write_image(f'{path}/results/Paper/img/rqtype_categories_CohenKapp_{timestamp}.svg')

`ü´® As expected, RQ cat is not very reliable`

In [20]:
m1 = pd.read_csv(f'{path}/results/methods_categories_617_withTitle.csv').rename(columns={
    'method_cat': 'method_cat_title_num',
    'method_type': 'method_type_title'
})

m1['method_cat_title_num'] = m1['method_cat_title_num'].replace({12:1})

m2 = pd.read_csv(f'{path}/results/datatypes_categories_617_withTitle.csv').rename(columns={
    'data_cat': 'data_cat_title_num',
    'data_type': 'data_type_title'
})
m2.loc[m2['id'] == 537, 'data_cat_title_num'] = 1
m2.loc[m2['id'] == 557, 'data_cat_title_num'] = 2

m3 = pd.read_csv(f'{path}/results/rqtypes_categories_617_withTitle.csv').rename(columns={
    'rq_cat': 'rq_cat_title_num',
    'rq_type': 'rq_type_title'
})

df = df.drop(columns=['id.1', 'index'])
df = pd.concat([df, m1.loc[:, m1.columns != 'id'],
                    m2.loc[:, m2.columns != 'id'],
                    m3.loc[:, m3.columns != 'id']], axis=1)


# Adding the newly assigned version of the properties

In [2]:
new_method = {1: 'Regression‚Äêbased Measures', 2: 'Rank‚Äêbased Measures', 
             3: 'Transition Matrix / Probability Measures', 4: 'Absolute Mobility Measures',
             5: 'Multigenerational Measures', 6: 'Decomposition / Structural Approaches',
             7: 'Non‚Äêparametric Approaches', 8: 'Others'}

new_rq = {1: 'Measurement and Methodological Advances', 2: 'Empirical Estimates and Determinants', 
          3: 'Policy, Institutional, and Geographic Impacts', 4: 'Intergenerational Wealth Mobility and Inheritance', 
          5: 'Demographic Differences in Mobility (Race, Gender, etc.)', 6: 'Mobility and Non-Income Outcomes (Health, Wellbeing, etc.)', 
          7: 'Theoretical and Structural Models', 8: 'Perceptions of Mobility and Attitudes', 9: 'Others'}


new_data = {1: 'Panel/Longitudinal Surveys', 2: 'Administrative/Registry Data',
            3: 'National Survey Data', 4: 'Opportunity Atlas',
            5: 'Natural/Experimental Data', 6: 'Linked Administrative Data',
            7: 'International Panel Data', 8: 'Rich List Data',
            9: 'University/Institution Data', 10: 'Pseudo-Panel/Household Budget Survey',
            11: 'Archival/Historical Data', 12: 'Big Data',
            13: 'No dataset', 14: 'Others'}


df['method_cat_title'] = df['method_cat_title_num'].astype(int).replace(new_method)
df['rq_cat_title'] = df['rq_cat_title_num'].astype(int).replace(new_rq)
df['data_cat_title'] = df['data_cat_title_num'].astype(int).replace(new_data)



NameError: name 'df' is not defined

In [None]:
df['method_cat_title'].unique(), df['data_cat_title'].unique()

(array(['Absolute Mobility Measures', 'Regression‚Äêbased Measures',
        'Transition Matrix / Probability Measures', 'Others',
        'Rank‚Äêbased Measures', 'Decomposition / Structural Approaches',
        'Multigenerational Measures', 'Non‚Äêparametric Approaches'],
       dtype=object),
 array(['Administrative/Registry Data', 'No dataset',
        'Panel/Longitudinal Surveys', 'Others',
        'Linked Administrative Data', 'National Survey Data', 'Big Data',
        'Natural/Experimental Data', 'Opportunity Atlas',
        'International Panel Data', 'Pseudo-Panel/Household Budget Survey',
        'Archival/Historical Data', 'University/Institution Data'],
       dtype=object))

In [None]:
# df.to_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned_withTitle.csv', index=False)

In [3]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned_withTitle.csv')

In [4]:
df2 = pd.read_csv(f'{path}/results/citations_between_ids_g2_exists(year)_0605_1617.csv')
len(df2)

2586

In [5]:
df2.head()

Unnamed: 0,from_id,to_id,weights
0,https://openalex.org/W2900408224,https://openalex.org/W2210747677,1
1,https://openalex.org/W2914094440,https://openalex.org/W2210747677,1
2,https://openalex.org/W3039606322,https://openalex.org/W2210747677,1
3,https://openalex.org/W3128702014,https://openalex.org/W2210747677,1
4,https://openalex.org/W4296277673,https://openalex.org/W2210747677,1


In [6]:
unique_ids = set(df2['from_id']).union(set(df2['to_id']))
len(unique_ids)

469

In [8]:
df.columns

Index(['id', 'title', 'year', 'doi', 'landing_page', 'abstract_inverted_index',
       'language', 'is_oa', 'oa_status', 'oa_link', 'abstract', 'abstract_sm',
       'authors_sm', 'domain', 'sort_gpt_1', 'sort_gpt_2', 'sort_gpt4o_1',
       'sort_gpt4o_2', 'cited_by', 'len_cited_by', 'ref_count',
       'cited_by_count', 'Q1', 'Q1_1', 'Q2', 'Q2_1', 'Q2_2', 'Q3', 'Q4', 'abs',
       'category_n1', 'measure', 'specified', 'relevant', 'category_n2',
       'measure_1', 'measure_2', 'category_3', 'file', 'Question1',
       'Question2', 'rq_cat', 'RQ', 'data_cat', 'data_type', 'type',
       'category_1', 'category_2', 'author_raw_names',
       'author_raw_affiliations', 'topic_display_names', 'topic_scores',
       'field_display_names', 'subfield_display_names', 'data_cat_num',
       'rq_cat_num', 'method_cat_title_num', 'method_type_title',
       'data_cat_title_num', 'data_type_title', 'rq_cat_title_num',
       'rq_type_title', 'method_cat_title', 'rq_cat_title', 'data_cat_title'],

In [15]:
# Instead of creating new nodes for each edge, we will merge the existing nodes with the new titles
# This approach is okay since we are not taking into account the second category of the method for the new ones.
df2 = df2.merge(
    df[['id', 'data_cat_title', 'method_cat_title', 'rq_cat_title', 'year', 'category_1', 'data_cat', 'rq_cat']],
    left_on='from_id', right_on='id', how='left'
).rename(columns={
    'data_cat_title': 'from_data_cat_title',
    'method_cat_title': 'from_method_cat_title',
    'rq_cat_title': 'from_rq_cat_title',
    'year': 'from_year',
    'category_1': 'from_method_cat',
    'data_cat': 'from_data_cat',
    'rq_cat': 'from_rq_cat'
}).drop(columns='id')

df2 = df2.merge(
    df[['id', 'data_cat_title', 'method_cat_title', 'rq_cat_title', 'year', 'category_1', 'data_cat', 'rq_cat']],
    left_on='to_id', right_on='id', how='left'
).rename(columns={
    'data_cat_title': 'to_data_cat_title',
    'method_cat_title': 'to_method_cat_title',
    'rq_cat_title': 'to_rq_cat_title',
    'year': 'to_year',
    'category_1': 'to_method_cat',
    'data_cat': 'to_data_cat',
    'rq_cat': 'to_rq_cat'
}).drop(columns='id')

len(df2)

2586

In [17]:
df2.head()

Unnamed: 0,from_id,to_id,weights,from_data_cat_title,from_method_cat_title,from_rq_cat_title,from_year,from_method_cat,from_data_cat,from_rq_cat,...,from_method_cat.1,from_data_cat.1,from_rq_cat.1,to_data_cat_title,to_method_cat_title,to_rq_cat_title,to_year,to_method_cat,to_data_cat,to_rq_cat
0,https://openalex.org/W2900408224,https://openalex.org/W2210747677,1,Linked Administrative Data,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2018,Regression‚Äêbased Measures,No dataset,Empirical Estimates and Determinants,...,Regression‚Äêbased Measures,No dataset,Empirical Estimates and Determinants,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
1,https://openalex.org/W2914094440,https://openalex.org/W2210747677,1,University/Institution Data,Others,Empirical Estimates and Determinants,2019,Others,University/Institution Data,Empirical Estimates and Determinants,...,Others,University/Institution Data,Empirical Estimates and Determinants,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
2,https://openalex.org/W3039606322,https://openalex.org/W2210747677,1,National Survey Data,Regression‚Äêbased Measures,Intergenerational Wealth Mobility and Inheritance,2020,Regression‚Äêbased Measures,National Survey Data,Intergenerational Wealth Mobility and Inheritance,...,Regression‚Äêbased Measures,National Survey Data,Intergenerational Wealth Mobility and Inheritance,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
3,https://openalex.org/W3128702014,https://openalex.org/W2210747677,1,Panel/Longitudinal Surveys,Others,Measurement and Methodological Advances,2021,Others,Panel/Longitudinal Surveys,Intergenerational Wealth Mobility and Inheritance,...,Others,Panel/Longitudinal Surveys,Intergenerational Wealth Mobility and Inheritance,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
4,https://openalex.org/W4296277673,https://openalex.org/W2210747677,1,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2022,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,...,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants


In [None]:
# df2.to_csv(f'{path}/results/citation_networks_617_withTitle_full.csv', index=False)

In [None]:
sets = ['method', 'data', 'rq']
for set in sets:
   grouped = df2.groupby([f'from_{set}_cat_title', f'to_{set}_cat_title']).size().reset_index(name='weight').sort_values(by='weight', ascending=False)
   grouped.to_csv(f'{path}/results/citation_network_{set}_withTitle_{timestamp}.csv', index=False)

In [None]:
# grouped = df2.groupby(['from_year', 'to_year']).size().reset_index(name='weights').sort_values(by='weights', ascending=False)
# grouped.to_csv(f'{path}/results/citation_network_years_{timestamp}.csv', index=False)

grouped = (
    df2.dropna(subset=['from_year', 'to_year'])  # eliminate bad merges
       .groupby(['from_year', 'to_year'])['weights']  # use original Neo4j weight
       .sum()
       .reset_index()
       .sort_values(by='weights', ascending=False)
)


In [None]:
grouped 

Unnamed: 0,from_year,to_year,weights
365,2024,2014,47
316,2022,2014,46
247,2019,2014,42
292,2021,2014,41
370,2024,2019,38
...,...,...,...
191,2017,1990,1
192,2017,1992,1
1,1981,1979,1
195,2017,2003,1


In [None]:
g1 = pd.read_csv(f"{path}/results/citations_between_years_g2_0605_1616.csv")
g1

Unnamed: 0,from_year,to_year,weights
0,2024,2014,47
1,2022,2014,46
2,2019,2014,42
3,2021,2014,41
4,2024,2019,38
...,...,...,...
382,2022,1990,1
383,2023,1990,1
384,2020,1997,1
385,2009,1999,1


In [None]:
merged = pd.merge(g1, grouped, on=['from_year', 'to_year'], how='outer').fillna(0)
merged['diff'] = merged['weights_x'] - merged['weights_y']

# Show mismatches
merged[merged['diff'] != 0]

Unnamed: 0,from_year,to_year,weights_x,weights_y,diff


## Extracting csv's for viz

In [None]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned_withTitle.csv')
df.columns

Index(['id', 'title', 'year', 'doi', 'landing_page', 'abstract_inverted_index',
       'language', 'is_oa', 'oa_status', 'oa_link', 'abstract', 'abstract_sm',
       'authors_sm', 'domain', 'sort_gpt_1', 'sort_gpt_2', 'sort_gpt4o_1',
       'sort_gpt4o_2', 'cited_by', 'len_cited_by', 'ref_count',
       'cited_by_count', 'Q1', 'Q1_1', 'Q2', 'Q2_1', 'Q2_2', 'Q3', 'Q4', 'abs',
       'category_n1', 'measure', 'specified', 'relevant', 'category_n2',
       'measure_1', 'measure_2', 'category_3', 'file', 'Question1',
       'Question2', 'rq_cat', 'RQ', 'data_cat', 'data_type', 'type',
       'category_1', 'category_2', 'author_raw_names',
       'author_raw_affiliations', 'topic_display_names', 'topic_scores',
       'field_display_names', 'subfield_display_names', 'data_cat_num',
       'rq_cat_num', 'method_cat_title_num', 'method_type_title',
       'data_cat_title_num', 'data_type_title', 'rq_cat_title_num',
       'rq_type_title', 'method_cat_title', 'rq_cat_title', 'data_cat_title'],

In [87]:
# swamp_cat1 = df.groupby(['year', 'method_cat_title']).size().reset_index(name='count').sort_values(by='year', ascending=True)
# swamp_cat1.to_csv(f'{path}/results/methods_title_over_years617.csv', index=False)
swamp_cat1 = df.groupby(['year', 'category_1']).size().reset_index(name='count').sort_values(by='year', ascending=True)


In [88]:
import plotly.express as px

fig = px.bar(swamp_cat1, 
             x='year', 
             y='count', 
            #  color='method_cat_title', 
            color = 'category_1',
             title='Method Categories Over Years (617 Studies)',
             labels={'year': 'Year', 'count': 'Count'},
             color_discrete_sequence=px.colors.qualitative.Pastel)

fig.show()

In [89]:
import plotly.express as px

# For Methods (Categories)
color_palette = {"Regression‚Äêbased Measures": "#B5DBA5",
                 "Rank‚Äêbased Measures": "#F4A988",
                 "Transition Matrix / Probability Measures": "#CB7729",
                 "Absolute Mobility Measures": "#C04E3F",
                 "Multigenerational Measures": "#FFC3C2",
                 "Decomposition / Structural Approaches": "#339999",
                 "Non‚Äêparametric Approaches": "#005F56",
                 "Others": "#F4E1A1"}

data_sub = swamp_cat1[swamp_cat1['year'] >= 2010].copy()
data_sub['relative_count'] = data_sub.groupby('year')['count'].transform(lambda x: x / x.sum())

fig = px.bar(data_sub, 
             x='year', 
             y='relative_count', 
            #  color='method_cat_title',
            color = 'category_1',
            #  title='Method Categories Over Years (617 Studies)',
             labels={'year': 'Year', 'count': 'Count'},
             color_discrete_map=color_palette)
fig.update_layout(
    plot_bgcolor='rgba(255,255,255,0)',
    paper_bgcolor='rgba(255,255,255,0)',
    xaxis_title='Year',
    yaxis_title='Relative Frequency (per year)',
    # legend_title_text='Method Category',
    showlegend=False,
    font=dict(size=16, color='black'),
    font_family="Times New Roman",
    barmode='stack'
)

fig.show()

In [90]:
# Compute 3-year moving average of absolute frequencies (count) for each method_cat_title
# data_sub_sorted = data_sub.sort_values(['method_cat_title', 'year'])
data_sub_sorted = swamp_cat1.sort_values(['category_1', 'year'])
data_sub_sorted['ma_3yr_abs'] = (
    data_sub_sorted
    # .groupby('method_cat_title')['count']
    .groupby('category_1')['count']
    .transform(lambda x: x.rolling(window=3, min_periods=1, center=True).mean())
)

data_sub_sorted = data_sub_sorted[data_sub_sorted['year'] >= 2010].copy()  # Filter for years >= 2010
# For each year, compute the sum of moving averages across all categories
year_ma_sum = data_sub_sorted.groupby('year')['ma_3yr_abs'].transform('sum')

# Convert to relative frequency per year
data_sub_sorted['ma_3yr_rel'] = data_sub_sorted['ma_3yr_abs'] / year_ma_sum * 100

fig = px.bar(
    data_sub_sorted,
    x='year',
    y='ma_3yr_rel',
    # color='method_cat_title',
    color='category_1',
    labels={'year': 'Year', 'ma_3yr_rel': '3-Year Moving Avg. %'},
    color_discrete_map=color_palette
)
fig.update_layout(
    plot_bgcolor='rgba(255,255,255,0)',
    xaxis_title='Year',
    yaxis_title='3-Year Moving Avg. % (per year)',
    showlegend=False,
    font=dict(size=16, color='black'),
    font_family="Times New Roman",
    barmode='stack'
)
fig.show()


In [None]:
# fig.write_image(f'{path}/results/Paper/img/methods_title_between2010-25_617_rollingavg.svg')

In [54]:
data = pd.read_csv(f'{path}/results/citation_networks_617_withTitle_full.csv')
data.head()

Unnamed: 0,from_id,to_id,weights,from_data_cat_title,from_method_cat_title,from_rq_cat_title,from_year,from_method_cat,from_data_cat,from_rq_cat,...,from_method_cat.1,from_data_cat.1,from_rq_cat.1,to_data_cat_title.1,to_method_cat_title.1,to_rq_cat_title.1,to_year.1,to_method_cat,to_data_cat,to_rq_cat
0,https://openalex.org/W2900408224,https://openalex.org/W2210747677,1,Linked Administrative Data,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2018,Regression‚Äêbased Measures,No dataset,Empirical Estimates and Determinants,...,Regression‚Äêbased Measures,No dataset,Empirical Estimates and Determinants,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
1,https://openalex.org/W2914094440,https://openalex.org/W2210747677,1,University/Institution Data,Others,Empirical Estimates and Determinants,2019,Others,University/Institution Data,Empirical Estimates and Determinants,...,Others,University/Institution Data,Empirical Estimates and Determinants,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
2,https://openalex.org/W3039606322,https://openalex.org/W2210747677,1,National Survey Data,Regression‚Äêbased Measures,Intergenerational Wealth Mobility and Inheritance,2020,Regression‚Äêbased Measures,National Survey Data,Intergenerational Wealth Mobility and Inheritance,...,Regression‚Äêbased Measures,National Survey Data,Intergenerational Wealth Mobility and Inheritance,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
3,https://openalex.org/W3128702014,https://openalex.org/W2210747677,1,Panel/Longitudinal Surveys,Others,Measurement and Methodological Advances,2021,Others,Panel/Longitudinal Surveys,Intergenerational Wealth Mobility and Inheritance,...,Others,Panel/Longitudinal Surveys,Intergenerational Wealth Mobility and Inheritance,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants
4,https://openalex.org/W4296277673,https://openalex.org/W2210747677,1,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2022,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,...,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Panel/Longitudinal Surveys,Regression‚Äêbased Measures,Empirical Estimates and Determinants,2017,Regression‚Äêbased Measures,Panel/Longitudinal Surveys,Empirical Estimates and Determinants


In [20]:
data_sub2 = data.groupby(['from_year', 'to_year', 'from_method_cat', 'to_method_cat']).size().reset_index(name='weights').sort_values(by='weights', ascending=False)
data_sub2

Unnamed: 0,from_year,to_year,from_method_cat,to_method_cat,weights
456,2018,2014,Regression‚Äêbased Measures,Rank‚Äêbased Measures,22
1053,2024,2014,Regression‚Äêbased Measures,Rank‚Äêbased Measures,19
542,2019,2014,Regression‚Äêbased Measures,Rank‚Äêbased Measures,16
532,2019,2013,Regression‚Äêbased Measures,Regression‚Äêbased Measures,14
450,2018,2013,Regression‚Äêbased Measures,Regression‚Äêbased Measures,13
...,...,...,...,...,...
488,2019,2002,Others,Regression‚Äêbased Measures,1
490,2019,2003,Non‚Äêparametric Approaches,Regression‚Äêbased Measures,1
492,2019,2003,Regression‚Äêbased Measures,Transition Matrix / Probability Measures,1
494,2019,2005,Non‚Äêparametric Approaches,Regression‚Äêbased Measures,1


In [94]:
# Use 'cited' or 'citing'
invest = 'from' # or 'to'
most_cited = data_sub2.groupby([f'to_year', f'{invest}_method_cat'], group_keys=False).agg({'weights': 'sum'}).reset_index()
most_cited['percentage'] = most_cited.groupby(f'to_year')['weights'].transform(lambda x: x / x.sum())
most_cited = most_cited.groupby(f'to_year').apply(lambda x: x.nlargest(3, 'weights')).reset_index(drop=True)
most_cited
most_cited.to_csv(f'{path}/results/citation_networks_617_most_cited_by_{invest}_cat_to_year.csv', index=False)





In [None]:
# # Create a mapping of (from_val, to_val) pairs with their associated weight, repeated per each row
# from_to_weights = data[
#     ['from_method_cat_title', 'to_method_cat_title',
#      'from_data_cat_title', 'to_data_cat_title',
#      'from_rq_cat_title', 'to_rq_cat_title', 'weights']
# ]

# # Now, reshape each pair (from, to) to long format with weight preserved
# pairs = []
# for f_col, t_col in [('from_method_cat_title', 'to_method_cat_title'),
#                      ('from_data_cat_title', 'to_data_cat_title'),
#                      ('from_rq_cat_title', 'to_rq_cat_title')]:
#     temp_df = data[[f_col, t_col, 'weights']].copy()
#     temp_df.columns = ['from_val', 'to_val', 'weight']
#     pairs.append(temp_df)

# # Combine all types into one long dataframe and aggregate
# long_weighted_df = pd.concat(pairs, ignore_index=True)

# # Group by (from_val, to_val) and sum the weights
# aggregated_df = long_weighted_df.groupby(['from_val', 'to_val'], as_index=False)['weight'].sum()
# aggregated_df


Unnamed: 0,from_val,to_val,weight
0,Absolute Mobility Measures,Absolute Mobility Measures,26
1,Absolute Mobility Measures,Decomposition / Structural Approaches,7
2,Absolute Mobility Measures,Multigenerational Measures,1
3,Absolute Mobility Measures,Non‚Äêparametric Approaches,1
4,Absolute Mobility Measures,Others,4
...,...,...,...
195,Transition Matrix / Probability Measures,Regression‚Äêbased Measures,85
196,Transition Matrix / Probability Measures,Transition Matrix / Probability Measures,12
197,University/Institution Data,Linked Administrative Data,2
198,University/Institution Data,No dataset,5


# also about the citing papers' methods over time?


In [9]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned_withTitle.csv')
df.columns

Index(['id', 'title', 'year', 'doi', 'landing_page', 'abstract_inverted_index',
       'language', 'is_oa', 'oa_status', 'oa_link', 'abstract', 'abstract_sm',
       'authors_sm', 'domain', 'sort_gpt_1', 'sort_gpt_2', 'sort_gpt4o_1',
       'sort_gpt4o_2', 'cited_by', 'len_cited_by', 'ref_count',
       'cited_by_count', 'Q1', 'Q1_1', 'Q2', 'Q2_1', 'Q2_2', 'Q3', 'Q4', 'abs',
       'category_n1', 'measure', 'specified', 'relevant', 'category_n2',
       'measure_1', 'measure_2', 'category_3', 'file', 'Question1',
       'Question2', 'rq_cat', 'RQ', 'data_cat', 'data_type', 'type',
       'category_1', 'category_2', 'author_raw_names',
       'author_raw_affiliations', 'topic_display_names', 'topic_scores',
       'field_display_names', 'subfield_display_names', 'data_cat_num',
       'rq_cat_num', 'method_cat_title_num', 'method_type_title',
       'data_cat_title_num', 'data_type_title', 'rq_cat_title_num',
       'rq_type_title', 'method_cat_title', 'rq_cat_title', 'data_cat_title'],

In [14]:
df[['Question1', 'Question2']].value_counts()

Question1  Question2
Yes        Yes          617
Name: count, dtype: int64

In [None]:
# import logging

# cat = {}
# estimated_costs = {}

# for i, row in df.iterrows():
# # for i, row in df[~df.index.isin(cat.keys())].iterrows():  # Only process rows without those already in 'cat'
#     abs = row['abs'].strip()
#     abs_clean = clean_abs(abs)
#     measure = row['method_cat_title']
#     print(f'Processing {row.id}')
#     start_time = time.time()
#     completion = openai_client.beta.chat.completions.parse(
#                 model= gpt_model,
#                 messages=[
#                     { 
#                     "role": "assistant",
#                     "content": "You are a research assistant in the Social Sciences domain. I, the researcher, have an abstract of a paper that studied inter/multi generational wealth/income/earning mobility."
                    
#                     },
#                     {
#                         "role": "user", 
#                         "content":f"""
#                                 ```Title: {row['title']}
                                                        
#                                 ```Abstracts
#                                 {abs_clean}
#                                 ```
#                                 Based on the ```Title and the ```Abstracts above, you recognized that this paper used {measure}.
#                                 Does this paper use this measure in the following way?
#                                 1. It used it alone to estimate the intergenerational wealth/income/earning mobility.
#                                 2. It used it to estimate something else
#                                 2. It used it in combination with other measures, and the paper specifies which measures. The name of the measures are: [list of measures]. Return None if the paper does not specify the name of the other measures.
#                                 2-1. Which measure(s) does it prefer? If you cannot learn from the abstract, return None.
#                                 3. Does it
                                
#                                 """
#                     }
#                 ],
#                 response_format=,
#             )
#     results = completion.choices[0].message.content
#     tokens_prompt = completion.usage.prompt_tokens
#     tokens_completion = completion.usage.completion_tokens
#     cost = estimate_cost(tokens_prompt, tokens_completion, model=gpt_model)
#     estimated_costs[i] = cost
#     cat[i] = results   
        
# logging.info('‚úÖ Done! The process time is:', time.time()-start_time)