In [19]:
import pandas as pd
import numpy as np
import os
from datetime import date
# `path` is the parent of the directory the notebook runs from; the data and
# results locations used further down are built from it.
path = os.path.dirname(os.getcwd())
# Date of this run, printed below.
today = date.today()
for banner in (
    f'üìÇ Current working directory: {path}',
    f'üíö Today is {today}',
):
    print(banner)

üìÇ Current working directory: /Users/serenekim/Desktop/PhD/meta-wealth_mobility
üíö Today is 2025-08-19


In [20]:
# For graph plotting
import sys, pathlib
sys.path.append(str(pathlib.Path.cwd().parent))
import scripts.graph as gr

# Neo4j Implementation of the Graph API

## Data Prep

In [2]:
# Load your CSV
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs.csv')
df.columns

Index(['id', 'title', 'year', 'doi', 'landing_page', 'abstract_inverted_index',
       'language', 'is_oa', 'oa_status', 'oa_link', 'abstract', 'abstract_sm',
       'authors_sm', 'domain', 'sort_gpt_1', 'sort_gpt_2', 'sort_gpt4o_1',
       'sort_gpt4o_2', 'cited_by', 'len_cited_by', 'ref_count',
       'cited_by_count', 'Q1', 'Q1_1', 'Q2', 'Q2_1', 'Q2_2', 'Q3', 'Q4', 'abs',
       'index', 'category_n1', 'measure', 'specified', 'relevant',
       'category_n2', 'measure_1', 'measure_2', 'category_3', 'file', 'id.1',
       'Question1', 'Question2', 'rq_cat', 'RQ', 'data_cat', 'data_type',
       'type', 'category_1', 'category_2', 'author_raw_names',
       'author_raw_affiliations', 'topic_display_names', 'topic_scores',
       'field_display_names', 'subfield_display_names'],
      dtype='object')

In [3]:
df[df['data_cat']==0][['data_cat', 'data_type']].values # This shouldn't be 0!

array([[0,
        'European Union Statistics on Income and Living Conditions survey: 3; European Community Household Panel: 7']],
      dtype=object)

In [4]:
# Recode the stray 0 in `data_cat` as 3, then repeat the lookup: the result
# should now be empty, confirming that no row is still coded 0.
df['data_cat'] = df['data_cat'].replace(0, 3)
df.loc[df['data_cat'] == 0, ['data_cat', 'data_type']].values

array([], shape=(0, 2), dtype=object)

In [5]:
# Numeric code -> label for the research-question category (`rq_cat`).
new_rq = {
    1: 'Measurement and Methodological Advances',
    2: 'Empirical Estimates and Determinants',
    3: 'Policy, Institutional, and Geographic Impacts',
    4: 'Intergenerational Wealth Mobility and Inheritance',
    5: 'Demographic Differences in Mobility (Race, Gender, etc.)',
    6: 'Mobility and Non-Income Outcomes (Health, Wellbeing, etc.)',
    7: 'Theoretical and Structural Models',
    8: 'Perceptions of Mobility and Attitudes',
    9: 'Others',
}

# Numeric code -> label for the dataset category (`data_cat`).
new_data = {
    1: 'Panel/Longitudinal Surveys',
    2: 'Administrative/Registry Data',
    3: 'National Survey Data',
    4: 'Opportunity Atlas',
    5: 'Natural/Experimental Data',
    6: 'Linked Administrative Data',
    7: 'International Panel Data',
    8: 'Rich List Data',
    9: 'University/Institution Data',
    10: 'Pseudo-Panel/Household Budget Survey',
    11: 'Archival/Historical Data',
    12: 'Big Data',
    13: 'No dataset',
    14: 'Others',
}

# Swap the codes for their labels and store both columns as categoricals.
for column, labels in (('rq_cat', new_rq), ('data_cat', new_data)):
    df[column] = df[column].replace(labels).astype('category')

df[['rq_cat', 'data_cat']].head(5)

Unnamed: 0,rq_cat,data_cat
0,Intergenerational Wealth Mobility and Inheritance,Linked Administrative Data
1,Empirical Estimates and Determinants,No dataset
2,Empirical Estimates and Determinants,No dataset
3,Measurement and Methodological Advances,Panel/Longitudinal Surveys
4,Empirical Estimates and Determinants,Others


In [6]:
# Give rows with no recorded study type the explicit label 'Not_specified'.
# NOTE(review): this relies on pandas matching a `None` key against missing
# values (NaN) in `replace` — confirm, or switch to `fillna('Not_specified')`.
df['type'] = df['type'].replace({None: 'Not_specified'})

In [7]:
df[['year', 'rq_cat', 'data_cat', 'type', 'category_1', 'category_2']].isna().sum()

year            0
rq_cat          0
data_cat        0
type            0
category_1      0
category_2    371
dtype: int64

In [19]:
# df.to_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv', index=False)

## Create the graph (Abstract-based properties)
⚠️ Neo4j stores every relationship with a direction, but you're free to treat edges as undirected by querying with `-[]-` or by projecting undirected copies for analytics. Keep attributes in a star around each `:Paper`; derive feature-to-feature links later. This keeps the graph lean, the semantics clear, and makes downstream analysis with GDS much simpler.

In [21]:
df = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')

In [22]:
df[['id','cited_by']].head()

Unnamed: 0,id,cited_by
0,https://openalex.org/W2564827629,"https://openalex.org/W2763922368, https://open..."
1,https://openalex.org/W2117407660,"https://openalex.org/W1966248378, https://open..."
2,https://openalex.org/W1515121029,"https://openalex.org/W1984539215, https://open..."
3,https://openalex.org/W1966505070,"https://openalex.org/W1577807578, https://open..."
4,https://openalex.org/W3041900243,"https://openalex.org/W4386370708, https://open..."


In [23]:
# Ids of the papers in this table; citing works that are not in it are dropped.
valid_ids = set(df['id'])

# One row per (cited paper, citing paper) pair:
#   target_id = the paper whose `cited_by` list was read,
#   source_id = one entry of that comma-separated list.
df_citations = (
    df.dropna(subset=['cited_by'])[['id', 'cited_by']]
    .assign(
        cited_by=lambda frame: frame['cited_by'].apply(
            lambda cell: [token.strip() for token in cell.split(',')]
        )
    )
    .explode('cited_by')
    .rename(columns={'id': 'target_id', 'cited_by': 'source_id'})
    .loc[lambda frame: frame['source_id'].isin(valid_ids)]
)

print(f'\nNumber of citations: {len(df_citations)}')
df_citations.head(10)



Number of citations: 2596


Unnamed: 0,target_id,source_id
0,https://openalex.org/W2564827629,https://openalex.org/W3123816425
0,https://openalex.org/W2564827629,https://openalex.org/W4289222991
0,https://openalex.org/W2564827629,https://openalex.org/W4386370708
0,https://openalex.org/W2564827629,https://openalex.org/W3163153651
0,https://openalex.org/W2564827629,https://openalex.org/W4393244952
0,https://openalex.org/W2564827629,https://openalex.org/W4407152405
0,https://openalex.org/W2564827629,https://openalex.org/W2947666646
0,https://openalex.org/W2564827629,https://openalex.org/W3022704615
0,https://openalex.org/W2564827629,https://openalex.org/W4280580662
0,https://openalex.org/W2564827629,https://openalex.org/W4283657131


In [24]:
import pandas as pd
from neo4j import GraphDatabase

# Connect to Neo4j.
# Connection settings are read from the environment when present, so real
# credentials never need to be written into the notebook; the fallbacks are
# the values this cell used before.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7690"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "your_password"),
    ),
)

# 'Others' appears as a value in several columns; rename it per column so each
# column's catch-all category has its own distinct name.
df['category_1'] = df['category_1'].replace({'Others': 'Others_Measure'})
df['category_2'] = df['category_2'].replace({'Others': 'Others_Measure'})
df['data_cat'] = df['data_cat'].replace({'Others': 'Others_DataType'})
df['rq_cat'] = df['rq_cat'].replace({'Others': 'Others_RqType'})


# One uniqueness constraint per feature label, on the `name` property that
# create_graph uses as its MERGE key.
with driver.session() as session:
    for label in ("Year", "Measure", "DataType", "RqType", "Type"):
        session.run(
            f"CREATE CONSTRAINT IF NOT EXISTS FOR (n:{label}) REQUIRE n.name IS UNIQUE"
        )

def safe_str(val):
    """Normalise a cell value to a stripped string, or None when it is empty.

    Missing values (as judged by ``pd.isna``) and values whose stripped,
    lower-cased text is "", "nan" or "none" all become None.
    """
    if pd.isna(val):
        return None
    text = str(val).strip()
    return None if text.lower() in {"", "nan", "none"} else text

def create_graph(tx, row):
    """Write one paper, its feature nodes and their relationships to Neo4j.

    Args:
        tx: Transaction-like object; only ``tx.run(query, params)`` is called.
        row: Mapping-like record (e.g. a DataFrame row) providing ``id``,
            ``year``, ``category_1``, ``category_2``, ``data_cat``,
            ``rq_cat`` and ``type``.

    Returns:
        None. A row lacking an id, year, data type, research-question type or
        study type is reported with a ``[SKIP]`` message and nothing is
        written for it. Measures are optional: each of ``category_1`` /
        ``category_2`` is only written when it has a value.
    """
    # Sanitize all inputs
    article_id = safe_str(row.get("id"))
    year = int(row["year"]) if not pd.isna(row["year"]) else None
    measure1 = safe_str(row.get("category_1"))
    measure2 = safe_str(row.get("category_2"))
    datatype = safe_str(row.get("data_cat"))
    rqtype = safe_str(row.get("rq_cat"))
    # Not called `type`, so the builtin is not shadowed inside this function.
    study_type = safe_str(row.get("type"))

    # Skip the row if any required values are None
    if not all([article_id, year, datatype, rqtype, study_type]):
        print(f"[SKIP] Missing required data in row: {row.get('id')}")
        return  # skip this row

    # Part of the query that every accepted row needs.
    query = """
    MERGE (p:Paper {id: $article_id})
    MERGE (y:Year {name: $year})
    MERGE (ds:DataType {name: $datatype})
    MERGE (rq:RqType {name: $rqtype})
    MERGE (tp:Type {name: $type})

    // Supernodes
    MERGE (p)-[:HAS_YEAR]->(y)
    MERGE (p)-[:HAS_TYPE]->(tp)
    MERGE (p)-[:HAS_DATATYPE]->(ds)
    MERGE (p)-[:HAS_RQTYPE]->(rq)

    // Properties
    MERGE (y)-[:PERFORMS {article_id: $article_id}]->(tp)
    MERGE (ds)-[:TO_ANSWER {article_id: $article_id}]->(rq)
    MERGE (rq)-[:IS_ANSWERED_IN {article_id: $article_id}]->(y)
    """

    # Neo4j rejects a MERGE whose key property is null, so the clauses for a
    # measure are appended only when that measure is present. Previously the
    # first measure was always merged, even when it was missing.
    for alias, param, value in (("m1", "measure1", measure1), ("m2", "measure2", measure2)):
        if value:
            query += (
                f"\n    MERGE ({alias}:Measure {{name: ${param}}})"
                f"\n    MERGE (p)-[:HAS_MEASURE]->({alias})"
                f"\n    MERGE (tp)-[:USES {{article_id: $article_id}}]->({alias})"
                f"\n    MERGE ({alias})-[:APPLIED_WITH {{article_id: $article_id}}]->(ds)"
            )

    tx.run(query, {
        "year": year,
        "measure1": measure1,
        "measure2": measure2,
        "datatype": datatype,
        "rqtype": rqtype,
        "type": study_type,
        "article_id": article_id,
    })
    

# The driver is closed in `finally`, so a failure part-way through the load
# does not leave the connection open.
try:
    # Loop through DataFrame and create graph (one write transaction per row)
    with driver.session() as session:
        for _, row in df.iterrows():
            session.execute_write(create_graph, row)

    # Create the citation relationships. Only pairs whose two papers both
    # already exist as :Paper nodes are linked (MATCH, not MERGE, on the nodes).
    with driver.session() as session:
        query = """
    UNWIND $rows AS row
    MATCH (a:Paper {id: row.source_id})
    MATCH (b:Paper {id: row.target_id})
    MERGE (a)-[:CITES]->(b)
    """
        session.run(query, parameters={"rows": df_citations.to_dict("records")})
finally:
    driver.close()


# Graph Structure Analysis

* After adding the year nodes, it looks more concentrated.

In [7]:
df_edges = pd.read_csv(f"{path}/results/weights_properties_only_noTypeYear.csv")

fig = gr.plot_feature_graph_single(
        df_edges,
        title="Feature Citation Graph with Discrete Colours and Legend"
        )
fig.show()


### Zoom into the center?

In [13]:
# Edge weights between feature values. Loaded into `df_edges` rather than
# `df`, so the paper table held in `df` is not overwritten by this cell.
df_edges = pd.read_csv(f'{path}/results/weights_properties_only_noTypeYear.csv')

# Measures and data types kept in the zoomed view, spelled with ASCII hyphens.
zoom_in_values = ['Regression-based Measures', 'Rank-based Measures', 'Transition Matrix / Probability Measures', 'Absolute Mobility Measures',
                  'Others_Measure', 'Decomposition / Structural Approaches',
                  'Panel/Longitudinal Surveys', 'Administrative/Registry Data', 'National Survey Data', 'Linked Administrative Data',
                  'International Panel Data', 'Others_DataType']


def _ascii_hyphen(values):
    """Return `values` as strings with U+2010 hyphens replaced by ASCII '-'."""
    return values.astype(str).str.replace('\u2010', '-', regex=False)


# The measure labels in the data appear to be written with the Unicode hyphen
# U+2010, while this list previously mixed that spelling with an ASCII-hyphen
# one, which could never match. Comparing on a hyphen-normalised copy makes the
# filter independent of which hyphen the file uses.
in_zoom = (
    _ascii_hyphen(df_edges['from_val']).isin(zoom_in_values)
    & _ascii_hyphen(df_edges['to_val']).isin(zoom_in_values)
)
df_zoom = df_edges[in_zoom]
fig_zoom = gr.plot_feature_graph_single(df_zoom, title="Zoom-in on Core Properties")
fig_zoom.show()


Measures vs. Data types construction haha ^^;
⚠️ So actually, without considering citations this is not interesting at all: the weights are then just how many times certain nodes are mentioned, and the relationships between the nodes are just the "node type" relationships.

### Citations considered to create weights

In [17]:
df_feature_edges = pd.read_csv(f'{path}/results/weights_properties_citations_noTypeYear.csv')
fig_citation = gr.plot_feature_graph_single(df_feature_edges, title="Feature Citations", color_labels=gr.color_labels)
fig_citation.show()


In [None]:
# fig.write_image(f'{path}/results/Paper/img/feature_citation_graph.pdf')

# By setting a radius around the centroid (0,0)

In [None]:
df_feature_edges = pd.read_csv(f'{path}/results/weights_properties_citations_noTypeYear.csv')
fig_geom, fig_betw, fig_deg = gr.plot_centrality_subgraphs_single(df_feature_edges, gr.color_labels)
fig_geom.show()
fig_betw.show()
fig_deg.show()



FileNotFoundError: [Errno 2] No such file or directory: '/Users/serenekim/Desktop/PhD/meta-wealth_mobility/results/weights_properties_citations_noTypeYear_f1<>f2.csv'

In [None]:
# fig_geom.write_image(f'{path}/results/Paper/img/feature_geom_center_graph.pdf')
# fig_betw.write_image(f'{path}/results/Paper/img/feature_top_betweenness_graph.pdf')
# fig_deg.write_image(f'{path}/results/Paper/img/feature_top_degree_graph.pdf')

# Check the papers with the centroid properties

In [2]:
data = pd.read_csv(f'{path}/data_abstracts/true_mobility_studies_617_forKGs_cleaned.csv')

In [138]:
data.columns

Index(['id', 'title', 'year', 'doi', 'landing_page', 'abstract_inverted_index',
       'language', 'is_oa', 'oa_status', 'oa_link', 'abstract', 'abstract_sm',
       'authors_sm', 'domain', 'sort_gpt_1', 'sort_gpt_2', 'sort_gpt4o_1',
       'sort_gpt4o_2', 'cited_by', 'len_cited_by', 'ref_count',
       'cited_by_count', 'Q1', 'Q1_1', 'Q2', 'Q2_1', 'Q2_2', 'Q3', 'Q4', 'abs',
       'index', 'category_n1', 'measure', 'specified', 'relevant',
       'category_n2', 'measure_1', 'measure_2', 'category_3', 'file', 'id.1',
       'Question1', 'Question2', 'rq_cat', 'RQ', 'data_cat', 'data_type',
       'type', 'category_1', 'category_2', 'author_raw_names',
       'author_raw_affiliations', 'topic_display_names', 'topic_scores',
       'field_display_names', 'subfield_display_names'],
      dtype='object')

In [139]:
data['category_1'].unique()

array(['Absolute Mobility Measures', 'Regression‚Äêbased Measures',
       'Transition Matrix / Probability Measures', 'Others',
       'Rank‚Äêbased Measures', 'Decomposition / Structural Approaches',
       'Multigenerational Measures', 'Non‚Äêparametric Approaches'],
      dtype=object)

In [140]:
# Rows matching one specific data type / research question / first-measure
# combination; the mask is built once and reused for the count and the table.
core_mask = (
    (data['data_cat'] == "Panel/Longitudinal Surveys")
    & (data['rq_cat'] == "Empirical Estimates and Determinants")
    & (data['category_1'] == "Regression‚Äêbased Measures")
)
print(data[core_mask].shape)
data.loc[core_mask, ['id', 'title', 'year', 'data_cat', 'rq_cat', 'category_1', 'category_2']]

(47, 56)


Unnamed: 0,id,title,year,data_cat,rq_cat,category_1,category_2
9,https://openalex.org/W2411230938,Income Mobility in the United States,2016,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
20,https://openalex.org/W2148814768,Has the Intergenerational Transmission of Econ...,2005,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
24,https://openalex.org/W2802089178,Can cultural consumption increase future earni...,2018,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
46,https://openalex.org/W4238028113,Status Traps,2016,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
72,https://openalex.org/W4308132439,Socioeconomic Mobility of Return Migrants: Evi...,2022,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
73,https://openalex.org/W1020822704,"Poor dad, poor child? An investigation of inte...",2015,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
75,https://openalex.org/W1444704346,Intergenerational Earnings Elasticity Revisite...,2015,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
93,https://openalex.org/W1580804840,Is a College Degree Still the Great Equalizer?...,2011,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
114,https://openalex.org/W1963513809,Patterns of Intergenerational Mobility in Inco...,1992,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,
123,https://openalex.org/W1988669335,THE INTERGENERATIONAL CORRELATION BETWEEN CHIL...,1990,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,


In [141]:
# Same data type, different research question: count the matching rows once
# with the measure in `category_1` and once with it in `category_2`.
inheritance_panel = (
    (data['data_cat'] == "Panel/Longitudinal Surveys")
    & (data['rq_cat'] == "Intergenerational Wealth Mobility and Inheritance")
)
for measure_col in ('category_1', 'category_2'):
    print(data[inheritance_panel & (data[measure_col] == "Regression‚Äêbased Measures")].shape)

(47, 56)
(6, 56)


In [158]:
data['citation_rank'] = data['cited_by_count'].rank(method='first', ascending=False)
data['citation_rank']

0        4.0
1       22.0
2       34.0
3      132.0
4      217.0
       ...  
612    614.0
613    120.0
614    615.0
615    616.0
616    617.0
Name: citation_rank, Length: 617, dtype: float64

In [180]:
# Columns taken from the paper table and attached to each citation count.
paper_cols = ['id', 'title', 'data_cat', 'rq_cat', 'category_1', 'category_2', 'citation_rank', 'cited_by_count']

# Left join: every row of the citation-count file is kept, whether or not a
# matching paper id is found in `data`.
citations_617 = (
    pd.read_csv(f'{path}/results/citations_count_617only.csv')
    .rename(columns={'paper': 'id'})
    .merge(data[paper_cols], on='id', how='left')
)
# Rank 1 = highest `citations_count`; ties are broken by row order.
citations_617['citation_rank_617'] = citations_617['citations_count'].rank(method='first', ascending=False)
citations_617.head(10)

Unnamed: 0,id,citations_count,title,data_cat,rq_cat,category_1,category_2,citation_rank,cited_by_count,citation_rank_617
0,https://openalex.org/W2105584013,169,Where is the land of Opportunity? The Geograph...,Linked Administrative Data,Empirical Estimates and Determinants,Rank‚Äêbased Measures,Transition Matrix / Probability Measures,1.0,2341.0,1.0
1,https://openalex.org/W2022256544,150,An Equilibrium Theory of the Distribution of I...,No dataset,Theoretical and Structural Models,Decomposition / Structural Approaches,Decomposition / Structural Approaches,2.0,2152.0,2.0
2,https://openalex.org/W2122216841,131,"Income Inequality, Equality of Opportunity, an...",International Panel Data,Empirical Estimates and Determinants,Regression‚Äêbased Measures,,3.0,1529.0,3.0
3,https://openalex.org/W2091916425,112,Fortunate Sons: New Estimates of Intergenerati...,Administrative/Registry Data,Empirical Estimates and Determinants,Regression‚Äêbased Measures,,6.0,654.0,4.0
4,https://openalex.org/W3093752946,98,Trends in Intergenerational Income Mobility,Panel/Longitudinal Surveys,Empirical Estimates and Determinants,Regression‚Äêbased Measures,Regression‚Äêbased Measures,12.0,445.0,5.0
5,https://openalex.org/W2119587652,93,Cross-Country Differences in Intergenerational...,No dataset,Empirical Estimates and Determinants,Regression‚Äêbased Measures,,5.0,660.0,6.0
6,https://openalex.org/W2155246844,74,Is the United States Still a Land of Opportuni...,Linked Administrative Data,Empirical Estimates and Determinants,Rank‚Äêbased Measures,Rank‚Äêbased Measures,7.0,619.0,7.0
7,https://openalex.org/W2082418718,72,Intergenerational Income Mobility Among Daughters,Panel/Longitudinal Surveys,Intergenerational Wealth Mobility and Inheritance,Regression‚Äêbased Measures,,15.0,401.0,8.0
8,https://openalex.org/W3121694509,67,The Intergenerational Earnings and Income Mobi...,Linked Administrative Data,Intergenerational Wealth Mobility and Inheritance,Regression‚Äêbased Measures,Regression‚Äêbased Measures,17.0,299.0,9.0
9,https://openalex.org/W2288802977,58,CROSS‚ÄêCOUNTRY RANKINGS IN INTERGENERATIONAL MO...,No dataset,Empirical Estimates and Determinants,Regression‚Äêbased Measures,,13.0,423.0,10.0


In [None]:
# citations_617.to_csv(f'{path}/results/citations_count_617only_with_ranks.csv', index=False)

In [3]:
citations_617 = pd.read_csv(f'{path}/results/citations_count_617only_with_ranks.csv')
len(citations_617)

276

In [None]:
# Using go.Heatmap for more visualization options (instead of go.Histogram2d) ------------------------
import plotly.graph_objects as go

# Column whose categories form the heatmap rows.
variable = 'rq_cat'

# Split `citation_rank_617` into ten equal-sized bins; P0-10 holds the lowest
# rank values.
decile_labels = [f"P{i*10}-{(i+1)*10}" for i in range(10)]
citations_617['percentile_bin'] = pd.qcut(
    citations_617['citation_rank_617'],
    q=10,
    labels=decile_labels,
    duplicates='drop',
)
y_labels = citations_617[variable]

# Share of each category within every bin (each column sums to 1).
heatmap_data = pd.crosstab(y_labels, citations_617['percentile_bin'], normalize='columns')

heatmap = go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns.astype(str),
    y=heatmap_data.index,
    colorscale='Viridis',
    showscale=True,
    zmin=0,
    zmax=0.8,  # fixed colour range; shares above 0.8 all get the top colour
    hoverongaps=False,
    xgap=2,  # horizontal border space
    ygap=2,  # vertical border space
    colorbar=dict(title='Probability Density'),
)
fig = go.Figure(heatmap)

fig.update_layout(
    title=f"Distribution of {variable} across Citation Percentiles",
    xaxis_title="Citation Rank Percentile",
    yaxis_title=f"{variable}",
)

fig.show()


### Using Citations_between_617 as weights

In [70]:
import plotly.graph_objects as go
import pandas as pd

# Columns offered in the dropdown; one heatmap trace is built for each.
variables = ['category_1', 'category_2', 'data_cat', 'rq_cat']

# Split `citation_rank_617` into ten equal-sized bins.
decile_labels = [f"P{i*10}-{(i+1)*10}" for i in range(10)]
citations_617['percentile_bin'] = pd.qcut(
    citations_617['citation_rank_617'],
    q=10,
    labels=decile_labels,
    duplicates='drop',
)

# Colour scale: light gray at the very bottom of the range, then a ramp from
# dark purple to yellow.
custom_colorscale = [
    [0.0, 'lightgray'],
    [0.000001, 'rgb(68,1,84)'],
    [1.0, 'rgb(253,231,37)'],
]

# One trace and one dropdown button per variable, collected in matching order.
traces = []
buttons = []

for i, variable in enumerate(variables):
    # Citations summed per (category, bin) cell, then expressed as a share of
    # the table's grand total.
    summed = citations_617.groupby([variable, 'percentile_bin'], observed=True)['citations_count'].sum()
    pivot_table = summed.unstack(fill_value=0)
    pivot_table = pivot_table / pivot_table.values.sum()

    trace = go.Heatmap(
        z=pivot_table.values,
        x=pivot_table.columns.astype(str),
        y=pivot_table.index,
        colorscale=custom_colorscale,
        zmin=0.00001,
        zmax=0.05,  # or pivot_table.values.max() if you want adaptive range
        xgap=2,
        ygap=2,
        visible=(i == 0),  # only the first trace is shown initially
        colorbar=dict(title="Normalized Citations"),
    )
    traces.append(trace)

    # Selecting this entry shows trace `i` only and retitles the figure.
    visibility = [j == i for j in range(len(variables))]
    button = dict(
        label=variable,
        method='update',
        args=[
            {'visible': visibility},
            {'title': f'Total Citations by {variable} and Citation Rank Percentile',
             'yaxis': {'title': variable}},
        ],
    )
    buttons.append(button)

# Assemble the figure from all traces and attach the dropdown.
fig = go.Figure(data=traces)

dropdown = dict(
    type='dropdown',
    showactive=True,
    buttons=buttons,
    x=1.05,
    xanchor='left',
    y=1.1,
    yanchor='top',
)
fig.update_layout(
    updatemenus=[dropdown],
    title=f"Total Citations by {variables[0]} and Citation Rank Percentile",
    xaxis_title="Citation Rank Percentile",
    yaxis_title=variables[0],
)

fig.show()



In [None]:
# A static export renders the figure as it currently stands, i.e. only the
# trace that is visible at that moment. Writing the dropdown figure under the
# name of the leftover loop variable therefore produced a file named after the
# last variable while showing the first one. Export one PDF per variable
# instead, each with its own trace visible and a matching title.
if len(fig.data) == len(variables):
    for idx, var_name in enumerate(variables):
        fig_export = go.Figure(fig)  # copy, so the interactive figure is untouched
        for pos, heat in enumerate(fig_export.data):
            heat.visible = (pos == idx)
        fig_export.update_layout(
            title=f'Total Citations by {var_name} and Citation Rank Percentile',
            yaxis_title=var_name,
            updatemenus=[],  # a dropdown has no function in a static image
        )
        fig_export.write_image(f'{path}/results/Paper/img/heatmap_citation_rank_{var_name}.pdf')
else:
    # `fig` is not the multi-trace dropdown figure: export it as it is.
    fig.write_image(f'{path}/results/Paper/img/heatmap_citation_rank_{variable}.pdf')

In [7]:
# One comma-joined key per row: data type, research question, first measure.
combined_key = citations_617['data_cat'] + ',' + citations_617['rq_cat'] + ',' + citations_617['category_1']
citations_617['combined'] = combined_key.astype('category')

In [None]:
# One comma-joined key per row: data type, research question, first measure.
citations_617['combined'] = (
    citations_617['data_cat'].astype(str) + ',' +
    citations_617['rq_cat'].astype(str) + ',' +
    citations_617['category_1'].astype(str)
)

# Number of rows per (bin, combination) ...
combo_counts = (
    citations_617
    .groupby(['percentile_bin', 'combined'], observed=True)
    .size()
    .reset_index(name='count')
)

# ... and per bin, so that each count can be turned into a within-bin share.
bin_totals = (
    citations_617
    .groupby('percentile_bin', observed=True)
    .size()
    .reset_index(name='bin_total')
)

combo_counts = combo_counts.merge(bin_totals, on='percentile_bin')
combo_counts['probability'] = combo_counts['count'] / combo_counts['bin_total']

# Select top-N combinations per bin based on true probability
top_n = 5
top_combos = (
    combo_counts
    .sort_values(['percentile_bin', 'probability'], ascending=[True, False])
    .groupby('percentile_bin', observed=True)
    .head(top_n)
)

# Rows of the heatmap: every combination that is in the top N of at least one
# bin. Their probabilities are taken from the full table, so a combination
# that occurs in a bin without reaching that bin's top N keeps its real value.
# Pivoting `top_combos` directly would show such cells as 0; with this
# selection a 0 always means the combination does not occur in the bin.
selected_combos = combo_counts[combo_counts['combined'].isin(top_combos['combined'].unique())]

heatmap_data = pd.pivot_table(
    selected_combos,
    values='probability',
    index='combined',
    columns='percentile_bin',
    fill_value=0,
    observed=True
)


fig = go.Figure(
    go.Heatmap(
        z=heatmap_data.values,
        x=heatmap_data.columns.astype(str),
        y=heatmap_data.index,
        colorscale='Viridis',
        showscale=True,
        zmin=0,
        # zmax=heatmap_data.values.max(),
        zmax=0.25,  # fixed colour range; larger shares all get the top colour
        xgap=2,
        ygap=2,
        colorbar=dict(title='Probability')
    )
)

fig.update_layout(
    title=f"Top {top_n} Feature Combinations per Citation Percentile (True Probability)",
    xaxis_title="Citation Rank Percentile",
    yaxis_title="Feature Combination",
    # yaxis=dict(showticklabels=False)
)

fig.show()


In [31]:
combo_counts[(combo_counts['percentile_bin'] == 'P0-10') & (combo_counts['probability'] > 0)].sort_values('probability', ascending=False)

Unnamed: 0,percentile_bin,combined,count,bin_total,probability
17,P0-10,"Panel/Longitudinal Surveys,Empirical Estimates...",5,28,0.178571
12,P0-10,"No dataset,Empirical Estimates and Determinant...",4,28,0.142857
6,P0-10,"Linked Administrative Data,Empirical Estimates...",2,28,0.071429
11,P0-10,"National Survey Data,Intergenerational Wealth ...",2,28,0.071429
0,P0-10,"Administrative/Registry Data,Empirical Estimat...",1,28,0.035714
10,P0-10,"Linked Administrative Data,Policy, Institution...",1,28,0.035714
16,P0-10,"Panel/Longitudinal Surveys,Demographic Differe...",1,28,0.035714
15,P0-10,"Others,Empirical Estimates and Determinants,Ot...",1,28,0.035714
14,P0-10,"No dataset,Theoretical and Structural Models,D...",1,28,0.035714
13,P0-10,"No dataset,Intergenerational Wealth Mobility a...",1,28,0.035714


In [None]:
# fig.write_image(f'{path}/results/Paper/img/heatmap_top_combos_citation_rank.pdf')