A notebook to explore the availability of data for LinkBERT-style pretraining.

## Data and Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the pickled dataset, relative to the notebook's working directory.
# NOTE(review): the directory is named ED4RE_2503 but the file is ED4RE_2603 — confirm this is the intended path.
DATASET_PICKLE_PATH = "DATASET/ED4RE_2503/ED4RE_2603.pickle"

# NOTE(review): unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
df = pd.read_pickle(DATASET_PICKLE_PATH)

In [None]:

# --- Step 1: Calculate the number of references for each document ---
def get_reference_count(ref_entry):
    """Return how many references a single 'References' cell represents.

    Args:
        ref_entry: The raw cell value. Handled forms are a list of
            references, the sentinel string "no_references", a single
            unparsed string, or a missing value (None / NaN).

    Returns:
        int: ``len(ref_entry)`` for a list; 0 for the sentinel, for blank
        strings and for missing values; 1 for anything else, which is
        treated as one unparsed reference blob.
    """
    if isinstance(ref_entry, list):
        return len(ref_entry)

    # A missing cell holds no references. NaN is the only float that is not
    # equal to itself, so this detects it without needing numpy/pandas.
    if ref_entry is None or (isinstance(ref_entry, float) and ref_entry != ref_entry):
        return 0

    # Check the type before comparing with the sentinel so that non-string
    # values are never pushed through an `==` whose result may not be a bool.
    if isinstance(ref_entry, str) and (ref_entry == "no_references" or not ref_entry.strip()):
        return 0

    # Any remaining value (non-blank string or other non-list type) counts as one blob.
    return 1

# Derive the per-document reference count and store it as a new column
reference_counts = df['References'].map(get_reference_count)
df['num_references'] = reference_counts


# --- Step 2: Get high-level statistics ---

# Classify every 'References' cell into one of three structural groups:
#   * list          -> already split into individual references
#   * string blob   -> one non-blank free-text string that still needs parsing
#   * no references -> the "no_references" sentinel, blank strings, missing
#                      values and any other non-list, non-string value
# The sentinel and missing values are kept out of the "needs parsing" bucket
# because there is nothing in them to parse.
def _is_string_blob(ref_entry):
    """Return True for a non-blank string other than the "no_references" sentinel."""
    return (
        isinstance(ref_entry, str)
        and ref_entry != "no_references"
        and ref_entry.strip() != ""
    )


total_docs = len(df)


def _share(count):
    """Return count as a fraction of all documents; 0.0 when the frame is empty."""
    return count / total_docs if total_docs else 0.0


# astype(bool) keeps the masks boolean even when the frame has no rows
is_list_mask = df['References'].apply(lambda x: isinstance(x, list)).astype(bool)
is_string_blob_mask = df['References'].apply(_is_string_blob).astype(bool)

list_count = is_list_mask.sum()
string_blob_count = is_string_blob_mask.sum()
no_reference_count = total_docs - list_count - string_blob_count

print("="*60)
print("Analysis of Reference Column Structure")
print("="*60)
print(f"Total documents with potential references: {total_docs:,}")
print(f"Documents with references as a list (good format): {list_count:,} ({_share(list_count):.2%})")
print(f"Documents with references as a single string (needs parsing): {string_blob_count:,} ({_share(string_blob_count):.2%})")
print(f"Documents with no usable references (sentinel / blank / missing): {no_reference_count:,} ({_share(no_reference_count):.2%})")
print("\n")


# Now, let's get descriptive statistics on the 'num_references' column
print("="*60)
print("Descriptive Statistics for Number of References per Document")
print("="*60)
# The describe() output will be heavily influenced by the single-string blobs (value=1)
# so we'll show stats for both all data and just the list-formatted data.
print("--- Overall (including single-string blobs as 1) ---")
print(df['num_references'].describe())
print("\n")

# Filter for only the documents that had a list to get a cleaner distribution
df_lists_only = df[is_list_mask]
print("--- For List-Formatted References Only ---")
print(df_lists_only['num_references'].describe())
print("="*60)


# --- Step 3: Visualize the distribution ---

print("\nGenerating visualizations...")

# Set plot style
sns.set_theme(style="whitegrid")

# Figure 1: reference counts across every document, drawn on an explicit Axes
fig_all, ax_all = plt.subplots(figsize=(14, 7))
sns.histplot(data=df, x='num_references', bins=100, ax=ax_all)
ax_all.set_title('Distribution of Reference Count (All Documents)', fontsize=16)
ax_all.set_xlabel('Number of References (Single Strings are count=1)', fontsize=12)
ax_all.set_ylabel('Number of Documents (Log Scale)', fontsize=12)
# Log-scaled y-axis keeps sparsely populated bins visible next to the tall ones
ax_all.set_yscale('log')
plt.show()

# Figure 2: the same histogram restricted to list-formatted rows, which
# leaves out the spike at count=1 contributed by the string blobs
fig_lists, ax_lists = plt.subplots(figsize=(14, 7))
sns.histplot(data=df_lists_only, x='num_references', bins=10, kde=True, ax=ax_lists)
ax_lists.set_title('Distribution of Reference Count (List-Formatted Data Only)', fontsize=16)
ax_lists.set_xlabel('Number of References', fontsize=12)
ax_lists.set_ylabel('Number of Documents', fontsize=12)
# Linear y-axis on purpose: the shape of the list-only data is the point here.
# Switch to a log scale if this subset also turns out to be highly skewed.
ax_lists.set_xlim(left=0)  # anchor the x-axis at zero
plt.show()

Total documents with potential references: 171,880
Documents with references as a list (good format): 162,726 (94.67%)
Documents with references as a single string (needs parsing): 9,154 (5.33%)

This