In [None]:
import os
import numpy as np
from pathlib import Path
from tqdm import tqdm
import pickle

In [None]:
# Root directory that holds one sub-folder per paper.
dataset_path = Path("dataset_papers")
# sorted() returns a new list; list.sort() sorts in place and returns None,
# which is what the old `parent_folders = paper_folders.sort()` ended up storing.
paper_folders = sorted(os.listdir(dataset_path))
# Kept as an alias so any cell that still references the old name gets the sorted list.
parent_folders = paper_folders
print(len(paper_folders))

In [None]:
# Peek at the first paper folder: show its name, then the files it contains.
print(paper_folders[0])
files = os.listdir(dataset_path / paper_folders[0])
print(files)

In [None]:
# Print one raw .bbl file to see what the bibliography markup looks like.
# NOTE(review): the file name "icml07.bbl" is hard-coded, so this cell only works
# when the first paper folder contains a file with that name — confirm against the dataset.
# The file is opened with the platform default text encoding.
with open(dataset_path / paper_folders[0] / "icml07.bbl", 'r') as f:
    content = f.read()
    print(content)

In [None]:
# def strip_and_remove_end_punctuation(text):
#     """
#     Remove end punctuation from the text.
#     """
#     text = text.strip()
#     if text.endswith('.'):
#         return text[:-1]
#     return text

In [None]:
# from cleantext import clean

# def clean_sentence(text):
#     return clean(
#         text,
#         fix_unicode=True,
#         to_ascii=True,
#         lower=False,
#         no_line_breaks=True,
#         no_urls=True,
#         no_emails=True,
#         no_phone_numbers=True,
#         no_numbers=False,
#         no_digits=False,
#         no_currency_symbols=True,
#         no_punct=True,
#         replace_with_punct="",
#         replace_with_url="",
#         replace_with_email="",
#         replace_with_phone_number="",
#         replace_with_number="",
#         replace_with_digit="",
#         replace_with_currency_symbol=""
#     )


In [None]:
import re

# Regex patterns for LaTeX control words that are dropped outright.
# Each entry is a separate list item: the earlier version was missing a comma,
# which silently fused r"\\n" and r"\\textsc" into one pattern that never matched
# a lone \textsc. The negative lookahead makes a pattern match the whole control
# word only, so r"\\n" cannot eat the start of "\nu" and r"\\em" cannot eat "\emph".
phrases_to_remove = [
    r"\\emph(?![A-Za-z])",
    r"\\em(?![A-Za-z])",
    r"\\n(?![A-Za-z])",
    r"\\textsc(?![A-Za-z])",
]

# NOTE: this function shadows the built-in `format`; the name is kept because
# other cells call it.
def format(text):
    """Normalize a title-like string so two spellings of the same title compare equal.

    Steps, in order: drop the control words listed in ``phrases_to_remove``,
    delete LaTeX/markup characters, collapse whitespace runs to one space,
    strip trailing punctuation and whitespace, lowercase.

    The function is idempotent: ``format(format(s)) == format(s)``.

    Args:
        text: Raw string (for example a bibliography title or a title file's content).

    Returns:
        The normalized, lowercased string.
    """
    # Remove listed phrases
    for phrase in phrases_to_remove:
        text = re.sub(phrase, '', text)

    # Remove unwanted characters: { } ( ) \ $ # @ % ^ & * ! " ` '
    text = re.sub(r'[{}()\\$#@%^&*!"`\']', '', text)

    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove end punctuation (., ?, !, ;, ,, :, -, – or —) together with any
    # whitespace mixed into it. Whitespace is part of the class so that a
    # trailing newline or space can no longer shield a final period.
    text = re.sub(r'[\s.?!;,:\-–—]+$', '', text)

    return text.lower().strip()



def read_bbl_file(bbl_path):
    """Read a .bbl file as text, trying progressively more permissive encodings.

    UTF-8 is tried first, then cp1252. If both raise UnicodeDecodeError the
    file is decoded as latin-1, which assigns a character to every possible
    byte and therefore cannot fail. For that reason no further
    ``errors='ignore'`` fallback is needed (the previous one was unreachable).

    Args:
        bbl_path: Path (str or path-like) of the file to read.

    Returns:
        The file content as a string.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    for encoding in ('utf-8', 'cp1252'):
        try:
            with open(bbl_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    # Last resort: latin-1 decodes any byte sequence.
    with open(bbl_path, 'r', encoding='latin-1') as f:
        return f.read()

def extract_bib_entries_from_bbl(bbl_path):
    """Return the normalized title of every bibliography item in a .bbl file.

    The file is split into bibitem entries; each entry's body is split on the
    newblock macro into non-empty chunks. The title is taken from:

    * the second chunk when there are two or more chunks;
    * the text between the first pair of LaTeX quotes (two backticks ... two
      apostrophes) when there is a single chunk containing both markers;
    * the single chunk itself otherwise.

    Entries with no non-empty chunk are skipped. Every title is passed through
    ``format`` before being collected.

    Args:
        bbl_path: Path of the .bbl file.

    Returns:
        List of normalized title strings, in file order.
    """
    raw_text = read_bbl_file(bbl_path)

    item_matches = re.findall(
        r'(\\bibitem(?:\[[^\]]*\]){0,3}\{([^}]+)\}.*?)(?=\\bibitem|\Z|\\end\{thebibliography\})',
        raw_text,
        flags=re.DOTALL
    )

    titles = []

    # The citation key is captured by the regex but not used.
    for item_text, _cite_key in item_matches:
        # Drop the leading bibitem header (optional [..] labels plus {key}).
        item_body = re.sub(r'^\\bibitem(?:\[[^\]]*\]){0,3}\{[^}]+\}', '', item_text).strip()
        chunks = [piece.strip() for piece in re.split(r'\\newblock', item_body)]
        chunks = [piece for piece in chunks if piece]

        if not chunks:
            continue

        if len(chunks) >= 2:
            candidate = chunks[1]
        else:
            only_chunk = chunks[0]
            if '``' in only_chunk and "''" in only_chunk:
                # Both markers are present, so index 1 after the first split always exists.
                candidate = only_chunk.split('``')[1].split("''")[0]
            else:
                # Fallback: use the only chunk available.
                candidate = only_chunk

        titles.append(format(candidate))

    return titles


In [None]:
from pathlib import Path
# Build the path to a sample .bbl file inside the second paper folder.
# NOTE(review): the file name is hard-coded — confirm that folder contains it.
bbl_file_path = dataset_path / paper_folders[1] / "example_paper.bbl"

# Read it with the encoding-tolerant reader and print the raw text for inspection.
content = read_bbl_file(bbl_file_path)
print(content)

In [None]:
# Run the title extractor on the same sample file; the returned list is shown as the cell output.
extract_bib_entries_from_bbl(
    dataset_path / paper_folders[1] / "example_paper.bbl"
)

In [None]:
# Collect every .bib file in the dataset and record which paper folders have one.
bib_files = []
papers_with_bib = []
papers_without_bib = []
for folder in paper_folders:
    paper_files = list((dataset_path / folder).glob("*.bib"))
    bib_files.extend(paper_files)
    # Route the folder to the list matching whether any .bib file was found.
    bucket = papers_with_bib if paper_files else papers_without_bib
    bucket.append(folder)

In [None]:
# Number of paper folders with at least one .bib file, then the number with none.
print(len(papers_with_bib))
print(len(papers_without_bib))

In [None]:
# Spot-check one entry of the folder list.
# NOTE(review): index 100 looks arbitrary and raises IndexError with fewer than 101 folders — confirm.
paper_folders[100]

In [None]:
# Collect every .bbl file in the dataset and record which paper folders have one.
bbl_files = []
papers_with_bbl = []
papers_without_bbl = []
for folder in paper_folders:
    paper_files = list((dataset_path / folder).glob("*.bbl"))
    bbl_files.extend(paper_files)
    # Route the folder to the list matching whether any .bbl file was found.
    bucket = papers_with_bbl if paper_files else papers_without_bbl
    bucket.append(folder)

In [None]:
# Number of paper folders with at least one .bbl file, then the number with none.
print(len(papers_with_bbl))
print(len(papers_without_bbl))

In [None]:
# List the folders that have no .bbl file; these contribute no citations later on.
print(papers_without_bbl)

In [None]:
def clean_latex_text(text):
    """Strip braced LaTeX commands and flatten newlines.

    Every ``<backslash>name{...}`` construct is deleted together with its
    braced argument; commands without a braced argument are left untouched.
    Each run of newline characters then becomes one space, and leading and
    trailing whitespace is trimmed. Runs of spaces inside the text are not
    collapsed.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    without_commands = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)
    single_line = re.sub(r'\n+', ' ', without_commands)
    return single_line.strip()

In [None]:
# Build one record per paper folder: index, folder name, normalized title,
# abstract, and the list of (bbl file stem, normalized cited title) pairs.
papers = []
# Wrap the list itself (not enumerate) so tqdm knows the total and shows a real progress bar.
for i, folder in enumerate(tqdm(paper_folders)):
    paper_code = folder
    paper_dir = dataset_path / folder
    # A distinct name keeps the dataset-wide `bbl_files` list built earlier from being overwritten.
    paper_bbl_files = list(paper_dir.glob("*.bbl"))
    citations = []
    for file in paper_bbl_files:
        # Path.name is separator-agnostic; splitting str(path) on "/" breaks on Windows.
        # Everything before the first dot of the file name is used as the label.
        conference = file.name.split(".")[0]
        names_citations = extract_bib_entries_from_bbl(file)
        # The extractor already normalizes titles; the second pass is kept so the
        # stored values stay identical to what this cell produced before.
        citations += [(conference, format(name)) for name in names_citations]
    with open(paper_dir / "title.txt", 'r') as f:
        title = format(f.read())
    with open(paper_dir / "abstract.txt", 'r') as f:
        abstract = f.read().strip()
    papers.append({
        'index': i,
        'paper_code': paper_code,
        # Second normalization pass kept for the same reason as above.
        'title': format(title),
        'abstract': abstract,
        'citations': citations
    })



In [None]:
# Show the first paper record to sanity-check the structure built above.
papers[0]

In [None]:
# graph = []
# for i in tqdm(range(len(papers))):
#     for citations in papers[i]['citations']:
#         citation = citations[1]
#         for paper in papers:
#             if paper['title'] == citation:
#                 if(paper['index'] == papers[i]['index']):
#                     continue
#                 edge = (papers[i]['index'], paper['index'])
#                 graph.append(edge)
#                 # print(edge)
#                 break

# with open("graph.pkl", 'wb') as f:
#     pickle.dump(graph, f)

In [None]:
# Load the precomputed citation edge list from disk.
# NOTE(review): the only code that writes graph.pkl is the commented-out cell above,
# so a fresh Restart & Run All fails here unless the file already exists.
# pickle.load can execute arbitrary code from the file; only load a graph.pkl you produced yourself.
with open("graph.pkl", 'rb') as f:
    graph = pickle.load(f)

In [None]:
# Number of entries in the loaded edge list.
# NOTE(review): the trailing numbers look like counts recorded from earlier runs — confirm.
print(len(graph)) # 727, 1646, 1676, 1648, 17810, 18208, 18213

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

In [None]:
# Number of paper records; this is also the number of nodes added to the graph below.
len(papers)

In [None]:
# Build the directed citation graph: one node per paper index, then the loaded edges.
# presumably each edge is (citing index, cited index), per the commented-out builder — TODO confirm.
# NOTE(review): add_edges_from creates any endpoint that is not already a node, so an edge
# list built from a different `papers` list would silently add extra nodes.
G = nx.DiGraph()
G.add_nodes_from(range(len(papers)))
G.add_edges_from(graph)

In [None]:
# Draw the whole citation graph on a very large canvas (300 x 200 inches,
# saved at 100 dpi) so individual node labels remain legible when zoomed in.
plt.figure(figsize=(300, 200))
plt.title("Citation Graph")
plt.axis('off')
# spring_layout starts from random positions; a fixed seed makes the saved
# figure reproducible from run to run.
pos = nx.spring_layout(G, k=0.1, iterations=50, seed=42)
nx.draw(
    G, pos,
    node_size=30,
    with_labels=True,
    font_size=10,
    font_color='yellow'
)
plt.savefig("citation_graph.png", dpi=100, bbox_inches='tight')

In [None]:
# Basic size statistics of the citation graph.
print("Number of nodes in the graph:", G.number_of_nodes())
print("Number of edges in the graph:", G.number_of_edges())
# Isolated nodes have no incident edges at all (neither incoming nor outgoing).
isolated_nodes = list(nx.isolates(G))
print("Number of isolated nodes in the graph:", len(isolated_nodes))

In [None]:
# Per-node degree counts pulled out of the (node, degree) views as NumPy arrays.
degrees = np.array([count for _node, count in G.degree()])
in_degrees = np.array([count for _node, count in G.in_degree()])
out_degrees = np.array([count for _node, count in G.out_degree()])

average_degree = np.mean(degrees)
average_in_degree = np.mean(in_degrees)
average_out_degree = np.mean(out_degrees)

# Report each mean rounded to three decimals.
for label, average in (
    ("Average degree (3 decimal places):", average_degree),
    ("Average in-degree (3 decimal places):", average_in_degree),
    ("Average out-degree (3 decimal places):", average_out_degree),
):
    print(label, np.round(average, 3))

In [None]:
# Distribution of node degrees, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(degrees, bins=100, color='skyblue', edgecolor='black', ax=ax)

ax.set_title("Histogram of Node Degrees", fontsize=16)
ax.set_xlabel("Degree", fontsize=14)
ax.set_ylabel("Number of Nodes", fontsize=14)
ax.grid(visible=True, which="both", linestyle='--', alpha=0.5)
sns.despine(ax=ax)
fig.tight_layout()
plt.show()






In [None]:
# Distribution of node in-degrees, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(in_degrees, bins=100, color='skyblue', edgecolor='black', ax=ax)

ax.set_title("Histogram of Node In-Degrees", fontsize=16)
ax.set_xlabel("In-Degree", fontsize=14)
ax.set_ylabel("Number of Nodes", fontsize=14)
ax.grid(visible=True, which="both", linestyle='--', alpha=0.5)
sns.despine(ax=ax)
fig.tight_layout()
plt.show()


In [None]:
# Distribution of node out-degrees, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(out_degrees, bins=100, color='skyblue', edgecolor='black', ax=ax)

ax.set_title("Histogram of Node Out-Degrees", fontsize=16)
ax.set_xlabel("Out-Degree", fontsize=14)
ax.set_ylabel("Number of Nodes", fontsize=14)
ax.grid(visible=True, which="both", linestyle='--', alpha=0.5)
sns.despine(ax=ax)
fig.tight_layout()
plt.show()


In [None]:
# Diameter is only defined when every node can reach every other one, so it is
# computed on the whole graph when that holds and on the largest strongly
# connected component otherwise.
if nx.is_strongly_connected(G):
    diameter = nx.diameter(G)
    print("Diameter of the graph:", diameter)
else:
    largest_scc = max(nx.strongly_connected_components(G), key=len)
    G_sub = G.subgraph(largest_scc)

    diameter = nx.diameter(G_sub)
    print("Diameter of the largest strongly connected component:", diameter)

In [None]:
# Sanity check: count edges that start and end on the same node (a paper citing itself).
nx.number_of_selfloops(G) # should be 0

In [None]:
# Export the graph in GraphML format to citation_graph.graphml in the working directory.
nx.write_graphml(G, "citation_graph.graphml")

In [None]:
# Persist the paper records. open(..., 'wb') does not create missing parent
# folders, so make sure the output directory exists first.
papers_output_path = Path("data/papers.pkl")
papers_output_path.parent.mkdir(parents=True, exist_ok=True)
with open(papers_output_path, 'wb') as f:
    pickle.dump(papers, f)
