# ============================================================
# SciND ‚Üí OpenAlex Matching + Knowledge Graph Construction
# PREMIUM (API KEY) VERSION ‚Äî SINGLE SCRIPT
# ============================================================

In [None]:
import os
import glob
import re
import time
import pandas as pd
import requests
from difflib import SequenceMatcher
import hashlib

# ============================================================
# CONFIGURATION
# ============================================================

# Make sure you set this BEFORE running:
# Windows (PowerShell): setx OPENALEX_API_KEY "YOUR_KEY"
# Linux/Mac: export OPENALEX_API_KEY="YOUR_KEY"
# ============================================================
# MANUALLY SPECIFY YOUR OPENALEX PREMIUM API KEY HERE
# ============================================================

In [None]:
API_KEY = "b2opHjuXMNQJy9aTaB4bZF"

if not API_KEY or API_KEY != "b2opHjuXMNQJy9aTaB4bZF":
    raise ValueError("‚ùå Please paste your OpenAlex API key into API_KEY.")

REQUEST_DELAY = 0.02   # ~50 req/sec (safe for Premium)
CHECKPOINT_FILE = "../openalex_matches_checkpoint.csv"

# ============================================================
# PART 1: Load SciND Papers and Extract Titles
# ============================================================

In [None]:
print("üìö Loading SciND papers...")
triplet_files = glob.glob("*/Triplets/**/*.csv", recursive=True)

mapping_records = []

for file in triplet_files:
    df = pd.read_csv(file)

    if "SKG" in file:
        split = "SKG"
        stanza_split = "SKG_Papers"
    elif "Novel_Papers" in file:
        split = "NOVEL"
        stanza_split = "Novel_Papers"
    elif "Blogs" in file:
        split = "BLOG"
        stanza_split = "Blogs"
    else:
        continue

    filename = os.path.basename(file)
    domain_raw = filename.replace("_triplets.csv", "")
    domain = domain_raw.replace("_Blogs", "") if split == "BLOG" else domain_raw

    paper_ids = sorted(df["paper_ID"].unique())

    pattern = f"*/stanza_files/{stanza_split}/{domain}/*-Stanza-out.txt"
    directory = glob.glob(pattern)

    for pid in paper_ids:
        if pid < len(directory):
            filepath = directory[pid]
            fname = os.path.basename(filepath)

            title = fname.replace("-Stanza-out.txt", "")
            title = re.sub(r"^\d+\.", "", title).strip()

            mapping_records.append({
                "split": split,
                "domain": domain,
                "paper_ID": pid,
                "title": title
            })

paper_mapping = pd.DataFrame(mapping_records)

paper_mapping = paper_mapping[
    paper_mapping["split"].isin(["SKG", "NOVEL"])
].reset_index(drop=True)

print(f"‚úÖ Loaded {len(paper_mapping)} scholarly papers")

# ============================================================
# PART 2: Premium OpenAlex Client (Authenticated Pool)
# ============================================================

In [None]:
class OpenAlexClient:
    def __init__(self, api_key, delay=0.02):
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {api_key}",
            "User-Agent": "ScientificNoveltyProject/1.0"
        })

        self.delay = delay
        self.last_request = 0

        self.stats = {
            "total": 0,
            "success": 0,
            "rate_limited": 0,
            "not_found": 0
        }

    def _wait(self):
        elapsed = time.time() - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)

    def similarity(self, a, b):
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def search_paper(self, title):
        self.stats["total"] += 1
        self._wait()

        params = {
            "filter": f'title.search:"{title}"',
            "sort": "relevance_score:desc",
            "per-page": 5,
            "select": "id,display_name,publication_year,cited_by_count,referenced_works"
        }

        try:
            r = self.session.get("https://api.openalex.org/works", params=params, timeout=15)
            self.last_request = time.time()

            if r.status_code == 200:
                results = r.json()["results"]

                best_match = None
                best_score = 0

                for paper in results:
                    score = self.similarity(title, paper["display_name"])
                    if score > best_score and score > 0.6:
                        best_match = paper
                        best_score = score

                if best_match:
                    self.stats["success"] += 1
                    return {
                        "openalex_id": best_match["id"],
                        "publication_year": best_match["publication_year"],
                        "citation_count": best_match["cited_by_count"],
                        "referenced_works": best_match.get("referenced_works", [])
                    }

                self.stats["not_found"] += 1
                return None

            elif r.status_code == 429:
                self.stats["rate_limited"] += 1
                wait = 1.5 ** min(self.stats["rate_limited"], 6)
                print(f"‚ö†Ô∏è Rate limited. Backoff {wait:.2f}s")
                time.sleep(wait)
                return self.search_paper(title)

            else:
                self.stats["not_found"] += 1
                return None

        except Exception as e:
            print(f"‚ùå Error: {e}")
            self.stats["not_found"] += 1
            return None

    def batch_search(self, papers_df):
        results = []

        if os.path.exists(CHECKPOINT_FILE):
            existing = pd.read_csv(CHECKPOINT_FILE)
            results = existing.to_dict("records")
            processed = set(existing["paper_ID"])
            print(f"üì• Loaded checkpoint with {len(processed)} papers")
        else:
            processed = set()

        for idx, row in papers_df.iterrows():
            if row["paper_ID"] in processed:
                continue

            print(f"üîç {idx+1}/{len(papers_df)} {row['title'][:60]}")

            match = self.search_paper(row["title"])

            result = {**row.to_dict(), **(match or {
                "openalex_id": None,
                "publication_year": None,
                "citation_count": None,
                "referenced_works": None
            })}

            results.append(result)

            if idx % 50 == 0:
                pd.DataFrame(results).to_csv(CHECKPOINT_FILE, index=False)

        df = pd.DataFrame(results)
        df.to_csv(CHECKPOINT_FILE, index=False)
        return df

# ============================================================
# PART 3: Run Matching
# ============================================================


In [None]:
client = OpenAlexClient(API_KEY, REQUEST_DELAY)
paper_metadata = client.batch_search(paper_mapping)

matched_papers = paper_metadata[paper_metadata["openalex_id"].notna()].copy()

print(f"‚úÖ Match rate: {len(matched_papers)}/{len(paper_mapping)}")

# ============================================================
# PART 4: Build Knowledge Graph
# ============================================================

In [None]:
print("üèóÔ∏è Building Knowledge Graph...")

nodes = []

# Paper Nodes
for _, row in matched_papers.iterrows():
    nodes.append({
        "node_id": f"P_{row['openalex_id'].split('/')[-1]}",
        "node_type": "Paper",
        "year": row["publication_year"],
        "domain": row["domain"],
        "split": row["split"],
        "name": None
    })

# Load all triples
triples_df = pd.concat([pd.read_csv(f) for f in triplet_files], ignore_index=True)
triples_df = triples_df[triples_df["paper_ID"].isin(set(matched_papers["paper_ID"]))]

# Entity Nodes
entities = set(triples_df.iloc[:, 2].astype(str))

entity_lookup = {}
for ent in entities:
    eid = "E_" + hashlib.md5(ent.encode()).hexdigest()[:8]
    entity_lookup[ent] = eid
    nodes.append({
        "node_id": eid,
        "node_type": "Entity",
        "year": None,
        "domain": None,
        "split": None,
        "name": ent
    })

nodes_df = pd.DataFrame(nodes)
nodes_df.to_csv("nodes.csv", index=False)

# Knowledge Edges
knowledge_edges = []
paper_lookup = {
    r.paper_ID: f"P_{r.openalex_id.split('/')[-1]}"
    for r in matched_papers.itertuples()
}

for row in triples_df.itertuples():
    entity = str(row[3])
    if entity in entity_lookup:
        knowledge_edges.append({
            "source": paper_lookup[row.paper_ID],
            "target": entity_lookup[entity],
            "predicate": str(row[2])
        })

pd.DataFrame(knowledge_edges).to_csv("../knowledge_edges.csv", index=False)

# Citation Edges
citation_edges = []

for _, row in matched_papers.iterrows():
    src = f"P_{row['openalex_id'].split('/')[-1]}"
    refs = row["referenced_works"]

    if isinstance(refs, list):
        for ref in refs:
            citation_edges.append({
                "source": src,
                "target": f"P_{ref.split('/')[-1]}",
                "year": row["publication_year"]
            })

pd.DataFrame(citation_edges).to_csv("citation_edges.csv", index=False)

print("‚úÖ Knowledge Graph Built Successfully")
print(f"Nodes: {len(nodes_df)}")
print(f"Knowledge Edges: {len(knowledge_edges)}")
print(f"Citation Edges: {len(citation_edges)}")

In [81]:
a=pd.read_csv("../knowledge_edges.csv")
b=pd.read_csv("citation_edges.csv")

In [84]:
a

Unnamed: 0,source,target,predicate
0,P_W2986265153,E_37693cfc,0
1,P_W2986265153,E_37693cfc,0
2,P_W2986265153,E_37693cfc,0
3,P_W2986265153,E_093f65e0,0
4,P_W2986265153,E_093f65e0,0
...,...,...,...
235720,P_W3115997577,E_f7e6c855,27
235721,P_W3115997577,E_bf822969,27
235722,P_W3115997577,E_bf822969,27
235723,P_W3115997577,E_bf822969,27


In [83]:
b

Unnamed: 0,source,target,year
0,P_W2971141904,P_W11511616,2019.0
1,P_W2971141904,P_W22168010,2019.0
2,P_W2971141904,P_W630532510,2019.0
3,P_W2971141904,P_W2102423300,2019.0
4,P_W2971141904,P_W2120699290,2019.0
...,...,...,...
10913,P_W2251292973,P_W2998704965,2015.0
10914,P_W2251292973,P_W3104097132,2015.0
10915,P_W2251292973,P_W4285719527,2015.0
10916,P_W2251292973,P_W4294170691,2015.0


In [85]:
c=pd.read_csv("nodes.csv")

In [86]:
c

Unnamed: 0,node_id,node_type,year,domain,split,name
0,P_W3173691672,Paper,2021.0,Dia2021,NOVEL,
1,P_W3176450677,Paper,2021.0,Dia2021,NOVEL,
2,P_W3171266972,Paper,2021.0,Dia2021,NOVEL,
3,P_W3166143260,Paper,2021.0,Dia2021,NOVEL,
4,P_W3108508534,Paper,2021.0,Dia2021,NOVEL,
...,...,...,...,...,...,...
2707,E_9b72e31d,Entity,,,,540.0
2708,E_285e19f2,Entity,,,,503.0
2709,E_8e6b42f1,Entity,,,,471.0
2710,E_5878a7ab,Entity,,,,167.0


In [90]:
c["year"]>=2022

0       False
1       False
2       False
3       False
4       False
        ...  
2707    False
2708    False
2709    False
2710    False
2711    False
Name: year, Length: 2712, dtype: bool