In [1]:

import os
import json
from openai import OpenAI
from dotenv import load_dotenv



load_dotenv("api_key.env")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

acm_ccs_structure = [
    {
        "high_level_domain": "General and reference",
        "subdomains": [
            {
                "subdomain": "Document types",
                "examples": ["Surveys and overviews", "Reference works", "General conference proceedings", "Biographies", "General literature", "Computing standards, RFCs and guidelines"
                ]
            },
            {
                "subdomain": "Cross-computing tools and techniques",
                "examples": ["Reliability", "Empirical studies", "Measurement", "Metrics","Evaluation", "Experimentation", "Estimation", "Design", "Performance", "Validation", "Verification"
                ]
            }
        ]
    },
    {
        "high_level_domain": "Computer systems organization",
        "subdomains": [
            {"subdomain": "Architectures", "examples": ["Serial architectures", "Parallel architectures", "Distributed architectures", "Other architectures"]},
            {"subdomain": "Embedded and cyber-physical systems", "examples": ["Sensor networks", "Robotics", "Sensors and actuators", "System on a chip", "Embedded systems"]},
            {"subdomain": "Real-time systems", "examples": ["Real-time operating systems", "Real-time languages", "Real-time system specification", "Real-time system architecture"]},
            {"subdomain": "Dependable and fault-tolerant systems and networks", "examples": ["Reliability", "Availability", "Maintainability and maintenance", "Processors and memory architectures", "Secondary storage organization", "Redundancy", "Fault-tolerant network topologies"]}
        ]
    },
    {
        "high_level_domain": "Hardware",
        "subdomains": [
            {"subdomain": "Printed circuit boards", "examples": ["Electromagnetic interference and compatibility", "PCB design and layout"]},
            {"subdomain": "Communication hardware, interfaces and storage", "examples": ["Signal processing systems", "Sensors and actuators", "Buses and high-speed links", "Displays and imagers", "External storage", "Networking hardware", "Printers", "Sensor applications and deployments", "Sensor devices and platforms", "Sound-based input / output", "Tactile and hand-based interfaces", "Scanners", "Wireless devices", "Wireless integrated network sensors", "Electro-mechanical devices" ]},
            {"subdomain": "Very large scale integration design", "examples": ["3D integrated circuits", "Analog and mixed-signal circuits", "Application-specific VLSI designs", "Design reuse and communication-based design", "Design rules", "Economics of chip design and manufacturing", "Full-custom circuits", "VLSI design manufacturing considerations", "On-chip resource management", "On-chip sensors", "Standard cell libraries", "VLSI packaging", "VLSI system specification and constraints"]},
            {"subdomain": "Power and energy", "examples": ["Thermal issues", "Energy generation and storage", "Energy distribution", "Impact on the environment", "Power estimation and optimization"]},
            {"subdomain": "Hardware validation", "examples": ["Functional verification", "Physical verification", "Post-manufacture validation and debug"]},
            {"subdomain": "Hardware test", "examples": ["Analog, mixed-signal and radio frequency test", "Physical verification", "Post-manufacture validation and debug"]},
            {"subdomain": "Electronic design automation", "examples": ["High-level and register-transfer level synthesis", "Hardware description languages and compilation", "Logic synthesis", "Modeling and parameter extraction", "Physical design (EDA)", "Timing analysis", "Methodologies for EDA"]},
            {"subdomain": "Robustness", "examples": ["Fault tolerance", "Design for manufacturability", "Hardware reliability", "Safety critical systems"]},
            {"subdomain": "Emerging technologies", "examples": ["Analysis and design of emerging devices and systems", "Hardware description languages and compilation", "Biology-related information processing", "Circuit substrates", "Electromechanical systems", "Emerging interfaces", "Memory and dense storage", "Emerging optical and photonic technologies", "Reversible logic", "Plasmonics", "Quantum technologies", "Spintronics and magnetic technologies"]}

        ]
    },
    {
        "high_level_domain": "Networks",
        "subdomains": [
            {"subdomain": "Network architectures", "examples": ["Network design principles", "Programming interfaces"]},
            {"subdomain": "Network protocols", "examples": ["Network protocol design", "Protocol correctness", "Link-layer protocols", "Network layer protocols", "Transport protocols", "Session protocols", "Presentation protocols", "Application layer protocols", "OAM protocols", "Cross-layer protocols", "Network File System (NFS) protocol"]},
            {"subdomain": "Network components", "examples": ["Intermediate nodes", "Physical links", "Middle boxes / network appliances", "End nodes", "Wireless access points, base stations and infrastructure", "Logical nodes"]},
            {"subdomain": "Network algorithms", "examples": ["Data path algorithms", "Control path algorithms", "Network economics"]},
            {"subdomain": "Network performance evaluation", "examples": ["Network performance modeling", "Network simulations", "Network experimentation", "Network performance analysis", "Network measurement"]},
            {"subdomain": "Network properties", "examples": ["Network security", "Network range", "Network structure", "Network dynamics", "Network reliability", "Network mobility", "Network manageability", "Network privacy and anonymity"]},
            {"subdomain": "Network services", "examples": ["Naming and addressing", "Cloud computing", "Location based services", "Programmable networks", "In-network processing", "Network management", "Network monitoring"]},
            {"subdomain": "Network types", "examples": ["Network on chip", "Home networks", "Storage area networks", "Data center networks", "Wired access networks", "Cyber-physical networks", "Mobile networks", "Overlay and other logical network structures", "Wireless access networks", "Ad hoc networks", "Public Internet", "Packet-switching networks"]}
        ]
    },
    {
        "high_level_domain": "Software and its engineering",
        "subdomains": [
            {"subdomain": "Software organization and properties", "examples": ["Contextual software domains", "Software system structures", "Software functional properties", "Extra-functional properties"]},
            {"subdomain": "Software notations and tools", "examples": ["General programming languages", "Formal language definitions", "Compilers", "Context specific languages", "System description languages", "Development frameworks and environments", "Software configuration management and version control systems", "Software libraries and repositories", "Software maintenance tools"]},
            {"subdomain": "Software creation and management", "examples": ["Designing software", "Software development process management", "Software development techniques", "Software verification and validation", "Software post-development issues", "Collaboration in software development", "Search-based software engineering"]}
        ]
    },
    {
        "high_level_domain": "Information systems",
        "subdomains": [
            {"subdomain": "Data management systems", "examples": ["Database design and models", "Data structures", "Database management system engines", "Query languages", "Database administration", "Information integration", "Middleware for databases"]},
            {"subdomain": "Information storage systems", "examples": ["Information storage technologies", "Record storage systems", "Storage replication", "Storage architectures", "Storage management"]},
            {"subdomain": "Information systems applications", "examples": ["Enterprise information systems", "Collaborative and social computing systems and tools", "Spatial-temporal systems", "Decision support systems", "Mobile information processing systems", "Process control systems", "Multimedia information systems", "Data mining", "Digital libraries and archives", "Computational advertising", "Computing platforms"]},
            {"subdomain": "World Wide Web", "examples": ["Web searching and information discovery", "Online advertising", "Web mining", "Web applications", "Web interfaces", "Web services", "Web data description languages"]},
            {"subdomain": "Information retrieval", "examples": ["Document representation", "Information retrieval query processing", "Users and interactive retrieval", "Retrieval models and ranking", "Retrieval tasks and goals", "Evaluation of retrieval results", "Search engine architectures and scalability", "Specialized information retrieval"]}
        ]
    },
    {
        "high_level_domain": "Security and privacy",
        "subdomains": [
            {"subdomain": "Cryptography", "examples": ["Key management", "Public key (asymmetric) techniques", "Symmetric cryptography and hash functions", "Cryptanalysis and other attacks", 
            "Information-theoretic techniques", "Mathematical foundations of cryptography"]},
            {"subdomain": "Formal methods and theory of security", "examples": ["Trust frameworks", "Security requirements", "Formal security models", "Logic and verification"]},
            {"subdomain": "Security services", "examples": ["Authentication", "Access control", "Pseudonymity, anonymity and untraceability", "Privacy-preserving protocols", 
            "Digital rights management", "Authorization"]},
            {"subdomain": "Intrusion/anomaly detection and malware mitigation", "examples": ["Malware and its mitigation", "Intrusion detection systems", "Social engineering attacks"]},
            {"subdomain": "Security in hardware", "examples": ["Tamper-proof and tamper-resistant designs", "Embedded systems security", "Hardware security implementation", 
            "Hardware attacks and countermeasures", "Hardware reverse engineering"]},
            {"subdomain": "Systems security", "examples": ["Operating systems security", "Browser security", "Distributed systems security", "Information flow control", 
            "Denial-of-service attacks", "Firewalls", "Vulnerability management", "File system security"]},
            {"subdomain": "Network security", "examples": ["Security protocols", "Web protocol security", "Mobile and wireless security", "Denial-of-service attacks", "Firewalls"]},
            {"subdomain": "Database and storage security", "examples": ["Data anonymization and sanitization", "Management and querying of encrypted data", 
            "Information accountability and usage control", "Database activity monitoring"]},
            {"subdomain": "Software and application security", "examples": ["Software security engineering", "Web application security", "Social network security and privacy",
            "Domain-specific security and privacy architectures", "Software reverse engineering"]},
            {"subdomain": "Human and societal aspects of security and privacy", "examples": ["Economics of security and privacy", "Social aspects of security and privacy",
            "Privacy protections", "Usability in security and privacy"]}
        ]
    },
    {
        "high_level_domain": "Human-centered computing",
        "subdomains": [
            {"subdomain": "Human computer interaction (HCI)", "examples": ["HCI design and evaluation methods", "Interaction paradigms", "Interaction devices", "HCI theory, concepts and models", "Interaction techniques", "Interactive systems and tools", "Empirical studies in HCI"]},
            {"subdomain": "Interaction design", "examples": ["Interaction design process and methods", "Interaction design theory, concepts and paradigms", "Empirical studies in interaction design", "Systems and tools for interaction design"]},
            {"subdomain": "Collaborative and social computing", "examples": ["Collaborative and social computing theory, concepts and paradigms", "Collaborative and social computing design and evaluation methods", "Collaborative and social computing systems and tools", "Empirical studies in collaborative and social computing", "Collaborative and social computing devices"]},
            {"subdomain": "Ubiquitous and mobile computing", "examples": ["Ubiquitous and mobile computing theory, concepts and paradigms", "Ubiquitous and mobile computing systems and tools", "Ubiquitous and mobile devices", "Ubiquitous and mobile computing design and evaluation methods", "Empirical studies in ubiquitous and mobile computing"]},
            {"subdomain": "Visualization", "examples": ["Visualization techniques", "Visualization application domains", "Visualization systems and tools", "Visualization theory, concepts and paradigms", "Empirical studies in visualization", "Visualization design and evaluation methods"]},
            {"subdomain": "Accessibility", "examples": ["Accessibility theory, concepts and paradigms", "Empirical studies in accessibility", "Accessibility design and evaluation methods", "Accessibility technologies", "Accessibility systems and tools"]}
        ]
    },
    {
        "high_level_domain": "Applied computing",
        "subdomains": [
            {"subdomain": "Electronic commerce", "examples": ["Digital cash", "E-commerce infrastructure", "Electronic data interchange", "Electronic funds transfer", "Online shopping", "Online banking", "Secure online transactions", "Online auctions"]},
            {"subdomain": "Enterprise computing", "examples": ["Enterprise information systems", "Business process management", "Enterprise architectures", "Service-oriented architectures", "Event-driven architectures", "Business rules", "Enterprise modeling", "Enterprise ontologies, taxonomies and vocabularies", "Enterprise data management", "Reference models", "Business-IT alignment", "IT architectures", "IT governance", "Enterprise computing infrastructures", "Enterprise interoperability"]},
            {"subdomain": "Physical sciences and engineering", "examples":["Aerospace", "Archaeology", "Astronomy", "Chemistry", "Earth and atmospheric sciences", "Engineering", "Physics"]},
            {"subdomain": "Life and medical sciences", "examples": ["Computational biology", "Genomics", "Systems biology", "Consumer health", "Health care information systems", "Health informatics", "Bioinformatics", "Metabolomics / metabonomics", "Genetics"]},
            {"subdomain": "Law, social and behavioral sciences", "examples": ["Anthropology", "Law", "Psychology", "Economics", "Sociology"]},
            {"subdomain": "Computer forensics", "examples": ["Surveillance mechanisms", "Investigation techniques", "Evidence collection, storage and analysis", "Network forensics", "System forensics", "Data recovery"]},
            {"subdomain": "Arts and humanities", "examples": ["Fine arts", "Performing arts", "Architecture (buildings)", "Language translation", "Media arts", "Sound and music computing"]},
            {"subdomain": "Computers in other domains", "examples": ["Digital libraries and archives", "Publishing", "Military", "Cartography", "Agriculture", "Computing in government", "Personal computers and PC applications"]},
            {"subdomain": "Operations research", "examples": ["Consumer products", "Industry and manufacturing", "Computer-aided manufacturing", "Decision analysis", "Transportation", "Forecasting", "Marketing"]},
            {"subdomain": "Education", "examples": ["Digital libraries and archives", "Computer-assisted instruction", "Interactive learning environments", "Collaborative learning", "Learning management systems", "Distance learning", "E-learning", "Computer-managed instruction"]},
            {"subdomain": "Document management and text processing", "examples": ["Document searching", "Document management", "Document capture", "Document preparation"]},

        ]
    },
    {
        "high_level_domain": "Mathematics of computing",
        "subdomains": [
            {"subdomain": "Discrete mathematics", "examples": ["Combinatorics", "Graph theory"]},
            {"subdomain": "Probability and statistics", "examples": ["Probabilistic representations","Probabilistic inference problems","Probabilistic reasoning algorithms", "Probabilistic algorithms", "Statistical paradigms", "Stochastic processes", "Nonparametric statistics", "Distribution functions", "Multivariate statistics"]},
            {"subdomain": "Mathematical software", "examples": ["Solvers", "Statistical software", "Mathematical software performance"]},
            {"subdomain": "Information theory","examples": ["Coding theory"]},
            {"subdomain": "Mathematical analysis", "examples": ["Numerical analysis", "Mathematical optimization", "Differential equations", "Calculus", "Functional analysis", "Integral equations", "Nonlinear equations", "Quadrature"]},
            {"subdomain": "Continuous mathematics", "examples": ["Calculus", "Topology", "Continuous functions"]}
        ]
    },
    {
        "high_level_domain": "Theory of computation",
        "subdomains": [
            {"subdomain": "Models of computation", "examples": ["Computability", "Probabilistic computation", "Quantum computation theory", "Interactive computation", "Streaming models", "Concurrency", "Timed and hybrid models", "Abstract machines"]},
            {"subdomain": "Formal languages and automata theory", "examples": ["Formalisms", "Automata over infinite objects", "Grammars and context-free languages", "Tree languages", "Automata extensions", "Regular languages"]},
            {"subdomain": "Computational complexity and cryptoraphy", "examples": ["Problems, reductions and completeness", "Communication complexity", "Circuit complexity", "Oracles and decision trees", "Algebraic complexity theory", "Quantum complexity theory", "Proof complexity", "Interactive proof systems", "Complexity theory and logic", "Cryptographic primitives", "Cryptographic protocols"]},
            {"subdomain": "Logic", "examples":["Logic and verification", "Proof theory", "Modal and temporal logics", "Automated reasoning", "Constraint and logic programming", "Constructive mathematics", "Description logics", "Equational logic and rewriting", "Finite Model Theory", "Higher order logic", "Linear logic", "Programming logic", "Abstraction", "Verification by model checking", "Type theory", "Hoare logic", "Separation logic"]},
            {"subdomain": "Design and analysis of algorithms", "examples": ["Graph algorithms analysis", "Approximation algorithms analysis", "Mathematical optimization", "Data structures design and analysis", "Online algorithms", "Parameterized complexity and exact algorithms", "Streaming, sublinear and near linear time algorithms", "Parallel algorithms", "Distributed algorithms", "Algorithm design techniques", "Concurrent algorithms"]},
            {"subdomain": "Randomness, geometry and discrete structures", "examples": ["Pseudorandomness and derandomization", "Computational geometry", "Generating random combinatorial structures", "Random walks and Markov chains", "Expander graphs and randomness extractors", "Error-correcting codes", "Random projections and metric embeddings", "Random network models", "Random search heuristics"]},
            {"subdomain": "Semantics and reasoning", "examples": ["Program constructs", "Program semantics", "Program reasoning"]}

        ]
    },
    {
        "high_level_domain": "Computing methodologies",
        "subdomains": [
            {"subdomain": "Symbolic and algebraic manipulation", "examples": ["Symbolic and algebraic algorithms", "Computer algebra systems", "Representation of mathematical objects"]},
            {"subdomain": "Parallel computing methodologies", "examples": ["Parallel algorithms", "Parallel programming languages"]},
            {"subdomain": "Artificial intelligence", "examples": ["Natural language processing", "Knowledge representation and reasoning", "Planning and scheduling", "Search methodologies", "Control methods", "Philosophical/theoretical foundations of artificial intelligence", "Distributed artificial intelligence", "Computer vision"]},
            {"subdomain": "Machine learning", "examples": ["Learning paradigms", "Learning settings", "Machine learning approaches", "Machine learning algorithms", "Cross-validation"]},
            {"subdomain": "Modeling and simulation", "examples": ["Model development and analysis", "Simulation theory", "Simulation types and techniques", "Simulation support systems", "Simulation evaluation"]},
            {"subdomain": "Computer graphics", "examples": ["Animation", "Rendering", "Image manipulation", "Graphics systems and interfaces", "Image compression", "Shape modeling"]},
            {"subdomain": "Distributed computing methodologies", "examples": ["Distributed algorithms", "Distributed programming languages"]},
            {"subdomain": "Concurrent computing methodologies", "examples": ["Concurrent programming languages", "Concurrent algorithms"]}
        ]
    },
    {
    
        "high_level_domain": "Social and professional topics",
        "subdomains": [
            {"subdomain": "Professional topics", "examples": ["Computing industry", "Management of computing and information systems", "History of computing", "Computing education", "Computing and business", "Computing profession"]},
            {"subdomain": "Computing / technology policy", "examples": ["Intellectual property", "Privacy policies", "Censorship", "Surveillance", "Commerce policy", "Network access control", "Computer crime", "Government technology policy", "Medical information policy"]},
            {"subdomain": "User characteristics", "examples": ["Race and ethnicity", "Religious orientation", "Gender", "Sexual orientation", "People with disabilities", "Geographic characteristics", "Cultural characteristics", "Age"]}
        ]
    }
]

acm_json = json.dumps(acm_ccs_structure, indent=4)


In [2]:
def build_sub_subdomain_map(acm_structure):
    mapping = {}
    for domain in acm_structure:
        high_domain = domain["high_level_domain"]
        for sub in domain["subdomains"]:
            subdomain = sub["subdomain"]
            for example in sub.get("examples", []):
                mapping[example.lower()] = {"high_level_domain": high_domain, "subdomain": subdomain}
    return mapping


In [10]:
def generate_system_prompt(paper, task):
    title = paper['title']
    content = paper['content']

    if task == "title":
        return f"""
        You are tasked with extracting the title of the provided cybersecurity paper.

        Guidelines:
        \t1. The title is often at the top of the first page.
        \t2. Extract the title in its entirety.

        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your response must be returned in the following JSON format:
        {{
            "title": "Title of the paper here"
        }}

        Your response: """
    
    elif task == "conference_name":
         
         return f"""
        You are tasked with extracting the conference name where the paper was presented.
        
        Guidelines:
        \t1. The conference name is usually found at the top or bottom of the first page.
        \t2. Use the short form (USS, NDSS, ACSAC, SP, CCS) if applicable.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "conference": "Short form of conference name (USS, NDSS, ACSAC, SP, CCS)"
        }}

        Your response: """

    elif task == "published_year":
        return f"""
        You are tasked with extracting the year of publication from the provided cybersecurity paper.
        
        Guidelines:
        \t1. The year of publication is usually found near the conference name or at the bottom of the first page.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "year": "Year of publication here"
        }}
        
        Your response: """

   

    elif task == "domain":
        acm_json = json.dumps(acm_ccs_structure, indent=4)

        subdomain_map = {}
        for domain in acm_ccs_structure:
            for sub in domain["subdomains"]:
                for example in sub.get("examples", []):
                    subdomain_map[example] = {
                        "high_level_domain": domain["high_level_domain"],
                        "subdomain": sub["subdomain"]
                    }
        flat_map_json = json.dumps(subdomain_map, indent=4)

        return f"""
        You are tasked with identifying the **ACM Computing Classification System (CCS)** research domains for the following cybersecurity paper titled: \"{title}\".

        Your job is to return the correct **\"high_level_domain\"** and its corresponding **\"subdomain\"**, based on the paper’s content and the ACM CCS structure
        provided below.

         Guideline:
         t1. **Strictly follow the ACM CCS structure** below.
         \t- For each domain, you MUST use:
         \t- **high_level_domain**: One of the official ACM categories (e.g., "Security and privacy").
         \t- **subdomain**: Choose ONLY from the subdomains under the high-level domain as defined in the ACM structure.
         
         t2. Never invent sub-subdomains. The ACM CCS structure you are given ends at the subdomain level.
         \t- Even if the paper mentions a sub-subdomain (like "Usability in security and privacy"), your job is to **map it to the correct subdomain**
         from the ACM structure.
         \t- Use the mapping guide provided below under <SUB_SUBDOMAIN_MAPPING> to assist with this mapping task.
        
         t3. You are allowed to return **multiple (high_level_domain, subdomain)** pairs if the paper covers more than one domain.
         t4. Special Rule for NDSS / USENIX Security / IEEE S&P:
         \t- If the conference is **NDSS**, **USENIX Security**, or **IEEE S&P**, always treat the main high-level domain as "Security and privacy" (if relevant)
         \t  and identify the correct **subdomain** based on the paper content (typically "Network security" or "Systems security", etc.).

         t5. Special Rule for ACM CCS and ACSAC:
         \t- If the venue is **ACM CCS** or **ACSAC**, you should primarily determine the domain from the **"CCS Concepts"** section that appears on the first page.
         \t- Read the CCS Concepts block and map the listed CCS terms to the closest (high_level_domain, subdomain) pairs in the ACM CCS structure.
         \t- Only fall back to scanning the rest of the paper if the CCS Concepts section is missing or too generic to classify.
        
         Json Output Examples:
         Example #1 : For the paper titled **"(Un)informed Consent: Studying GDPR Consent Notices in the Field"** with the following CCS concepts:

         • Security and privacy → Usability in security and privacy
         • Human-centered computing → Empirical studies in interaction design
         • Social and professional topics → Government technology policy

         You should respond with:

         ```json
         [
           {{
             "high_level_domain": "Security and privacy",
             "subdomain": "Human and societal aspects of security and privacy"
           }},
           {{
             "high_level_domain": "Human-centered computing",
             "subdomain": "Interaction design"
           }},
           {{
             "high_level_domain": "Social and professional topics",
             "subdomain": "Computing / technology policy"
           }}
         ]
         ```

         Here is the full ACM CCS structure:
         <ACM_CCS_START>
         {acm_json}
         <ACM_CCS_END>

         Sub-subdomain mapping to subdomains:
         <SUB_SUBDOMAIN_MAPPING>
         {flat_map_json}
         <SUB_SUBDOMAIN_MAPPING_END>

         Start of Paper Content:
         {content}
         End of Paper Content:

         Your response:"""
        
        return f"Error: Task '{task}' is not supported"

In [4]:
def load_papers_from_jsonl(filepath):
    papers = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            papers.append(json.loads(line.strip()))
    return papers

def save_incremental_results(results, output_file="acm_results_incremental.jsonl"):
    with open(output_file, "a", encoding="utf-8") as file:
        for paper_title, result in results.items():
            file.write(json.dumps({paper_title: result}) + "\n")

def load_saved_results(output_file="acm_results_incremental.jsonl"):
    saved_titles = set()
    saved_results = {}
    try:
        with open(output_file, "r", encoding="utf-8") as file:
            for line in file:
                result = json.loads(line.strip())
                for title, data in result.items():
                    saved_titles.add(title)
                    saved_results[title] = data
    except FileNotFoundError:
        print("No previous results found.")
    return saved_titles, saved_results


In [5]:
def process_acm_domain_task(
    papers,
    start_index=0,
    output_file="acm_results_incremental.jsonl"
):
    """
    Runs four tasks per paper using GPT-5.1 models:
      - title
      - conference_name
      - published_year
      - domain

    Uses incremental saving so you can resume.
    """
    task_results = {}

    # Load already processed titles from previous runs
    processed_titles, _ = load_saved_results(output_file)

    # You can tune models per task if you want
    TASK_MODELS = {
        "title": "gpt-4o-mini",
        "conference_name": "gpt-4o-mini",
        "published_year": "gpt-4o-mini",
        "domain": "gpt-5.1",   # domain is a bit harder → use full model
    }

    tasks = ["title", "conference_name", "published_year", "domain"]

    for i, paper in enumerate(papers, start=start_index):
        title = paper.get("title", "").strip()
        if not title:
            print(f"Skipping paper {i + 1}: missing title")
            continue

        if title in processed_titles:
            print(f"Skipping already processed paper: {title}")
            continue

        print(f"Processing paper {i + 1}/{len(papers)}: {title}")
        task_results[title] = {}

        for task in tasks:
            prompt = generate_system_prompt(paper, task)
            model_name = TASK_MODELS.get(task, "gpt-5.1-mini")

            try:
                response = client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                    max_completion_tokens=800,
                )
                result = response.choices[0].message.content
                print(f"  {task} done")
                task_results[title][task] = result

            except Exception as e:
                print(f"  Error{task}: {e}")
                task_results[title][task] = f"error: {str(e)}"

        # Save incremental results after each paper
        save_incremental_results({title: task_results[title]}, output_file)

    return task_results


In [8]:
jsonl_path = "papers_extracted.jsonl"
papers = load_papers_from_jsonl(jsonl_path)

results = process_acm_domain_task(papers)


Skipping already processed paper: (Un)informed Consent  Studying GDPR Consent Notices in the Field.
Skipping already processed paper: (Un)linkable Pseudonyms for Governmental Databases.
Skipping already processed paper: -programme oblivisync-practical-oblivious-file-backup-and-synchronization
Skipping already processed paper: 00SEVen Re enabling Virtual Machine Forensics Introspecting Confidential VMs Using Privileged in VM Agents
Skipping already processed paper: 1 Trillion Dollar Refund  How To Spoof PDF Signatures.
Skipping already processed paper: 27 Years and 81 Million Opportunities Later Investigating the Use of Email Encryption for an Entire University
Skipping already processed paper: 28 Blinks Later  Tackling Practical Challenges of Eye Movement Biometrics.
Skipping already processed paper: 50 Shades of Support A Device Centric Analysis of Android Security Updates
Skipping already processed paper: 50 Ways to Leak Your Data An Exploration of Apps Circumvention of the Android P

In [9]:
import pandas as pd
import json

# File paths
acm_file = "acm_results_incremental.csv"
results_file = "results_final_2.0_update.csv"
output_file = "results_final_2.0_update_with_domains.csv"

# Load CSVs
acm_df = pd.read_csv(acm_file, encoding="utf-8", on_bad_lines='skip')
results_df = pd.read_csv(results_file, encoding="ISO-8859-1")

# Extract plain title string from JSON
def extract_title(raw):
    try:
        raw = str(raw).strip()
        if raw.startswith("```json"):
            raw = raw[7:].strip()
        if raw.startswith("```"):
            raw = raw[3:].strip()
        if raw.endswith("```"):
            raw = raw[:-3].strip()
        data = json.loads(raw)
        return data.get("title", "").strip()
    except Exception:
        return None

# Clean titles in both files
acm_df['parsed_title'] = acm_df['title'].apply(extract_title)
results_df['parsed_title'] = results_df['title'].apply(extract_title)

# Map title to raw domain string (not parsed, keep full JSON blob)
acm_df['raw_domain'] = acm_df['domain'].astype(str).str.strip()
title_to_raw_domain = dict(zip(acm_df['parsed_title'], acm_df['raw_domain']))

# Copy domain values by matching titles
matched_domains = []
for title in results_df['parsed_title']:
    domain = title_to_raw_domain.get(title, "")
    matched_domains.append(domain)

# Replace the 6th column (index 5 = domain)
results_df.iloc[:, 5] = matched_domains

# Drop helper column
results_df.drop(columns=['parsed_title'], inplace=True)

# Save output
results_df.to_csv(output_file, index=False, encoding='utf-8')
print(f"Domain data successfully merged into: {output_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'acm_results_incremental.csv'