In [1]:
import ollama
import time
import requests
import itertools
from typing import List, Dict, Set, Tuple
import json
import os
import pandas as pd


AGENT_1 = 'llama3.2:3b'
BEARER_TOKEN = "insert your semantic scholar key here"
SS_HEADERS = {"Authorization": f"Bearer {BEARER_TOKEN}"}
KEYWORD_PERMUTATION = 20

In [2]:
def query_ollama_model(model_name, prompt_text):
    try:
        print(f"Sending prompt to Ollama model: {model_name}...")
        response = ollama.chat(
            model=model_name,
            messages=[
                {
                    'role': 'user',
                    'content': prompt_text,
                },
            ]
        )
        print("Received response from Ollama.")
        return response['message']['content']
    except Exception as e:
        return f"An error occurred while communicating with Ollama: {e}"

def get_full_keyword_list(model_name, initial_prompt, target_count=50):
    # Get initial keywords
    keywords_response = query_ollama_model(model_name, initial_prompt)

    if "An error occurred" in keywords_response:
        print("Error in initial keyword generation")
        return []

    # Process the keywords
    keywords = [kw.strip() for kw in keywords_response.split(',') if kw.strip()]
    unique_keywords = list(dict.fromkeys(keywords))  # Remove duplicates while preserving order

    print(f"Initial API call generated {len(unique_keywords)} unique keywords")

    # Continue requesting more keywords until we reach the target
    attempt = 1
    max_attempts = 5

    while len(unique_keywords) < target_count and attempt < max_attempts:
        current_count = len(unique_keywords)
        remaining = target_count - current_count

        print(f"Attempt {attempt}: Need {remaining} more keywords")

        # Create a reprompt that includes existing keywords
        reprompt = f"""Based on the following existing keywords about Climate Change and Deep Learning:

        {', '.join(unique_keywords)}

        Please generate {remaining} ADDITIONAL unique concise keywords/terms that are NOT in the above list. These should be at the intersection of Climate Change and Deep Learning.

        Output ONLY the new keywords as a comma-separated list with no additional text, numbering, or formatting.
        """

        # Get additional keywords
        additional_response = query_ollama_model(model_name, reprompt)

        if "An error occurred" not in additional_response:
            # Process and add new keywords
            new_keywords = [kw.strip() for kw in additional_response.split(',') if kw.strip()]

            # Add only unique new keywords
            for kw in new_keywords:
                if kw not in unique_keywords:
                    unique_keywords.append(kw)

            print(f"Added {len(unique_keywords) - current_count} new unique keywords")
        else:
            print("Error in additional keyword generation")

        attempt += 1

    # Trim to exact target count if we got more than needed
    if len(unique_keywords) > target_count:
        unique_keywords = unique_keywords[:target_count]

    return unique_keywords

def save_keywords_to_json(keywords, output_path="output/keywords.json"):
    """
    Saves the list of keywords to a JSON file.

    Args:
        keywords (list): List of keywords to save
        output_path (str): Path where the JSON file will be saved

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Create the output directory if it doesn't exist
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Create a dictionary with the keywords
        keyword_data = {
            "count": len(keywords),
            "keywords": keywords
        }

        # Save to JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(keyword_data, f, indent=2)

        print(f"Keywords successfully saved to {output_path}")
        return True

    except Exception as e:
        print(f"Error saving keywords to JSON: {e}")
        return False

if __name__ == "__main__":
    # The prompt for Agent 1: Paper Retriever to generate keywords
    # for the topic "Climate Change and Deep Learning"
    PROMPT_FOR_KEYWORDS = f"""As an expert keyword generator for academic research, your task is to produce a comprehensive list of keywords. These keywords are for querying the Semantic Scholar API to find papers at the intersection of <Topic>Climate Change and Deep Learning</Topic>. The goal is to uncover original and novel research directions.

    Instructions for keyword generation:
    1.  Focus on combining concepts from 'Climate Change' and 'Deep Learning'.
    2.  Aim for keywords that are specific and could lead to novel research areas or innovative applications.
    3.  The output MUST be a single string containing the keywords.
    4.  Each keyword or keyword phrase MUST be separated by a comma and a space. For example: 'keyword1, keyword2, keyword3'.
    5.  Do NOT use bullet points, numbered lists, introductory phrases (like "Here is a list..."), or any other formatting. Only provide the comma-separated list of keywords.

    Generate a diverse set of 50 of these keywords. The generated keywords should be strictly 50 and only 50. Can't be more or less. Example of desired output format: 'Climate modeling, climate data, Arctic anomaly detection, Deep Learning, Computer Vision,Climate Prediction'.
    """

    # --- Execution ---
    print("Starting keyword generation process...")

    # Get a full list of 50 keywords with reprompting if needed
    complete_keywords = get_full_keyword_list(AGENT_1, PROMPT_FOR_KEYWORDS, 50)

    print("\n--- Final List of Keywords ---")
    print(f"Total unique keywords: {len(complete_keywords)}")

    if complete_keywords:
        print("\nKeywords as a numbered list:")
        for i, kw in enumerate(complete_keywords):
            print(f"{i+1}. {kw}")

        save_keywords_to_json(complete_keywords)
    else:
        print("Failed to generate keywords.")

Starting keyword generation process...
Sending prompt to Ollama model: llama3.2:3b...
Received response from Ollama.
Initial API call generated 37 unique keywords
Attempt 1: Need 13 more keywords
Sending prompt to Ollama model: llama3.2:3b...
Received response from Ollama.
Added 14 new unique keywords

--- Final List of Keywords ---
Total unique keywords: 50

Keywords as a numbered list:
1. climate modeling
2. climate data
3. Arctic anomaly detection
4. deep learning
5. computer vision
6. climate prediction
7. machine learning
8. predictive analytics
9. climate change mitigation
10. renewable energy systems
11. sustainable development
12. green technology
13. artificial intelligence
14. climate modeling uncertainties
15. climate resilience
16. geoengineering
17. climate tipping points
18. weather forecasting
19. satellite imagery
20. ocean currents
21. sea level rise
22. climate adaptation strategies
23. carbon footprint analysis
24. deep learning for climate modeling
25. climate chang

In [3]:
# Function to generate keyword permutations
def generate_keyword_permutations(keywords: List[str], set_size: int = KEYWORD_PERMUTATION) -> List[str]:
    """Generate all permutations of keywords in sets of set_size, joined with OR."""
    if len(keywords) < set_size:
        print(f"Warning: Only {len(keywords)} keywords available, but requested sets of {set_size}")
        set_size = len(keywords)

    permutations = []
    # Use combinations rather than permutations since order doesn't matter for OR searches
    for combo in itertools.islice(itertools.combinations(keywords, set_size),8):
        query = " OR ".join(combo)
        permutations.append(query)

    print(f"Generated {len(permutations)} permutations")
    return permutations


# ——— 3) Function to search for papers with a given query ———
def search_papers(query: str, limit: int = 15) -> List[Dict]:
    """Search for papers using the given query."""
    SEARCH_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

    search_params = {
        "query": query,
        "limit": limit,
        "fields": "paperId,title,externalIds,abstract"
    }

    try:
        resp = requests.get(SEARCH_URL, params=search_params, headers=SS_HEADERS)
        resp.raise_for_status()
        return resp.json().get("data", [])
    except requests.exceptions.RequestException as e:
        print(f"Search error for query '{query}': {e}")
        return []


# ——— 4) Function to get paper abstract from different sources ———
def get_paper_abstract(paper: Dict) -> str:
    """Attempt to get abstract from Semantic Scholar, then OpenAlex if needed."""
    # Check if we already have the abstract
    abstract = paper.get("abstract")
    if abstract:
        return abstract

    # If not, try to fetch it from Semantic Scholar
    DETAIL_URL = "https://api.semanticscholar.org/graph/v1/paper/"
    pid = paper["paperId"]

    try:
        detail_params = {"fields": "abstract,externalIds"}
        r = requests.get(DETAIL_URL + pid, params=detail_params, headers=SS_HEADERS)
        r.raise_for_status()
        data = r.json()
        abstract = data.get("abstract")

        # If SS returned no abstract, try OpenAlex
        if not abstract:
            ext = data.get("externalIds") or paper.get("externalIds", {})
            oa_id = ext.get("OpenAlex") if ext else None

            if oa_id:
                oa_url = f"https://api.openalex.org/works/{oa_id}"
                oa_r = requests.get(oa_url)
                if oa_r.ok:
                    oa_data = oa_r.json()
                    inv_idx = oa_data.get("abstract_inverted_index") or {}
                    if inv_idx:
                        # Reconstruct plain text abstract from inverted index
                        pos_map = {}
                        for word, positions in inv_idx.items():
                            for pos in positions:
                                pos_map[pos] = word
                        # Build ordered list of words
                        abstract = " ".join(
                            pos_map[i] for i in range(len(pos_map))
                            if i in pos_map
                        )

        return abstract or "(no abstract available)"

    except requests.exceptions.RequestException as e:
        print(f"Error fetching abstract for paper {pid}: {e}")
        return "(error retrieving abstract)"


# ——— 5) Main function to run the entire process ———
def main():
    # Generate permutations
    query_permutations = generate_keyword_permutations(complete_keywords, set_size=4)

    # Track unique papers by ID to avoid duplicates
    unique_papers: Dict[str, Dict] = {}
    papers_per_query: Dict[str, Set[str]] = {}

    # Process each permutation
    for i, query in enumerate(query_permutations):
        print(f"Processing query {i + 1}/{len(query_permutations)}: {query}")

        # Search for papers
        papers = search_papers(query)
        papers_per_query[query] = set()

        for paper in papers:
            pid = paper["paperId"]
            papers_per_query[query].add(pid)

            # Skip if we already have this paper
            if pid in unique_papers:
                continue

            # Get the abstract if not already included
            if not paper.get("abstract"):
                paper["abstract"] = get_paper_abstract(paper)

            # Add to our collection of unique papers
            unique_papers[pid] = paper
        
        if len(unique_papers) == 15:
           break
        print(f"Found {len(papers)} papers, {len(unique_papers)} unique papers so far")

        # Respect rate limits (wait 1 second between queries)
        time.sleep(1)
    #save unique_papers as a csv file
    df = pd.DataFrame(unique_papers.values())
    df.to_csv("output/unique_papers.csv", index=False)
    # Print results
    print("\n" + "=" * 80)
    print(f"RESULTS: Found {len(unique_papers)} unique papers across {len(query_permutations)} queries")
    print("=" * 80 + "\n")

    with open("output/unique_paper.json", 'w', encoding='utf-8') as f:
            json.dump(unique_papers, f, indent=2)

    # Print each unique paper
    for i, (pid, paper) in enumerate(unique_papers.items()):
        title = paper.get("title", "(no title)")
        abstract = paper.get("abstract", "(no abstract available)")

        print(f"Paper {i + 1}/{len(unique_papers)}")
        print(f"ID:      {pid}")
        print(f"Title:   {title}")
        print(f"Abstract: {abstract}\n")
        print("─" * 80 + "\n")

    # Analyze which queries were most productive
    query_productivity = [(query, len(pids)) for query, pids in papers_per_query.items()]
    query_productivity.sort(key=lambda x: x[1], reverse=True)

    print("\nMost productive queries:")
    for query, count in query_productivity[:5]:
        print(f"- '{query}': {count} papers")


if __name__ == "__main__":
    main()


Generated 8 permutations
Processing query 1/8: climate modeling OR climate data OR Arctic anomaly detection OR deep learning
Search error for query 'climate modeling OR climate data OR Arctic anomaly detection OR deep learning': 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=climate+modeling+OR+climate+data+OR+Arctic+anomaly+detection+OR+deep+learning&limit=15&fields=paperId%2Ctitle%2CexternalIds%2Cabstract
Found 0 papers, 0 unique papers so far
Processing query 2/8: climate modeling OR climate data OR Arctic anomaly detection OR computer vision
Found 0 papers, 0 unique papers so far
Processing query 3/8: climate modeling OR climate data OR Arctic anomaly detection OR climate prediction
Search error for query 'climate modeling OR climate data OR Arctic anomaly detection OR climate prediction': 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=climate+modeling+OR+climate+data+OR+Arctic+anomaly+detection+OR+cl

: 

: 

: 

: 