In [1]:
!pip install requests pandas



In [2]:
!pip install langdetect



In [3]:
import requests
import pandas as pd
import time  # For handling API rate limits
import os
from google.colab import files
from datetime import datetime
from langdetect import detect, LangDetectException # detect = language identifier, LangDetectException = error if detection fails

# Semantic Scholar API key: taken from the environment when set, otherwise a
# placeholder string that must be replaced before real use
SS_API = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "Insert Sematic Scholar API Key")

# Publication-year window used by the search: from 20 years ago through this year
current = datetime.today().year
start = current - 20

# Fetch AI/ML research papers from the Semantic Scholar search API with pagination
def _request_with_backoff(url, headers, params, max_retries=6, timeout=30):
    """Send one GET request, retrying with exponential backoff on HTTP 429.

    Args:
        url: Endpoint to call.
        headers: HTTP headers sent with every attempt.
        params: Query-string parameters sent with every attempt.
        max_retries: Maximum number of retries after a rate-limited response.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The last response received. Its status may still be 429 when every
        retry was rate-limited, so callers must check ``status_code``.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    wait_time = 2  # Initial wait in seconds; doubled after each rate-limited attempt
    retries = 0
    while response.status_code == 429 and retries < max_retries:
        print(f"Rate limit exceeded. Waiting for {wait_time} seconds...")
        time.sleep(wait_time)
        wait_time *= 2
        retries += 1
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    return response


def _paper_to_row(paper):
    """Convert one API paper record into an output row, or None to skip it.

    A paper is skipped when its year is missing or outside ``start``..``current``,
    when it has no title, when the title (or a non-empty abstract) is not
    detected as English, or when language detection fails.
    """
    year = paper.get("year")
    # The API may return an explicit null year; treat it like an out-of-range year
    if not isinstance(year, int) or not (start <= year <= current):
        return None

    title = paper.get("title") or ""
    abstract = paper.get("abstract") or ""
    if not title:
        return None  # Nothing to language-check

    try:
        if detect(title) != "en":
            return None
        if abstract and detect(abstract) != "en":
            return None
    except LangDetectException:
        return None  # Skip if language detection fails

    author_names = [
        author["name"]
        for author in (paper.get("authors") or [])
        if author and author.get("name")
    ]
    return {
        "Title": title,
        "Abstract": abstract or "N/A",
        "Authors": ", ".join(author_names),
        "Year": year,
        "URL": paper.get("url", "N/A"),
        "Citations": paper.get("citationCount", "N/A"),
        "Journal": (paper.get("journal") or {}).get("name", "N/A"),
        "Venue": paper.get("venue", "N/A"),
        "Publication Types": ", ".join(paper.get("publicationTypes", []) or []),
    }


def googleScholar(query, max_results=1000):
    """Fetch English-language papers matching *query* from Semantic Scholar.

    Despite its name, this function queries the Semantic Scholar Graph API
    paper search endpoint, restricted to publication years ``start``..``current``.

    Args:
        query: Search term sent to the API.
        max_results: Upper bound on the number of results requested from the
            API. The returned list can be shorter because papers that fail
            the year or language filter are dropped.

    Returns:
        A list of dicts with the keys Title, Abstract, Authors, Year, URL,
        Citations, Journal, Venue and Publication Types. On an API or network
        error the papers collected so far are returned.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"  # API endpoint
    headers = {"x-api-key": SS_API}  # API authentication header

    papers = []  # Rows that passed the filters
    offset = 0  # Number of raw results requested so far; also the next page offset

    while offset < max_results:
        params = {
            "query": query,  # Search query term
            "limit": min(100, max_results - offset),  # Request at most 100 results per page
            "offset": offset,  # Where the next batch of results starts
            "fields": "title,abstract,authors,year,url,citationCount,journal,venue,publicationTypes",  # Fields requested per paper
            "year": f"{start}-{current}",  # Fetch only from the last 20 years
        }

        try:
            response = _request_with_backoff(url, headers, params)
        except requests.RequestException as exc:
            print(f"Error fetching data: {exc}")
            break  # Keep what was collected so far

        # Checked after any retries, so a failed retry is never parsed as data
        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code} - {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("Error fetching data: response body was not valid JSON")
            break

        papers_fetched = data.get("data") or []
        if not papers_fetched:
            print(f"No more results for query: {query}")
            break

        for paper in papers_fetched:
            row = _paper_to_row(paper)
            if row is not None:
                papers.append(row)

        offset += len(papers_fetched)

        # The search response carries a "next" offset only while more results
        # exist; stop here instead of requesting a page the API will reject
        if data.get("next") is None:
            break

        # Respect API rate limits by adding a small delay between pages
        time.sleep(1)

    return papers

# Search terms for AI & ML research papers, organised by theme.
# Dict insertion order is preserved, so the flattened list keeps this order.
_QUERY_GROUPS = {
    "General AI & ML": [
        "artificial intelligence",
        "machine learning",
        "deep learning",
        "reinforcement learning",
        "supervised learning",
        "unsupervised learning",
        "semi-supervised learning",
        "self-supervised learning",
        "neural networks",
        "transformer models",
        "generative AI",
        "AI applications",
        "AI in healthcare",
        "AI in finance",
        "AI in robotics",
        "AI in education",
        "machine learning algorithms",
        "automated machine learning",
        "AI ethics",
        "explainable AI",
        "AI safety",
        "AI governance",
    ],
    "Core Algorithms & Techniques: Supervised": [
        "linear regression",
        "logistic regression",
        "decision trees",
        "random forest classifier",
        "support vector machines",
        "naive Bayes classifier",
        "k-nearest neighbors classifier",
        "gradient boosting machines",
        "XGBoost classifier",
        "LightGBM classifier",
        "CatBoost classifier",
        "ridge regression",
        "lasso regression",
        "elastic net regression",
        "multi-class classification",
        "ordinal regression",
        "ensemble methods in machine learning",
        "stacking ensemble learning",
        "bagging and boosting",
        "regression trees",
        "classification algorithms",
        "cross-validation techniques",
    ],
    "Core Algorithms & Techniques: Unsupervised": [
        "k-means clustering",
        "hierarchical clustering",
        "density-based clustering",
        "DBSCAN algorithm",
        "Gaussian mixture models",
        "mean shift clustering",
        "spectral clustering",
        "affinity propagation",
        "self-organizing maps",
        "principal component analysis PCA",
        "independent component analysis ICA",
        "t-SNE visualization",
        "UMAP dimensionality reduction",
        "autoencoders for representation learning",
        "deep clustering methods",
        "latent Dirichlet allocation LDA",
        "topic modeling",
        "anomaly detection",
        "outlier detection algorithms",
    ],
    "Deep Learning & Neural Architectures": [
        "convolutional neural networks",
        "recurrent neural networks",
        "long short-term memory",
        "transformers",
        "attention mechanisms",
        "vision transformers",
        "GANs generative adversarial networks",
        "BERT model",
        "GPT models",
        "diffusion models",
        "multi-modal learning",
        "zero-shot learning",
        "few-shot learning",
        "meta learning",
        "neural architecture search",
    ],
    "Libraries & Frameworks": [
        "TensorFlow machine learning",
        "PyTorch deep learning",
        "Scikit-learn algorithms",
        "Keras deep learning",
        "Hugging Face transformers",
        "JAX ML library",
        "ONNX AI models",
    ],
    "Evaluation, Fairness, and Interpretability": [
        "model evaluation in machine learning",
        "model interpretability",
        "model explainability",
        "fairness in machine learning",
        "bias in AI models",
        "AUC ROC evaluation",
        "precision recall tradeoff",
        "SHAP values",
        "LIME explainability",
    ],
    "AI Research Topics & Trends": [
        "foundation models",
        "large language models",
        "AI and climate change",
        "AI for social good",
        "neurosymbolic AI",
        "human-in-the-loop learning",
        "online learning",
        "continual learning",
        "federated learning",
        "privacy preserving machine learning",
        "causal inference in ML",
        "contrastive learning",
        "representation learning",
    ],
}

# Flat list of every search term, in group order
queries = [term for group in _QUERY_GROUPS.values() for term in group]


all_papers = []  # Rows collected across all queries

# Fetch papers for each search term and pool the results
for query in queries:
    papers = googleScholar(query, max_results=1000)
    all_papers.extend(papers)

# Convert fetched data to a Pandas DataFrame
df = pd.DataFrame(all_papers)

# Many search terms overlap, so the same paper can be returned by several
# queries; keep a single copy of each fully identical row
df = df.drop_duplicates().reset_index(drop=True)

# Save data as a CSV file (relative path, so it lands in the working directory)
file_name = "Google Scholar AI&ML Papers.csv"
df.to_csv(file_name, index=False)

# Hand the saved file to google.colab's `files.download` helper
files.download(file_name)

# Last expression of the cell: preview the first rows
df.head()

Error fetching data: 400 - {"error":"Requested data for this limit and/or offset is not available"}



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Title,Abstract,Authors,Year,URL,Citations,Journal,Venue,Publication Types
0,High-performance medicine: the convergence of ...,,E. Topol,2019,https://www.semanticscholar.org/paper/f134abea...,4135,Nature Medicine,Nature Network Boston,"Review, JournalArticle"
1,Explainable Artificial Intelligence (XAI): Con...,,"Alejandro Barredo Arrieta, Natalia Díaz Rodríg...",2019,https://www.semanticscholar.org/paper/530a059c...,5944,Inf. Fusion,Information Fusion,"JournalArticle, Review"
2,Explanation in Artificial Intelligence: Insigh...,,Tim Miller,2017,https://www.semanticscholar.org/paper/e89dfa30...,4120,Artif. Intell.,Artificial Intelligence,"JournalArticle, Review"
3,Sparks of Artificial General Intelligence: Ear...,Artificial intelligence (AI) researchers have ...,"Sébastien Bubeck, Varun Chandrasekaran, Ronen ...",2023,https://www.semanticscholar.org/paper/8dbd5746...,2854,ArXiv,arXiv.org,JournalArticle
4,Peeking Inside the Black-Box: A Survey on Expl...,At the dawn of the fourth industrial revolutio...,"Amina Adadi, M. Berrada",2018,https://www.semanticscholar.org/paper/21dff47a...,3709,IEEE Access,IEEE Access,"JournalArticle, Review"
