# Inputs

In [130]:
# Search query sent to the Semantic Scholar bulk-search endpoint.
search_query = '(LLM | LLMs | "Large Language Models") ("thematic analysis" | "qualitative analysis")'
# Gemini API keys, given as a list of strings, e.g. ["api_key_1", ...].
# The screening loop picks a key per batch (batch index modulo the list length) to
# rotate keys and avoid rate limits, so the list must hold at least one key.
gemini_api_keys = []
# Research objectives (interpolated into the screening prompt).
research_objectives = "To study the trends in automation in thematic analysis or qualitative analysis using LLMs"
# Criteria the LLM should apply when deciding whether to accept a paper (also interpolated into the prompt).
decision_criteria = "ONLY accept the papers that discuss using LLMs for aiding or automating the process of thematic analysis or qualitative analysis. Be very selective."

# Data collection

In [1]:
import requests

In [41]:
# Semantic Scholar Graph API endpoints used below: bulk keyword search and batch paper lookup.
bulk_search_url, bulk_paper_detail_url = (
    "https://api.semanticscholar.org/graph/v1/paper/" + endpoint
    for endpoint in ("search/bulk", "batch")
)

In [31]:
# Query-string parameters for the bulk search request.
params = dict(query=search_query)

In [33]:
# Run the bulk search. `papers_result` stays an empty dict on any failure so the
# following cells still run (with no papers) instead of hitting a NameError.
papers_result = {}
try:
    # Always pass a timeout: without one, requests can wait indefinitely on a stalled connection.
    response = requests.get(bulk_search_url, params=params, timeout=60)
except requests.RequestException as exc:
    print("Error:", exc)
else:
    if response.status_code == 200:
        papers_result = response.json()
        print("Success!")
        # The bulk endpoint pages its results; a continuation token means more
        # matches exist beyond this first page, which is the only one fetched here.
        if papers_result.get("token"):
            print("Warning: more results are available (continuation token returned); "
                  "only the first page was fetched.")
    else:
        print("Error:", response.status_code, response.text)

In [35]:
# Paper records from the search response; empty when the response carries no "data" key.
papers = papers_result["data"] if "data" in papers_result else []

In [47]:
# IDs to look up in detail. Records without a usable paperId are skipped so that
# no null ID is sent to the batch endpoint.
paper_ids = [paper["paperId"] for paper in papers if paper.get("paperId")]
# Fields requested for each paper. `openAccessPdf` is included because
# flatten_paper_data() reads it; without it that CSV column is always empty.
params = {
    "fields": 'url,title,abstract,publicationDate,fieldsOfStudy,journal,authors,tldr,citationCount,referenceCount,externalIds,openAccessPdf'
}

In [50]:
# Fetch detailed records for every collected ID.
# The batch endpoint accepts a limited number of IDs per request (500 per the
# Semantic Scholar API docs), so the IDs are posted in slices and the results
# concatenated. `detailed_result` is always a list, even when a request fails.
PAPER_BATCH_LIMIT = 500
detailed_result = []
if not paper_ids:
    print("No paper IDs to look up.")
for start in range(0, len(paper_ids), PAPER_BATCH_LIMIT):
    id_slice = paper_ids[start:start + PAPER_BATCH_LIMIT]
    response = requests.post(
        bulk_paper_detail_url,
        json={"ids": id_slice},
        params=params,
        timeout=60,  # avoid hanging forever on a stalled connection
    )
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        # Stop at the first failure; records fetched so far are kept.
        break
    detailed_result.extend(response.json())
else:
    # Reached only when every slice succeeded (no `break`).
    print("Success!")

Success!


In [52]:
# Show the first detailed record as a sanity check of the fields that came back.
detailed_result[0]

{'paperId': '005eff447a1d7f915ea4b48ca17f430c37745b90',
 'externalIds': {'ArXiv': '2305.12050',
  'DBLP': 'journals/pacmse/MuraliMABCGFNR24',
  'DOI': '10.1145/3643774',
  'CorpusId': 258832882},
 'url': 'https://www.semanticscholar.org/paper/005eff447a1d7f915ea4b48ca17f430c37745b90',
 'title': 'AI-Assisted Code Authoring at Scale: Fine-Tuning, Deploying, and Mixed Methods Evaluation',
 'abstract': 'Generative LLMs have been shown to effectively power AI-based code authoring tools that can suggest entire statements or blocks of code during code authoring. In this paper we present CodeCompose, an AI-assisted code authoring tool developed and deployed at Meta internally. CodeCompose is based on the InCoder LLM that merges generative capabilities with bi-directionality. We have scaled up CodeCompose to serve tens of thousands of developers at Meta, across 9 programming languages and several coding surfaces. We present our experience in making design decisions about the model and system ar

# AI screening

In [94]:
from google import genai
from google.genai import types
import os
import time
import json
import random
import csv

In [95]:
def chunked(iterable, size):
    """Yield consecutive slices of ``iterable`` holding at most ``size`` items each.

    ``iterable`` must support ``len()`` and slicing (list, tuple, str, ...);
    the final slice is shorter when the length is not a multiple of ``size``.
    """
    offsets = range(0, len(iterable), size)
    yield from (iterable[begin:begin + size] for begin in offsets)

def build_prompt(papers, objectives=None, criteria=None):
    """Build the screening prompt for one batch of papers.

    Args:
        papers: Sequence of paper dicts; 'title', 'abstract' and 'tldr' are read.
            Missing or null values are rendered as 'N/A'.
        objectives: Research objectives text. Defaults to the notebook-level
            ``research_objectives``.
        criteria: Acceptance criteria text. Defaults to the notebook-level
            ``decision_criteria``.

    Returns:
        The prompt string. It asks for a JSON list with one object per paper,
        in the same order as ``papers``.
    """
    if objectives is None:
        objectives = research_objectives
    if criteria is None:
        criteria = decision_criteria

    # State the real batch size; a hard-coded count contradicts the papers
    # actually listed whenever the batch size differs from it.
    paper_count = len(papers)
    prompt = (
        f"""You are a research assistant helping screen academic papers for a specific research goal.\n
        Here are {paper_count} papers. For each paper, decide whether it aligns with the research objectives. 
        Research objectives are: {objectives}
        You should return your decision as true if it meets the following criteria: {criteria}
        Give a general overview ('thoughts'), a binary decision, whether to include the paper to study or not ('decision' = true/false), and an optional note.\n\n
        """
    )
    for idx, paper in enumerate(papers, start=1):
        # 'tldr' may be present but null, so check the type before reading 'text'.
        tldr = paper.get('tldr')
        tldr_text = tldr.get('text') if isinstance(tldr, dict) else None
        prompt += (
            f"Paper {idx}:\n"
            f"Title: {paper.get('title') or 'N/A'}\n"
            f"Abstract: {paper.get('abstract') or 'N/A'}\n\n"
            f"tldr: {tldr_text or 'N/A'}\n\n"
        )
    prompt += (
        f"Respond with a JSON list of {paper_count} objects, each with keys: 'thoughts', 'decision', and 'note', "
        "in the same order as the papers."
    )
    return prompt

In [110]:
# Path of the CSV that receives the LLM screening decisions.
# NOTE: CSV_FILE is rebound to the papers CSV in the "Merging Data" section, so
# cells that read it depend on being run in top-to-bottom order.
CSV_FILE = "/kaggle/working/screened_papers.csv"

In [102]:
# Resume support: collect the IDs already present in the screening CSV so the
# screening loop can skip them.
if os.path.exists(CSV_FILE):
    with open(CSV_FILE, newline='', encoding='utf-8') as existing_file:
        processed_ids = set(record["paperId"] for record in csv.DictReader(existing_file))
else:
    processed_ids = set()

In [103]:
MAX_RETRIES = 5
INITIAL_BACKOFF = 2  # seconds; doubled after every failed attempt
SCREENING_BATCH_SIZE = 3  # papers sent to the model per request
SCREENING_MODEL = "gemini-2.0-flash"

# Papers still to be screened. None placeholders in the lookup results are skipped
# because they cannot be indexed by 'paperId'.
unprocessed = [
    paper for paper in detailed_result
    if paper is not None and paper['paperId'] not in processed_ids
]

# Fail fast with a clear message: with no keys, the key-rotation modulo below would
# raise ZeroDivisionError on every attempt and only surface as retry noise.
if unprocessed and not gemini_api_keys:
    raise ValueError("gemini_api_keys is empty; provide at least one Gemini API key in the Inputs section.")

# Structured-output config: a JSON array of {thoughts, decision, note} objects.
# It does not depend on the batch, so it is built once.
generate_content_config = types.GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=genai.types.Schema(
        type=genai.types.Type.ARRAY,
        items=genai.types.Schema(
            type=genai.types.Type.OBJECT,
            required=["thoughts", "decision"],
            properties={
                "thoughts": genai.types.Schema(
                    type=genai.types.Type.STRING,
                ),
                "decision": genai.types.Schema(
                    type=genai.types.Type.BOOLEAN,
                ),
                "note": genai.types.Schema(
                    type=genai.types.Type.STRING,
                ),
            },
        ),
    ),
)

with open(CSV_FILE, "a", newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["paperId", "thoughts", "decision", "note"])
    # Write the header only when the file is new/empty (append mode may be resuming).
    if os.stat(CSV_FILE).st_size == 0:
        writer.writeheader()

    for i, batch in enumerate(chunked(unprocessed, SCREENING_BATCH_SIZE)):
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                # Rotate through the API keys, one key per batch.
                key_index = i % len(gemini_api_keys)
                genai_client = genai.Client(api_key=gemini_api_keys[key_index])
                contents = [
                    types.Content(
                        role="user",
                        parts=[
                            types.Part(text=build_prompt(batch)),
                        ],
                    ),
                ]

                result = genai_client.models.generate_content(
                    model=SCREENING_MODEL,
                    contents=contents,
                    config=generate_content_config,
                )

                parsed_result = result.text
                response_json = json.loads(parsed_result) if isinstance(parsed_result, str) else parsed_result

                # zip() would silently drop papers if the model returned too few
                # items, so a length mismatch is treated as a failed attempt.
                if not isinstance(response_json, list) or len(response_json) != len(batch):
                    raise ValueError(
                        f"expected a list of {len(batch)} decisions, got: {response_json!r}"
                    )

                # Build every row before writing any, so a malformed item cannot leave
                # a half-written batch that the retry would then duplicate.
                rows = [
                    {
                        "paperId": paper['paperId'],
                        "thoughts": response.get("thoughts", ""),
                        "decision": response.get("decision", ""),
                        "note": response.get("note", ""),
                    }
                    for paper, response in zip(batch, response_json)
                ]
            except Exception as e:
                print(f"Error in batch {i + 1}, retrying ({attempt}/{MAX_RETRIES})...")
                print(f"Exception: {e}")
                # Exponential backoff with jitter; no point sleeping after the last attempt.
                if attempt < MAX_RETRIES:
                    time.sleep(INITIAL_BACKOFF * (2 ** (attempt - 1)) + random.uniform(0, 1))
            else:
                writer.writerows(rows)
                csvfile.flush()  # persist progress batch by batch
                print(f"Batch {i + 1} processed and saved.")
                break  # exit retry loop
        else:
            # All attempts failed: report it instead of dropping the batch silently.
            print(f"Batch {i + 1} failed after {MAX_RETRIES} attempts; its papers were NOT saved.")

Batch 1 processed and saved.
Batch 2 processed and saved.
Batch 3 processed and saved.
Batch 4 processed and saved.
Batch 5 processed and saved.
Batch 6 processed and saved.
Batch 7 processed and saved.
Batch 8 processed and saved.
Batch 9 processed and saved.
Batch 10 processed and saved.
Batch 11 processed and saved.
Batch 12 processed and saved.
Batch 13 processed and saved.
Batch 14 processed and saved.
Batch 15 processed and saved.
Batch 16 processed and saved.
Batch 17 processed and saved.
Batch 18 processed and saved.
Batch 19 processed and saved.
Batch 20 processed and saved.
Batch 21 processed and saved.
Batch 22 processed and saved.
Batch 23 processed and saved.
Batch 24 processed and saved.
Batch 25 processed and saved.
Batch 26 processed and saved.
Batch 27 processed and saved.
Batch 28 processed and saved.
Batch 29 processed and saved.
Batch 30 processed and saved.
Batch 31 processed and saved.
Batch 32 processed and saved.
Batch 33 processed and saved.
Batch 34 processed 

In [78]:
import pandas as pd

In [111]:
# Load the CSV that CSV_FILE currently points at. That is screened_papers.csv only
# if this cell runs before the "Merging Data" cell rebinds CSV_FILE.
# NOTE(review): the saved preview below shows columns ('Unnamed: 0', title, abstract)
# that the screening loop's writer (paperId, thoughts, decision, note) does not
# produce, so it presumably came from a different file or an earlier run. Confirm
# on a fresh kernel.
df1 = pd.read_csv(CSV_FILE)

In [113]:
# Preview the first rows of the loaded screening results.
df1.head()

Unnamed: 0,paperId,title,abstract,thoughts,decision,note
0,005eff447a1d7f915ea4b48ca17f430c37745b90,AI-Assisted Code Authoring at Scale: Fine-Tuni...,Generative LLMs have been shown to effectively...,This paper uses thematic analysis to understan...,True,Thematic analysis is used as an evaluation met...
1,023b852cafd6bcc8dcb0f60e2513a597c0b3c793,"“Here the GPT made a choice, and every choice ...",,This paper discusses students' critical engage...,False,Thematic analysis is not the focus.
2,02bcbd9c4e3d3dc2ccad4c61c27e21a50c765f62,LLMCode: Evaluating and Enhancing Researcher-A...,The use of large language models (LLMs) in qua...,"The paper introduces LLMCode, a tool to assess...",True,Focuses on AI assistance in qualitative analysis.
3,02ce9267dbfa1df73b0a1b1e66f5ce6697d5e3b8,Small Language Models can Outperform Humans in...,"In this paper, we evaluate the creative fictio...",This paper compares creative writing abilities...,False,No thematic analysis involved.
4,0364dc93966e4e81453aa3886ff5813327af01c4,The use of large language models for qualitati...,Machine-assisted approaches for free-text anal...,"This paper presents DECOTA, a novel machine le...",True,Directly automates thematic analysis.


# Merging Data

In [120]:
import csv
import os

def _nested_value(paper, key, subkey):
    """Return ``paper[key][subkey]`` when ``paper[key]`` is a dict, otherwise ''."""
    container = paper.get(key)
    if isinstance(container, dict):
        return container.get(subkey, '')
    return ''


def flatten_paper_data(paper):
    """Flatten one paper record (nested dict) into a single-level dict of CSV columns.

    Args:
        paper: Dict with optional keys 'paperId', 'title', 'abstract', 'url',
            'referenceCount', 'citationCount', 'publicationDate', 'externalIds',
            'journal', 'openAccessPdf', 'authors', 'tldr' and 'fieldsOfStudy'.

    Returns:
        Dict with the keys paperId, title, abstract, url, referenceCount,
        citationCount, publicationDate, DOI, journal_name, journal_pages,
        journal_volume, openAccessPdf_url, authors, tldr_text, fieldsOfStudy.
        Nested values that are missing or not of the expected type become ''.
        Author names and fields of study are joined with '; '.
    """
    flat_data = {
        'paperId': paper.get('paperId', ''),
        'title': paper.get('title', ''),
        'abstract': paper.get('abstract', ''),
        'url': paper.get('url', ''),
        'referenceCount': paper.get('referenceCount', 0),
        'citationCount': paper.get('citationCount', 0),
        'publicationDate': paper.get('publicationDate', ''),
    }

    # Nested objects: each may be absent or null, in which case '' is used.
    flat_data['DOI'] = _nested_value(paper, 'externalIds', 'DOI')
    flat_data['journal_name'] = _nested_value(paper, 'journal', 'name')
    flat_data['journal_pages'] = _nested_value(paper, 'journal', 'pages')
    flat_data['journal_volume'] = _nested_value(paper, 'journal', 'volume')
    flat_data['openAccessPdf_url'] = _nested_value(paper, 'openAccessPdf', 'url')

    # Authors: combine names into one string. Entries that are not dicts, or whose
    # name is missing/None/empty, are skipped -- a None name would make str.join
    # raise TypeError, and an empty one would leave a stray separator.
    authors_value = paper.get('authors')
    author_names = []
    if isinstance(authors_value, list):
        for author_entry in authors_value:
            name = author_entry.get('name') if isinstance(author_entry, dict) else None
            if name:
                author_names.append(str(name))
    flat_data['authors'] = '; '.join(author_names)

    flat_data['tldr_text'] = _nested_value(paper, 'tldr', 'text')

    # fieldsOfStudy may be a list, a single string, or missing/null.
    fields_value = paper.get('fieldsOfStudy')
    if isinstance(fields_value, list):
        flat_data['fieldsOfStudy'] = '; '.join(str(item) for item in fields_value if item is not None)
    elif isinstance(fields_value, str):
        flat_data['fieldsOfStudy'] = fields_value
    else:
        flat_data['fieldsOfStudy'] = ''

    return flat_data

def write_papers_to_csv(papers, csv_file, skip_existing=True):
    """Append flattened paper records to ``csv_file``.

    The header row is written when the file does not exist yet or is empty.

    Args:
        papers: Iterable of paper dicts (passed to ``flatten_paper_data``).
            ``None`` items are skipped with a warning.
        csv_file: Path of the CSV file to append to.
        skip_existing: When True (default), a paper whose non-empty paperId is
            already in the file, or appeared earlier in ``papers``, is not written
            again. This keeps re-running the cell from duplicating rows. Pass
            False to append every record unconditionally.
    """
    fieldnames = [
        'paperId', 'title', 'abstract', 'url', 'referenceCount', 'citationCount',
        'publicationDate', 'DOI', 'journal_name', 'journal_pages', 'journal_volume',
        'openAccessPdf_url',
        'authors', 'tldr_text', 'fieldsOfStudy'
    ]

    needs_header = not os.path.isfile(csv_file) or os.stat(csv_file).st_size == 0

    # IDs already stored in the file; only needed when de-duplicating.
    seen_ids = set()
    if skip_existing and not needs_header:
        with open(csv_file, newline='', encoding='utf-8') as existing_file:
            for row in csv.DictReader(existing_file):
                existing_id = row.get('paperId')
                if existing_id:
                    seen_ids.add(existing_id)

    with open(csv_file, "a", newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore' drops any flattened keys that are not listed in fieldnames.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')

        if needs_header:
            writer.writeheader()

        for paper_item in papers:
            if paper_item is None:
                print("Warning: Found a None item in the papers list, skipping.")
                continue
            flat_data = flatten_paper_data(paper_item)
            paper_id = flat_data.get('paperId')
            # Records without an ID cannot be de-duplicated, so they are always written.
            if skip_existing and paper_id:
                paper_id = str(paper_id)  # IDs read back from the CSV are strings
                if paper_id in seen_ids:
                    continue
                seen_ids.add(paper_id)
            writer.writerow(flat_data)
            
# Destination for the flattened paper metadata. This rebinds CSV_FILE, which
# earlier pointed at the screening CSV.
CSV_FILE = "/kaggle/working/papers.csv"
# Guard against running this cell before the detailed lookup has populated
# `detailed_result` (a notebook cell's local namespace is the global one).
if 'detailed_result' not in globals():
    print("Error: 'detailed_result' is not defined. Please ensure it contains your paper data.")
else:
    write_papers_to_csv(detailed_result, CSV_FILE)
    print(f"Data written to {CSV_FILE}")

Data written to /kaggle/working/papers.csv


In [127]:
# Load the paper metadata CSV and join it to the screening decisions on paperId.
# The inner join keeps only papers that appear in both tables.
df2 = pd.read_csv(CSV_FILE)

merged_df = df1.merge(df2, how='inner', on=['paperId'])

In [128]:
# Save the merged table without the DataFrame index column. The relative path
# resolves against the kernel's current working directory.
merged_df.to_csv("output.csv", index = False)

In [None]:
# Display the merged table.
merged_df