# Input

In [None]:
# Path to the BibTeX file whose entries will be screened.
BIB_FILE = "/kaggle/input/acm-thematic/acm.bib"
# Gemini API keys, given as strings inside the list. Supplying several keys lets the
# screening loop rotate between them to avoid rate limits.
gemini_api_keys = []
# Research objectives; this text is inserted into the screening prompt.
research_objectives = "To study the trends in automation in thematic analysis or qualitative analysis using LLMs"
# Criteria the LLM should apply when deciding whether to accept a paper; this text is
# also inserted into the screening prompt.
decision_criteria = "Only accept papers where LLMs or AI tools are directly used for aiding or conducting thematic analysis or qualitative analysis"

# Loading data

In [None]:
!pip install bibtexparser -q

In [None]:
import bibtexparser
import pandas as pd

In [None]:
# Parse the BibTeX file configured in BIB_FILE.
with open(BIB_FILE, encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

# Parsed records; the cells below treat each one as a dict with an 'ID' key and read
# its 'title', 'abstract' and 'keywords' fields when present.
entries = bib_database.entries

In [None]:
# Tabular view of the parsed BibTeX entries; merged with the screening results at the
# end of the notebook.
df1 = pd.DataFrame(entries)

# AI Screening

In [None]:
from google import genai
from google.genai import types
import os
import time
import json
import random
import csv

In [None]:
def chunked(iterable, size):
    """Yield consecutive slices of ``iterable`` holding at most ``size`` items.

    Args:
        iterable: A sliceable sequence that supports ``len()`` (list, tuple,
            str, ...). Despite the parameter name, one-shot iterators and
            generators are not supported.
        size: Maximum number of items per chunk; must be a positive integer.

    Yields:
        Slices of ``iterable`` in their original order. The final slice may
        hold fewer than ``size`` items.

    Raises:
        ValueError: If ``size`` is not positive. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if size <= 0:
        # A negative step makes range() empty, which would silently drop every
        # item instead of reporting the bad argument.
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(iterable), size):
        yield iterable[start:start + size]

def build_prompt(papers, *, objectives=None, criteria=None):
    """Build the screening prompt for one batch of papers.

    Args:
        papers: Iterable of BibTeX entries. Entries that are dicts contribute
            a "Paper N" section built from their 'title', 'abstract' and
            'keywords' fields ('N/A' when a field is missing). Non-dict
            entries are skipped, but still consume a number, so N always
            matches the entry's 1-based position in ``papers``.
        objectives: Research objectives to embed in the prompt. Defaults to
            the module-level ``research_objectives``.
        criteria: Acceptance criteria to embed in the prompt. Defaults to the
            module-level ``decision_criteria``.

    Returns:
        The complete prompt: instructions, one section per paper, and a
        closing request for a JSON list with 'thoughts', 'decision' and
        'note' keys in the same order as the papers.
    """
    # Resolve the globals lazily so callers (and tests) can pass their own text.
    if objectives is None:
        objectives = research_objectives
    if criteria is None:
        criteria = decision_criteria

    # The leading spaces, the trailing space after "criteria." and the run of
    # newlines are intentional: they reproduce the prompt text used so far, so
    # results stay comparable with papers that were already screened.
    header = (
        "You are a research assistant helping screen academic papers for a specific research goal.\n\n"
        "        Here are a few papers. For each paper, decide whether it aligns with the research objectives and meets the criteria. \n"
        f"        Research objectives are: {objectives}\n"
        f"        You should return your decision as true if it meets the following criteria: {criteria}\n"
        "        Give a general overview ('thoughts'), a binary decision, whether to include the paper to study or not ('decision' = true/false), and an optional note.\n\n\n"
        "        "
    )

    paper_sections = [
        (
            f"Paper {idx}:\n"
            f"Title: {paper.get('title', 'N/A')}\n"
            f"Abstract: {paper.get('abstract', 'N/A')}\n\n"
            f"keywords: {paper.get('keywords', 'N/A')}\n\n"
        )
        for idx, paper in enumerate(papers, start=1)
        if isinstance(paper, dict)
    ]

    footer = (
        "Respond with a JSON list of objects, each with keys: 'thoughts', 'decision', and 'note', "
        "in the same order as the papers."
    )
    return header + "".join(paper_sections) + footer


In [None]:
# CSV holding the screening results. The screening loop appends to it, and it is read
# back both to skip already-screened papers and for the final merge.
CSV_FILE = "/kaggle/working/screened_papers.csv"

In [None]:
# Resume support: collect the IDs already present in the results CSV so the
# screening loop can skip them. The set stays empty when the file does not exist.
processed_ids = set()
if os.path.exists(CSV_FILE):
    with open(CSV_FILE, newline='', encoding='utf-8') as results_file:
        for record in csv.DictReader(results_file):
            processed_ids.add(record["ID"])

In [None]:
MAX_RETRIES = 5       # attempts per batch before giving up on it
INITIAL_BACKOFF = 2   # seconds; doubled after every failed attempt
BATCH_SIZE = 3        # papers sent to the model per request

model = "gemini-2.0-flash"

# Papers that do not yet have a row in the results CSV.
unprocessed = [paper for paper in entries if paper['ID'] not in processed_ids]

# Fail fast: without keys every request would die on `i % 0`, be swallowed by the
# retry handler, and each batch would sleep through all its back-offs for nothing.
# Only enforced when there is work to do, so a fully screened run still proceeds.
if unprocessed and not gemini_api_keys:
    raise ValueError(
        "gemini_api_keys is empty; add at least one Gemini API key before screening."
    )

# One client per key, created once instead of on every attempt.
genai_clients = [genai.Client(api_key=key) for key in gemini_api_keys]

# Structured output: a JSON array with one object per paper. 'thoughts' and
# 'decision' are required, 'note' is optional. Identical for every request.
generate_content_config = types.GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=types.Schema(
        type=types.Type.ARRAY,
        items=types.Schema(
            type=types.Type.OBJECT,
            required=["thoughts", "decision"],
            properties={
                "thoughts": types.Schema(type=types.Type.STRING),
                "decision": types.Schema(type=types.Type.BOOLEAN),
                "note": types.Schema(type=types.Type.STRING),
            },
        ),
    ),
)

with open(CSV_FILE, "a", newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["ID", "thoughts", "decision", "note"])
    # Append mode: only a brand-new (empty) file needs the header row.
    if os.stat(CSV_FILE).st_size == 0:
        writer.writeheader()

    for i, batch in enumerate(chunked(unprocessed, BATCH_SIZE)):
        # The prompt does not change between attempts, so build it once per batch.
        prompt_text = build_prompt(batch)
        contents = [
            types.Content(
                role="user",
                parts=[
                    types.Part(text=prompt_text),
                ],
            ),
        ]

        for attempt in range(MAX_RETRIES):
            # Rotate keys across batches AND across retries, so a rate-limited key
            # is not hit again straight away when other keys are available.
            genai_client = genai_clients[(i + attempt) % len(genai_clients)]
            try:
                result = genai_client.models.generate_content(
                    model=model,
                    contents=contents,
                    config=generate_content_config,
                )

                parsed_result = result.text
                response_json = json.loads(parsed_result) if isinstance(parsed_result, str) else parsed_result

                # Answers are matched to papers by position, so a wrong count would
                # drop papers or attach verdicts to the wrong paper; retry instead.
                if not isinstance(response_json, list) or len(response_json) != len(batch):
                    raise ValueError(
                        f"expected a JSON list of {len(batch)} results, got: {parsed_result!r}"
                    )

                # Build every row before writing any, so a malformed item cannot
                # leave a half-written batch that a retry would then duplicate.
                rows = [
                    {
                        "ID": paper['ID'],
                        "thoughts": response.get("thoughts", ""),
                        "decision": response.get("decision", ""),
                        "note": response.get("note", ""),
                    }
                    for paper, response in zip(batch, response_json)
                ]
            except Exception as e:
                # Deliberately broad: any API, parsing or validation failure
                # triggers a retry of this batch.
                print(f"Error in batch {i + 1}, retrying ({attempt + 1}/{MAX_RETRIES})...")
                print(f"Exception: {e}")
                if attempt + 1 < MAX_RETRIES:
                    # Exponential back-off with jitter; skipped after the final
                    # attempt because nothing follows it.
                    time.sleep(INITIAL_BACKOFF * (2 ** attempt) + random.uniform(0, 1))
            else:
                for row in rows:
                    writer.writerow(row)
                # Flush per batch so completed work survives an interrupted run.
                csvfile.flush()
                print(f"Batch {i + 1} processed and saved.")
                break  # exit retry loop
        else:
            # Every attempt failed: nothing was written for this batch.
            print(
                f"Batch {i + 1} failed after {MAX_RETRIES} attempts; "
                "its papers were not written to the CSV."
            )

# Merging Data

In [None]:
# Load the screening results written by the screening loop.
# Read ID as text: BibTeX keys made only of digits would otherwise be inferred as
# numbers (losing leading zeros) and would then fail to match df1's string IDs
# in the merge below.
df2 = pd.read_csv(CSV_FILE, dtype={"ID": str})

# Preview the first rows as the cell output.
df2.head()

In [None]:
# Join the bibliographic fields (df1) with the screening verdicts (df2) on the
# BibTeX key. The inner join keeps only IDs that appear in both tables.
merged_df = df1.merge(df2, how="inner", on="ID")
# Write the combined table to output.csv without the index column.
merged_df.to_csv("output.csv", index=False)

In [None]:
# Show the merged table as the cell output.
merged_df