## Installations

In [None]:
# !pip install pdfplumber python-dotenv tqdm google-generativeai
# You may also need to run Python in a venv with ipykernel installed and select it as the notebook's interpreter for this to work:
# python3 -m venv pyenv
# source pyenv/bin/activate
# pip install ipykernel jupyter

# You must store the API key in the .env file (see .env.example). You can create a key here: https://aistudio.google.com/apikey

## Imports

In [None]:
import os
import sys
import json
from pathlib import Path
import pdfplumber
from tqdm import tqdm
import openai
from dotenv import load_dotenv
import re

sys.path.append(os.path.abspath('..'))
from services import extraxt_service

## Loading env variables & Folders

In [9]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# NOTE(review): the Gemini key is assigned to the `openai` client here, but no
# cell visible in this notebook calls `openai` — confirm this is still needed.
openai.api_key = os.environ.get("GEMINI_API_KEY")

In [10]:
from pathlib import Path

# Every data folder is resolved relative to the parent of the working directory.
PROJECT_ROOT = Path("..")
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw_data")        # globbed for *.pdf inputs
GT_DIR = PROJECT_ROOT.joinpath("data", "ground_truth")     # receives extracted abstracts
GEN_DIR = PROJECT_ROOT.joinpath("data", "generated_data")  # receives generated summaries

## GENAI function for summarization

In [11]:
import google.generativeai as genai
import os

# Hand the Gemini API key from the environment to the google.generativeai SDK.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

def summarize_with_gemini(text, model_name="gemini-1.5-flash-latest"):
    """Summarize the body of an academic article with a Gemini model.

    Args:
        text: Article text; the prompt tells the model that the abstract and
            references have already been removed.
        model_name: Model identifier passed to ``genai.GenerativeModel``.
            Defaults to the model this notebook originally hard-coded.

    Returns:
        The ``text`` attribute of the response returned by
        ``generate_content``.

    Raises:
        ValueError: If ``text`` is an empty or whitespace-only string, so that
            no request is sent for an article with no usable body.
    """
    # Without this guard the model would be asked to summarize nothing and its
    # invented answer would be stored as if it were a real summary.
    if isinstance(text, str) and not text.strip():
        raise ValueError("Cannot summarize empty text")

    prompt = (
        "You will be given an academic article with the abstract and references removed. "
        "Summarize it in 4–5 concise sentences focusing on key contributions, methods, or findings:\n\n"
        + text
    )

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text


## Loop through all files, extract the abstract, run the summarization prompt, and store the results

In [12]:
pdf_files = list(RAW_DIR.glob("*.pdf"))

# Create the output folders up front so the JSON writes below do not fail
# with FileNotFoundError when the folders do not exist yet.
for out_dir in (GT_DIR, GEN_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

for pdf_path in tqdm(pdf_files):
    file_stem = pdf_path.stem

    try:
        # Keep the PDF open only while reading it; text extraction runs once
        # per page, and pages that yield no text are dropped.
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            pages = [page_text for page_text in page_texts if page_text]

        full_text = "\n".join(pages)
        # The first page that produced text is used to look for the abstract.
        first_page_text = pages[0] if pages else ""

        abstract = extraxt_service.extract_abstract_from_first_page(first_page_text)
        if not abstract:
            print(f"Abstract not found in {file_stem}, skipping...")
            continue

        clean_text = extraxt_service.clean_body_text(full_text)

        summary = summarize_with_gemini(clean_text)

        # Files are written only after the summary succeeded, so a failed
        # request does not leave an abstract without its matching summary.
        with open(GT_DIR / f"{file_stem}.json", "w", encoding="utf-8") as f:
            json.dump({"abstract": abstract}, f, indent=2)

        with open(GEN_DIR / f"{file_stem}.json", "w", encoding="utf-8") as f:
            json.dump({"summary": summary}, f, indent=2)

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {file_stem}: {e}")


100%|██████████| 1/1 [00:01<00:00,  1.90s/it]
