In [25]:
# Import basic libraries
import pandas as pd
import json
import os
from PyPDF2 import PdfReader # Library to read PDFs
from PyPDF2.errors import PdfReadError
from IPython.display import display, Markdown
import time
import re
import shutil

# Import Google libraries
from google import genai
from google.genai import types
from IPython.display import Markdown


In [4]:
!pip install -qU google-generativeai python-dotenv pypdf2


# Setting up model

In [2]:
# Import the setup function for the Google GenAI client.
# It is expected to live in a separate file named 'google_setup.py' next to this notebook.
try:
    # The setup cell uses the return value of setup_genai_client(model_name=...)
    # as a single callable: generate_text(prompt=...).
    from google_setup import setup_genai_client
    print("Successfully imported setup_genai_client.")
except ImportError:
    print("ERROR: Could not import setup_genai_client.")
    print("Make sure 'google_setup.py' (revised version) is in the same directory.")

    def setup_genai_client(model_name="dummy"):
        """
        Offline stand-in used only when google_setup cannot be imported.

        Args:
            model_name (str): Accepted for signature compatibility; ignored.

        Returns:
            function: A generate_text(prompt, config=None) callable that returns a
                      placeholder string instead of calling the API. A single
                      callable is returned (not a tuple) because the setup cell
                      checks the result with callable().
        """
        print("WARNING: Using dummy setup function. API calls and counts will not work.")

        def dummy_generate_text(prompt, config=None):
            # Echo the start of the prompt so downstream code has a string to work with.
            return f"Dummy response for prompt: {prompt[:50]}..."

        return dummy_generate_text

Successfully imported setup_genai_client.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Model pricing reference (informational; cost estimates assume 301.5k input tokens and 3k output tokens)
"""
Model	Input Price / 1M Tokens (USD)	Output Price / 1M Tokens (USD)	Estimated Input Cost (USD) (301.5k tokens)	Estimated Output Cost (USD) (3k tokens)	Total Estimated Cost (USD)
Gemini 2.5 Pro Preview	$1.25	$10.00	(0.3015 * 1.25) = $0.3769	(0.003 * 10.00) = $0.0300	~$0.4069
Gemini 2.0 Flash	$0.10	$0.40	(0.3015 * 0.10) = $0.0302	(0.003 * 0.40) = $0.0012	~$0.0314
Gemini 2.0 Flash-Lite	$0.075	$0.30	(0.3015 * 0.075) = $0.0226	(0.003 * 0.30) = $0.0009	~$0.0235
Gemini 1.5 Flash	$0.075	$0.30	(0.3015 * 0.075) = $0.0226	(0.003 * 0.30) = $0.0009	~$0.0235
Gemini 1.5 Flash-8B	$0.0375	$0.15	(0.3015 * 0.0375) = $0.0113	(0.003 * 0.15) = $0.0005	~$0.0118
Gemini 1.5 Pro	$1.25	$5.00	(0.3015 * 1.25) = $0.3769	(0.003 * 5.00) = $0.0150	~$0.3919
"""

In [5]:
# Configuration: article path and Gemini model identifier
# Adjust both values to match the local environment
PDF_FILE_PATH = "Articles/1-s2.0-S0959475223000166-main.pdf" # <--- CONFIRM THIS PATH IS CORRECT
MODEL_NAME = "gemini-2.0-flash" # Model identifier handed to setup_genai_client

# Build the text generation callable; any failure leaves generate_text as None
try:
    generate_text = setup_genai_client(model_name=MODEL_NAME)
except ValueError as err: # Typically signals a missing or invalid API key
    print(f"FATAL: Could not set up Gemini Client (check API KEY?). Error: {err}")
    generate_text = None
except Exception as err:
    print(f"FATAL: Could not set up Gemini Client. Exiting. Error: {err}")
    generate_text = None
else:
    print(f"Gemini client setup complete using model: {MODEL_NAME}")
    # The setup call can "succeed" yet hand back something that cannot be called
    if not callable(generate_text):
        print("Warning: setup_genai_client did not return a callable function.")
        generate_text = None

# Report the outcome so later cells are not run against a broken client
if generate_text is not None:
    print("\nSetup successful. Ready to process PDF.")
else:
    print("\n*** Setup failed. Cannot proceed with API calls. Please check errors above. ***")

Successfully initialized model: gemini-2.0-flash
Gemini client setup complete using model: gemini-2.0-flash

Setup successful. Ready to process PDF.


In [9]:
# Helper that pulls the raw text out of a PDF file
def extract_text_from_pdf(pdf_path):
    """
    Read every page of a PDF with PyPDF2 and return the combined text.

    Pages are processed in order. Each page that yields text contributes that
    text followed by a blank-line separator; pages that yield nothing, or whose
    extraction raises, are skipped (the latter with a printed warning).

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Text of all readable pages joined together, or an empty string
             when no page produced any text.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
        PdfReadError: If PyPDF2 cannot read the PDF structure.
        Exception: Any other error raised while opening or iterating the PDF
                   is printed and re-raised.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"Error: PDF file not found at {pdf_path}")

    print(f"Reading PDF: {os.path.basename(pdf_path)}...")
    page_chunks = []
    try:
        pdf_reader = PdfReader(pdf_path)
        print(f"Found {len(pdf_reader.pages)} pages.")
        for page_number, pdf_page in enumerate(pdf_reader.pages, start=1):
            try:
                extracted = pdf_page.extract_text()
                if extracted:
                    # Blank line marks the page boundary
                    page_chunks.append(extracted + "\n\n")
            except Exception as page_error:
                # A single bad page must not abort the whole document
                print(f"Warning: Could not extract text from page {page_number}. Error: {page_error}")
        print("Finished text extraction attempt.")
        return "".join(page_chunks)
    except PdfReadError as e:
        # Structural problems (corruption, encryption, ...) make the file unusable
        print(f"Error reading PDF structure (maybe corrupted, encrypted, or non-standard): {e}")
        raise
    except Exception as e:
        # Anything else that went wrong while opening or walking the PDF
        print(f"An unexpected error occurred during PDF reading initialization: {e}")
        raise


# Prompts

In [None]:
# General instructions for the model. The batch cell passes this as `system_prompt`
# to process_single_pdf, which prepends it to every field-specific prompt.
OVERARCHING_PROMPT = """You are an academic data extractor assistant. Your task is to extract specific information from the provided text of an academic article based on the user's prompt.
Rules:
1. Respond **only** with the extracted information requested by the prompt. Do **not** include any introductory phrases, explanations, or conversational text like 'Here is the title:', 'The authors are:', 'Based on the text:', 'The answer is:', etc.
2. Use **British English** spelling throughout your response (e.g., 'analyse', 'centre', 'behaviour').
3. Specifically use the word '**competences**' instead of 'competencies' whenever the concept arises.
4. Provide plain text answers. Do not use markdown formatting like bolding or bullet points unless the extracted text itself contains them."""

# Field name -> task prompt. process_single_pdf issues one API call per entry and
# stores the answer under the same key; the batch cell uses the keys as Excel columns.
# Entries that are commented out are disabled and produce no API call and no column.
# NOTE(review): the batch cell reads the "Year" field when building the new file name;
# while "Year" is disabled here, that lookup falls back to "UnknownYear".
prompts = {
    "Title": "Directly state the exact title of the study from the provided text.",
    #"Author": "List the full names of the authors for this study from the provided text, separated by commas or as listed.",
    "Abstract": "Provide the complete abstract of the study from the provided text.",
    "Journal": "Directly state the name of the journal, working paper series, or other publication source mentioned in the text.",
    #"Country": "List the specific EU country or countries (excluding EFTA members Iceland, Liechtenstein, Norway, Switzerland) where the study was conducted, as mentioned in the text. If multiple, separate by commas. If none mentioned or only EFTA countries, state 'None EU specified'.",
    #"Year": "Directly state the year of publication extracted from the text.",
    #"Educational Level": "Directly state the most specific educational level(s) targeted or studied (e.g., ECEC/Preschool, Primary, Secondary, Tertiary/Higher Education).",
    "Method": """Identify the study's core methodology based on whether it uses a counterfactual design aiming to estimate a causal effect.
- If the text describes random assignment of participants to treatment and control groups, classify as 'Randomised experiment (RCT)'.
- If the text describes a non-random method designed to estimate a causal effect by creating a comparison group (look for terms like Difference-in-Differences, Regression Discontinuity, Instrumental Variables, Propensity Score Matching, Interrupted Time Series, matching, comparison group without randomisation), classify as 'Quasi-experimental'. If a specific method like 'Propensity Score Matching' is mentioned, state 'Quasi-experimental (Propensity Score Matching)'. If the type is unclear but a non-random comparison is used, state 'Quasi-experimental (Other QE)'.
- If the study is primarily descriptive, correlational, theoretical, qualitative without a focus on estimating causal effects via comparison groups, or a review/meta-analysis not focused on counterfactual estimation, classify as 'Not a counterfactual study'.
Provide only the final classification category name.""",
    #"Policy area": "List the main policy area(s) addressed (e.g., competences and skills, private education, career guidance, disadvantaged students, migrant students, institutional aspects, education drop-out/completion/participation, digital tools, teaching profession). Separate multiple areas with commas.",
    "Specific type of educational intervention or policy evaluated in the study": "Directly describe the specific educational intervention, program, policy, or phenomenon evaluated or analysed in the study.",
    "Core findings": "Summarise the main conclusions and core findings reported in the text. Include key results regarding effects, comparisons, and positive/negative/null findings. Present as a single block of text.",
    "Target population": "Directly describe the specific group(s) or entities the study focused on, including key characteristics mentioned (age, role, location, etc.).",
    "Outcome domain": "Identify the broad category or domain of the primary outcome(s) measured in the study (e.g., academic achievement, interaction quality, enrollment, student retention). Provide only the domain name(s).", # Removed the confusing examples from the prompt itself
    "Effect Summary": "Provide a concise summary integrating the direction (Positive, Negative, Mixed, None), magnitude (effect size, % change, score difference), and statistical significance (p-value, CI) of the main effect(s). Present as a single block of text.",
    "Implementation duration": "Directly describe the duration of the intervention or observation period mentioned in the text.",
    "Sample size (description)": "Provide a brief description of the sample size, including the units counted (e.g., teachers, pupils, schools).",
    "Sample size (raw number)": "Directly state the primary raw number representing the total sample size for the main analysis.",
    "Publication Reference": "Extract the publication reference details (Journal/Source, volume, issue, pages) as available in the text.",
    "Study limitation": "List the methodological limitations or constraints acknowledged by the authors or evident in the text. Separate distinct limitations. Focus on validity/generalisability aspects, not null findings.",
    "Tagging system": "List any keywords, tags, or subject classifications provided in the text, separated by commas."
    # "Weblink": "Extract any DOI or URL weblink provided in the text."
}

# Debug override: uncomment to replace the full prompt set with two trivial prompts
# (a cheap smoke test of the pipeline).
#prompts = {"Title": "Just write test", "Year": "The year should be 2023"}

# Extract data

In [14]:
# Run every prompt against the text of one PDF and collect the answers
def _strip_code_fence(text):
    """
    Remove a Markdown code fence (```...```) wrapped around a model response.

    The opening fence line, including an optional language tag such as
    ```json, is dropped. A closing fence is dropped when present. Text that
    does not start with a fence is returned stripped but otherwise unchanged.
    """
    text = text.strip()
    if not text.startswith("```"):
        return text

    lines = text.split("\n")
    if len(lines) == 1:
        # Everything is on one line, e.g. ```answer``` - only the backticks can go.
        return text.strip("`").strip()

    body_lines = lines[1:]  # drop the opening fence line (and its language tag)
    if body_lines and body_lines[-1].strip() == "```":
        body_lines = body_lines[:-1]  # drop the closing fence
    # When the fence was never closed, all remaining lines are content and are kept.
    return "\n".join(body_lines).strip()


def process_single_pdf(pdf_path, prompts_dict, generate_text_func, system_prompt=""):
    """
    Reads a PDF, applies multiple prompts using the Gemini API via generate_text_func,
    prepending a system prompt to each call, and returns a dictionary of extracted data.

    Args:
        pdf_path (str): The full path to the PDF file.
        prompts_dict (dict): A dictionary where keys are field names and values
                             are the corresponding specific task prompts.
        generate_text_func (function): The function that calls the Gemini API.
                                       It is called as generate_text_func(prompt=...).
        system_prompt (str, optional): General instructions for the model
                                       to prepend to each specific prompt. Defaults to "".

    Returns:
        dict or None:
            - None if generate_text_func is not callable.
            - {"filename": ..., "status": "Error - ..."} if the file is missing,
              cannot be read, or yields no text.
            - Otherwise {"filename": ...} plus one key per entry of prompts_dict.
              A field whose API call failed holds the string "API Error: <details>".
    """
    # --- Basic Checks ---
    if not callable(generate_text_func):
        print("Error: Invalid generate_text_func provided to process_single_pdf.")
        return None
    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return {"filename": os.path.basename(pdf_path), "status": "Error - File not found"}

    # --- Step 1: Extract text from PDF ---
    pdf_filename = os.path.basename(pdf_path)
    print(f"\n--- Reading PDF: {pdf_filename} ---")
    try:
        pdf_text = extract_text_from_pdf(pdf_path)
        if not pdf_text:
            print(f"Warning: No text extracted from '{pdf_filename}'. Cannot process.")
            return {"filename": pdf_filename, "status": "Error - No text extracted"}
        print(f"Successfully extracted text from {pdf_filename}.")
    except Exception as e:
        print(f"Error reading PDF '{pdf_filename}': {e}")
        return {"filename": pdf_filename, "status": f"Error - PDF Read Failed: {e}"}

    # --- Step 2: Process Prompts using Gemini ---
    extracted_data = {"filename": pdf_filename}
    num_prompts = len(prompts_dict)
    print(f"--- Starting API calls for {pdf_filename} ({num_prompts} fields) ---")

    for i, (field, specific_prompt) in enumerate(prompts_dict.items(), start=1):
        print(f"  Processing field {i}/{num_prompts}: {field}...")

        # Every call carries the system prompt, the field-specific task and the full document text.
        full_prompt = f"{system_prompt}\n\nTASK:\n{specific_prompt}\n\nDOCUMENT TEXT:\n---\n{pdf_text}\n---"

        try:
            response_text = generate_text_func(prompt=full_prompt)
            if response_text is None:
                # Report a missing answer explicitly instead of failing on None.strip().
                raise ValueError("model returned no text")
            extracted_data[field] = _strip_code_fence(str(response_text))
            print("    Success.")
        except Exception as e:
            # One failing field must not abort the remaining fields of this PDF.
            print(f"    ERROR calling Gemini API for field '{field}': {e}")
            extracted_data[field] = f"API Error: {e}"

        # Optional: time.sleep(1)

    print(f"--- Finished processing prompts for {pdf_filename} ---")
    return extracted_data

In [26]:
# --- Configuration ---
PDF_FOLDER_PATH = "Articles/"                           # source folder scanned for PDF files
DESTINATION_FOLDER_PATH = "Articles_done/"              # processed PDFs are renamed and moved here
OUTPUT_EXCEL_FILENAME = "Extracted_Article_Data.xlsx"   # written to the current working directory

# Prefix process_single_pdf writes into a field when the API call for that field failed.
_API_ERROR_PREFIX = "API Error:"


# --- Helpers ---

def sanitize_filename(name, max_length=150):
    """
    Turn arbitrary text (e.g. an extracted article title) into a safe file name stem.

    Characters that are invalid in Windows file names (< > : " / \\ | ? *) and
    control characters (including newlines) are replaced by spaces, runs of
    whitespace are collapsed, leading/trailing spaces and dots are removed and
    the result is cut to max_length characters.

    Args:
        name: Text to sanitise; converted with str().
        max_length (int): Maximum length of the returned stem. Defaults to 150.

    Returns:
        str: The sanitised stem; may be empty if nothing usable remains.
    """
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', ' ', str(name))
    cleaned = re.sub(r'\s+', ' ', cleaned).strip(' .')
    # Cutting can expose a trailing space or dot again, so strip once more.
    return cleaned[:max_length].rstrip(' .')


def _build_new_pdf_name(extracted_data):
    """Return '<year> - <title>.pdf' from the 'Year' and 'Title' fields, with 'Unknown...' fallbacks."""
    year_match = re.search(r'\b(19|20)\d{2}\b', str(extracted_data.get("Year", "")))
    parsed_year = year_match.group(0) if year_match else "UnknownYear"

    title_text = extracted_data.get("Title", "")
    sanitized_title = sanitize_filename(title_text) if title_text else ""
    return f"{parsed_year} - {sanitized_title or 'UnknownTitle'}.pdf"


def _unique_destination(folder, filename):
    """Return a path inside folder for filename, adding ' (2)', ' (3)', ... if that path already exists."""
    stem, ext = os.path.splitext(filename)
    candidate = os.path.join(folder, filename)
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem} ({counter}){ext}")
        counter += 1
    return candidate


def _api_error_fields(extracted_data, field_names):
    """Return the names from field_names whose extracted value is an API error marker."""
    return [
        name for name in field_names
        if str(extracted_data.get(name, "")).startswith(_API_ERROR_PREFIX)
    ]


# --- Main Batch Processing Logic ---

if 'generate_text' not in locals() or not callable(generate_text):
    print("\nERROR: 'generate_text' function is not defined or setup failed.")
    print("Please run the Gemini client setup cell successfully first.")
elif not os.path.isdir(PDF_FOLDER_PATH):
    print(f"Error: Source PDF folder not found at '{PDF_FOLDER_PATH}'")
else:
    # --- Create Destination Folder ---
    try:
        os.makedirs(DESTINATION_FOLDER_PATH, exist_ok=True)
        print(f"Ensured destination folder exists: '{DESTINATION_FOLDER_PATH}'")
    except OSError as e:
        # Processing continues; each later move reports its own error.
        print(f"Error creating destination folder '{DESTINATION_FOLDER_PATH}': {e}")

    # Discover PDF files (sorted for a stable processing order)
    pdf_files = sorted(
        f for f in os.listdir(PDF_FOLDER_PATH)
        if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(PDF_FOLDER_PATH, f))
    )

    if not pdf_files:
        print(f"No PDF files found in folder: '{PDF_FOLDER_PATH}'")
    else:
        print(f"Found {len(pdf_files)} PDF files to process.")

        all_results = []  # One dict per PDF; becomes the rows of the Excel file

        for pdf_filename in pdf_files:
            source_pdf_path = os.path.join(PDF_FOLDER_PATH, pdf_filename)

            extracted_data = process_single_pdf(
                pdf_path=source_pdf_path,
                prompts_dict=prompts,
                generate_text_func=generate_text,
                system_prompt=OVERARCHING_PROMPT
            )

            if not extracted_data:
                print(f"  Processing returned no result for '{pdf_filename}'. File will not be moved.")
                continue

            # Record the source file name independently of what the extraction returned.
            extracted_data['original_filename'] = pdf_filename

            # A record with failed API calls is incomplete: flag it so the PDF stays
            # in the source folder and can be processed again.
            failed_fields = _api_error_fields(extracted_data, prompts.keys())
            if failed_fields:
                extracted_data.setdefault(
                    "status", f"Error - API call failed for: {', '.join(failed_fields)}"
                )

            # Error records are exported too, so failures are visible in the Excel file.
            all_results.append(extracted_data)

            if str(extracted_data.get("status", "")).startswith("Error"):
                print(f"  Processing failed for '{pdf_filename}'. File will not be renamed or moved.")
                print(f"  Reason: {extracted_data.get('status', 'Unknown processing error')}")
                continue

            # --- Rename and Move (only for fully successful records) ---
            print(f"  Successfully processed data for '{pdf_filename}'. Attempting to rename and move...")
            try:
                new_pdf_name = _build_new_pdf_name(extracted_data)
                # Never overwrite a PDF that an earlier article already claimed under the same name.
                destination_pdf_path = _unique_destination(DESTINATION_FOLDER_PATH, new_pdf_name)

                print(f"    Moving '{source_pdf_path}' to '{destination_pdf_path}'")
                shutil.move(source_pdf_path, destination_pdf_path)
                print(f"    Successfully moved and renamed '{pdf_filename}'.")
            except OSError as e:
                print(f"    Error moving/renaming file '{pdf_filename}': {e}. Check permissions or if file is open.")
            except Exception as e:
                print(f"    An unexpected error occurred during renaming/moving of '{pdf_filename}': {e}")

        # --- Post-Processing: Save aggregated data to Excel ---
        # NOTE: the output path is static, so each run replaces the file written by the previous run.
        if not all_results:
            print("\nNo results were generated to save.")
        else:
            print("\nConverting extracted data to DataFrame...")
            df = pd.DataFrame(all_results)

            # Column order: source file name, one column per prompt, then the status column.
            desired_columns = ['original_filename'] + list(prompts.keys()) + ['status']
            actual_columns = [col for col in desired_columns if col in df.columns]
            df = df[actual_columns]

            output_filepath = os.path.join(os.getcwd(), OUTPUT_EXCEL_FILENAME)

            try:
                print(f"\nSaving extracted data to Excel file: '{OUTPUT_EXCEL_FILENAME}'...")
                df.to_excel(output_filepath, index=False, engine='openpyxl')
                print(f"Successfully saved results to {output_filepath}")
            except Exception as e:
                print(f"\nError saving results to Excel: {e}")
                # The PDFs may already have been moved, so try not to lose the results:
                # fall back to a CSV file next to the intended Excel file.
                output_csv_filepath = os.path.splitext(output_filepath)[0] + ".csv"
                try:
                    df.to_csv(output_csv_filepath, index=False)
                    print(f"Successfully saved results as CSV to {output_csv_filepath}")
                except Exception as csv_e:
                    print(f"Error saving as CSV: {csv_e}")

Ensured destination folder exists: 'Articles_done/'
Found 1 PDF files to process.

--- Reading PDF: 1-s2.0-S0959475223000166-main.pdf ---
Reading PDF: 1-s2.0-S0959475223000166-main.pdf...
Found 13 pages.
Finished text extraction attempt.
Successfully extracted text from 1-s2.0-S0959475223000166-main.pdf.
--- Starting API calls for 1-s2.0-S0959475223000166-main.pdf (2 fields) ---
  Processing field 1/2: Title...
    Success.
  Processing field 2/2: Year...
    Success.
--- Finished processing prompts for 1-s2.0-S0959475223000166-main.pdf ---
  Successfully processed data for '1-s2.0-S0959475223000166-main.pdf'. Attempting to rename and move...
    Moving 'Articles/1-s2.0-S0959475223000166-main.pdf' to 'Articles_done/2023 - test.pdf'
    Successfully moved and renamed '1-s2.0-S0959475223000166-main.pdf'.

Converting extracted data to DataFrame...

Saving extracted data to Excel file: 'Extracted_Article_Data.xlsx'...
Successfully saved results to c:\Users\stephanie.creteur\Coding\ENESET d