In [35]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes as bnb
import pandas as pd
import numpy as np
from utils_gen import Rephraser, to_csv


In [36]:
from dotenv import load_dotenv
import os

# Load the API key from .env file
load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY')
# google_model_name = "gemini-2.0-flash"
# google_model_name = "gemma-3-27b-it"
# google_model_name = "gemini-2.5-flash-preview-04-17"
google_model_name = "gemini-2.5-flash-preview-05-20"

# openai_api_key = os.getenv('OPENAI_API_KEY')
# openai_model_name = "gpt-4o-mini"

 
# hf_api_key = os.getenv("HUGGINGFACE_API_KEY")


# Gemini

In [70]:
from google import genai
from google.genai import types

class GoogleAIClient(Rephraser):
    def __init__(self, api_key, model_name):
        super().__init__()
        self.client = genai.Client(api_key=api_key)
        # self.client = genai.Client()
        self.model_name = model_name
            
    
    def gen(self, text_to_rephrase, query_task, temperature):
        query = self.compute_query(text_to_rephrase, query_task)
        # print(f"Query: {query}")
        # query = compute_costum_query(text_to_rephrase, query_task)
        
        response = self.client.models.generate_content(
            model= self.model_name,
            contents=query,
            config=types.GenerateContentConfig(
                thinking_config = types.ThinkingConfig(
                    thinking_budget=0,
                ),
                temperature= temperature,
                max_output_tokens= 4096 ,
                candidate_count=1,
                response_mime_type="text/plain",
                system_instruction=[ # NOT AVAIALBE FOR GEMMA
                    types.Part.from_text(
                    text="""Oferă un singur răspuns"""
                    ),
                ],
            ),
        )
        # print(response)
        if response.candidates[0].finish_reason.name != 'STOP':
            raise Exception("Bad response")
        
        return response.text

# Generate text with Gemini

In [71]:
api_client = GoogleAIClient(google_api_key, google_model_name)

In [39]:
chapters = pd.read_csv("../dataframe_2800-5600ch_marker_chapters_FINAL_GEMINI.csv")
chapters = chapters.dropna()

In [40]:
print(len(chapters))
print(chapters.columns)

46675
Index(['original_index', 'document_id', 'sample', 'title'], dtype='object')


In [85]:
start_range= 21000
end_range=26000

In [74]:
import time
import pandas as pd
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

print("Using model: ", api_client.model_name)
print("Generating from index: ", start_range, " to ", end_range)

generated_results = []
temperature_max_change = 0.2
chunk_size = 50  # Define the number of rows to process before saving
max_workers = 15 # Number of concurrent API calls
queries = ["rephrase", "summarize", "continue"] #  ["non_ai_doctorat", "non_ai_doctorat_summary", "non_ai_doctorat_continue"] #

def process_row(index):
    """Processes a single row to generate text using the API client."""
    row = chapters.iloc[index]
    original_index = row["original_index"]
    chapter = row["sample"]

    temperature = 1 + np.random.uniform(-temperature_max_change, temperature_max_change)
    query_id = index % len(queries) 
    query_task = queries[query_id]
    
    try:
        generated = api_client.gen(chapter, query_task, temperature)

    except Exception as e:
        print(f"Error processing row index {index} (original_index {original_index}): {e}")
        generated = f"ERROR: {e}"

    time.sleep(1) # Optional delay to avoid hitting API limits
    return {
        "original_index": original_index,
        "document_id": row["document_id"],
        "title": row["title"],
        "task": query_task,
        "chapter": chapter,
        "generated": generated,
        "temperature": temperature,
    }

# Use ThreadPoolExecutor for concurrent API calls
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_row, i): i for i in range(start_range, end_range)}
    processed_count = 0

    for future in as_completed(futures):
        index = futures[future]
        try:
            result = future.result()
            generated_results.append(result)
            processed_count += 1

            # Save in chunks
            if processed_count % chunk_size == 0:
                print(f"Processed {processed_count} rows (up to index {index}), saving chunk to CSV...")
                generated_data_chunk = pd.DataFrame(generated_results)
                to_csv(generated_data_chunk, f"gemini/FINAL_GEMINI_gap_{start_range}_{end_range}_{processed_count}.csv")


        except Exception as exc:
            print(f'Index {index} generated an exception: {exc}')

# Save any remaining results after the loop finishes
if generated_results:
    print(f"Saving remaining {len(generated_results)} results...")
    generated_data = pd.DataFrame(generated_results)
    to_csv(generated_data, f"gemini/FINAL_GEMINI_gap_{start_range}_{end_range}_final.csv")
                

print(f"Processing complete.")


Using model:  gemini-2.5-flash-preview-05-20
Generating from index:  0  to  20000
Error processing row index 1662 (original_index 3324): 'NoneType' object is not subscriptable
Processed 50 rows (up to index 8028), saving chunk to CSV...
DataFrame successfully saved to gemini/FINAL_GEMINI_gap_0_20000_50_20250529_161701.csv
Processed 100 rows (up to index 11997), saving chunk to CSV...
DataFrame successfully saved to gemini/FINAL_GEMINI_gap_0_20000_100_20250529_161718.csv
Error processing row index 14491 (original_index 28982): 'NoneType' object is not subscriptable
Error processing row index 14976 (original_index 29952): 'NoneType' object is not subscriptable
Error processing row index 15216 (original_index 30432): 'NoneType' object is not subscriptable
Error processing row index 15217 (original_index 30434): 'NoneType' object is not subscriptable
Error processing row index 15659 (original_index 31318): 'NoneType' object is not subscriptable
Error processing row index 15662 (original_in