In [35]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes as bnb
import pandas as pd
import numpy as np
from utils_gen import Rephraser, to_csv


In [36]:
from dotenv import load_dotenv
import os

# Load the API key from .env file
load_dotenv()
google_api_key = os.getenv('GOOGLE_API_KEY')
# google_model_name = "gemini-2.0-flash"
# google_model_name = "gemma-3-27b-it"
# google_model_name = "gemini-2.5-flash-preview-04-17"
google_model_name = "gemini-2.5-flash-preview-05-20"

# openai_api_key = os.getenv('OPENAI_API_KEY')
# openai_model_name = "gpt-4o-mini"

 
# hf_api_key = os.getenv("HUGGINGFACE_API_KEY")


# Gemini

In [70]:
from google import genai
from google.genai import types

class GoogleAIClient(Rephraser):
    def __init__(self, api_key, model_name):
        super().__init__()
        self.client = genai.Client(api_key=api_key)
        # self.client = genai.Client()
        self.model_name = model_name
            
    
    def gen(self, text_to_rephrase, query_task, temperature):
        query = self.compute_query(text_to_rephrase, query_task)
        # print(f"Query: {query}")
        # query = compute_costum_query(text_to_rephrase, query_task)
        
        response = self.client.models.generate_content(
            model= self.model_name,
            contents=query,
            config=types.GenerateContentConfig(
                thinking_config = types.ThinkingConfig(
                    thinking_budget=0,
                ),
                temperature= temperature,
                max_output_tokens= 4096 ,
                candidate_count=1,
                response_mime_type="text/plain",
                system_instruction=[ # NOT AVAIALBE FOR GEMMA
                    types.Part.from_text(
                    text="""Oferă un singur răspuns"""
                    ),
                ],
            ),
        )
        # print(response)
        if response.candidates[0].finish_reason.name != 'STOP':
            raise Exception("Bad response")
        
        return response.text

# OLD 

## OpenAI

In [4]:
import openai

class ChatGPTClient(Rephraser):
    def __init__(self, api_key, model_name):
        openai.api_key = api_key
        self.model_name = model_name

    def reprase(self, text, text_to_rephrase):
        query = (
            "Rescrie urmatorul text dintr-o lucrare de doctorat:\n"
            + text_to_rephrase + '\n'
        )
        response = openai.Completion.create(
            model=self.model_name,
            prompt=query,
            max_tokens=1500,
            n=1,
            stop=None,
            temperature=0.7,
        )
        return response.choices[0].text.strip()


## RoLLama

In [51]:
class RoLlama3Rephraser:
    def __init__(self):
        self.model_id = "OpenLLM-Ro/RoLlama3-8b-Instruct-4bit"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)

        self.model = AutoModelForCausalLM.from_pretrained(self.model_id).to("cuda")

    def reprase(self, text, text_to_rephrase):
        query = (
            "Rescrie urmatorul text dintr-o lucrare de doctorat:\n"
            + text_to_rephrase
            + "\n"
        )
        inputs = self.tokenizer.encode(
            query, return_tensors="pt", padding=True, truncation=True, max_length=2048
        ).to("cuda")
        outputs = self.model.generate(input_ids=inputs)
        output = self.tokenizer.decode(outputs[0])
        return output

## Llama

In [52]:
class LlamaRepahser:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit")
        self.model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit")


    def reprase(self, text, text_to_rephrase):
        query = (
            "Rescrie urmatorul text dintr-o lucrare de doctorat:\n"
            + text_to_rephrase + '\n'
        )
        chat = [
            {"role": "system", "content": "Raspunde cu un singur raspuns fara a furniza infromatii suplimentare."},
            {"role": "user", "content": query},
        ]
        prompt = self.tokenizer.apply_chat_template(chat, tokenize=False, system_message="")
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        outputs = self.model.generate(input_ids=inputs)
        output = self.tokenizer.decode(outputs[0])
        return output

## Mistral

In [53]:
class MistralRephraser:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("OpenLLM-Ro/RoMistral-7b-Instruct")
        self.model = AutoModelForCausalLM.from_pretrained("OpenLLM-Ro/RoMistral-7b-Instruct")

    def reprase(self, text, text_to_rephrase):
        query = (
            "Rescrie urmatorul text dintr-o lucrare de doctorat:\n"
            + text_to_rephrase + '\n'
        )
        chat = [
            {"role": "system", "content": "Raspunde cu un singur raspuns fara a furniza infromatii suplimentare."},
            {"role": "user", "content": query},
        ]
        prompt = self.tokenizer.apply_chat_template(chat, tokenize=False, system_message="")
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        outputs = self.model.generate(input_ids=inputs)
        output = self.tokenizer.decode(outputs[0])
        return output

# gen

In [71]:
api_client = GoogleAIClient(google_api_key, google_model_name)
# rephraser = Rephraser()
# rephraser = RoLlama3Rephraser()
# rephraser = LlamaRepahser()
# rephraser = MistralRephraser()

In [39]:
chapters = pd.read_csv("../dataframe_2800-5600ch_marker_chapters_FINAL_GEMINI.csv")
chapters = chapters.dropna()
# print a few random paragraphs from the dataframe paragraphs
# for i in range(5):
#     print(chapters.iloc[i]["sample"])
#     print("-" * 50)
#     print()

In [40]:
print(len(chapters))
print(chapters.columns)

46675
Index(['original_index', 'document_id', 'sample', 'title'], dtype='object')


In [14]:
# paragraphs = paragraphs.sample(5)
# len(paragraphs)

In [41]:
chapters.head()

Unnamed: 0,original_index,document_id,sample,title
0,0,10589,În cadrul programului de cercetare dezvoltat s...,PREFAȚĂ
1,2,10589,Deșeul reprezintă partea dintr-un material sau...,2.1 Noțiuni de bază
2,4,10589,În urma diferitelor etape de flux tehnologic d...,3.2 Prezentarea generală a caracteristicilor c...
3,6,10589,"În secolul XX, în mod deosebit în a doua jumăt...",3.2.1.7 Zgură de oțelărie
4,8,10589,Peletizarea minereurilor de fier este cunoscut...,4.2.1 Producerea peletelor


In [85]:
# done 0 - 22000 
# 22.000 - 24400 testing
# start_range = 24400
# end_range = 26000
start_range= 21000
end_range=26000

In [28]:
# # single request
# index = 23900
# row = chapters.iloc[index]
# original_index = row["original_index"]
# chapter = row["sample"]
# try:
#     generated = api_client.gen(chapter, 'non_ai_doctorat', 1)
#     print(generated)

# except Exception as e:
#     print(f"Error processing row index {index} (original_index {original_index}): {e}")
#     generated = f"ERROR: {e}"

candidates=[Candidate(content=Content(parts=[Part(video_metadata=None, thought=None, inline_data=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, text='La fel ca în cazul clasicilor, și romanticii se orientează spre cultura și spațiul elen, însă într-o manieră cu totul diferită. Ei sunt atrași de orfism, eleatism, miturile individualității, ruine, de căldura soarelui și a peisajelor, de culorile și atmosfera Orientului. Această evadare în exotic atinge apogeul spre finele secolului al XIX-lea și începutul secolului al XX-lea, când simbolistul, nemulțumit de realitatea imediată, aspiră spre lumi alternative prin visare și proiecția dorințelor în imaginar.\n\nO dezvoltare a temei onirice o regăsim în „Fiica haosului” de Duiliu Zamfirescu, o creație ce se poate asemăna cu un „luceafăr întors”, unde „seduția transcendentului se manifestă prin și în vis”, iar incompatibilitatea dintre personaje își găsește rezolvarea prin ui

In [86]:
def get_gap_indexes(start_range, end_range):
    worked = pd.read_csv("../FINAL_CHATGPT_21k_26k_non_ai_doc_3_20250529_170158.csv")
    original_indexes_worked = worked["original_index"].tolist()
    
    chapters_original_indexes = chapters["original_index"].values.tolist()
    chapters_original_indexes = chapters_original_indexes[start_range:end_range]
    gap_indexes = []
    print(len(chapters_original_indexes))
    print(len(original_indexes_worked))
    
    for index in chapters_original_indexes:
        if index not in original_indexes_worked or pd.isna(worked[worked['original_index'] == index]['generated'].iloc[0]):
            gap_indexes.append(index)
    return gap_indexes

gap_indexes = get_gap_indexes(start_range, end_range)
print("Gap indexes:")
print(gap_indexes[:50])
print(len(gap_indexes))
index_in_chapters = chapters[chapters["original_index"].isin(gap_indexes)].index.tolist()
print(index_in_chapters[:50])

5000
5000
Gap indexes:
[]
0
[]


In [74]:
import time
import pandas as pd
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

print("Using model: ", api_client.model_name)
print("Generating from index: ", start_range, " to ", end_range)

generated_results = []
temperature_max_change = 0.2
chunk_size = 50  # Define the number of rows to process before saving
max_workers = 15 # Number of concurrent API calls
queries = ["rephrase", "summarize", "continue"] #  ["non_ai_doctorat", "non_ai_doctorat_summary", "non_ai_doctorat_continue"] #

def process_row(index):
    """Processes a single row to generate text using the API client."""
    row = chapters.iloc[index]
    original_index = row["original_index"]
    chapter = row["sample"]

    temperature = 1 + np.random.uniform(-temperature_max_change, temperature_max_change)
    query_id = index % len(queries) 
    query_task = queries[query_id]
    
    try:
        generated = api_client.gen(chapter, query_task, temperature)

    except Exception as e:
        print(f"Error processing row index {index} (original_index {original_index}): {e}")
        generated = f"ERROR: {e}"

    time.sleep(1) # Optional delay to avoid hitting API limits
    return {
        "original_index": original_index,
        "document_id": row["document_id"],
        "title": row["title"],
        "task": query_task,
        "chapter": chapter,
        "generated": generated,
        "temperature": temperature,
    }

# Use ThreadPoolExecutor for concurrent API calls
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # futures = {executor.submit(process_row, i): i for i in range(start_range, end_range)}
    futures = {executor.submit(process_row, i): i for i in index_in_chapters}
    processed_count = 0

    for future in as_completed(futures):
        index = futures[future]
        try:
            result = future.result()
            generated_results.append(result)
            processed_count += 1

            # Save in chunks
            if processed_count % chunk_size == 0:
                print(f"Processed {processed_count} rows (up to index {index}), saving chunk to CSV...")
                generated_data_chunk = pd.DataFrame(generated_results)
                to_csv(generated_data_chunk, f"gemini/FINAL_GEMINI_gap_{start_range}_{end_range}_{processed_count}.csv")


        except Exception as exc:
            print(f'Index {index} generated an exception: {exc}')

# Save any remaining results after the loop finishes
if generated_results:
    print(f"Saving remaining {len(generated_results)} results...")
    generated_data = pd.DataFrame(generated_results)
    to_csv(generated_data, f"gemini/FINAL_GEMINI_gap_{start_range}_{end_range}_final.csv")
                

print(f"Processing complete.")


Using model:  gemini-2.5-flash-preview-05-20
Generating from index:  0  to  20000
Error processing row index 1662 (original_index 3324): 'NoneType' object is not subscriptable
Processed 50 rows (up to index 8028), saving chunk to CSV...
DataFrame successfully saved to gemini/FINAL_GEMINI_gap_0_20000_50_20250529_161701.csv
Processed 100 rows (up to index 11997), saving chunk to CSV...
DataFrame successfully saved to gemini/FINAL_GEMINI_gap_0_20000_100_20250529_161718.csv
Error processing row index 14491 (original_index 28982): 'NoneType' object is not subscriptable
Error processing row index 14976 (original_index 29952): 'NoneType' object is not subscriptable
Error processing row index 15216 (original_index 30432): 'NoneType' object is not subscriptable
Error processing row index 15217 (original_index 30434): 'NoneType' object is not subscriptable
Error processing row index 15659 (original_index 31318): 'NoneType' object is not subscriptable
Error processing row index 15662 (original_in

In [11]:
len(generated_data_chunk) # non_ai_doctorat_3_FINAL_GEMINI_gap_21000_24400_final_20250529_150828.csv


12

In [None]:
# # merge the two dataframes
# df1 = pd.read_csv(f"gemini/FINAL_GEMINI_gap_21000_24400_final_20250529_150828.csv")
# df2 = pd.read_csv("gemini/non_ai_doctorat_3_FINAL_GEMINI_24400_26000_final_20250521_154714.csv")
# df = pd.concat([df1, df2], ignore_index=True)
# to_csv(df, "../non_ai_doctorat_3_FINAL_GEMINI.csv")

DataFrame successfully saved to ../non_ai_doctorat_3_FINAL_GEMINI_20250529_154848.csv


#### merge dataframes for gaps

In [None]:
# # Create a copy of non_dup to avoid modifying the original
# merged_df = non_dup.copy()

# # Iterate through gap indices
# for idx in gap:
#     # Get the row from gap_df for this index
#     gap_row = gap_df[gap_df['original_index'] == idx]
    
#     if idx in merged_df['original_index'].values:
#         # Update existing row with gap_df values
#         merged_df.loc[merged_df['original_index'] == idx] = gap_row.values[0]
#     else:
#         # Append new row from gap_df
#         merged_df = pd.concat([merged_df, gap_row], ignore_index=True)

# # Sort by original_index to maintain order
# merged_df = merged_df.sort_values('original_index')

# # Save the merged dataframe
# # to_csv(merged_df, "../FINAL_GEMINI_0_20000_merged_gaps.csv")


# [deprecated] check output

In [12]:
import pandas as pd

results_df = pd.read_csv("gemini/non_ai_doctorat_FINAL_GEMINI_24400_24412_final_20250521_152903.csv")
print(len(results_df))

12


In [13]:
# sort results by original_index
results_df = results_df.sort_values(by=["original_index"])
# save the sorted dataframe
# results_df.to_csv("../FINAL_GEMINI_sorted.csv", index=False)

In [14]:
results_df.columns


Index(['original_index', 'document_id', 'title', 'task', 'chapter',
       'generated', 'temperature'],
      dtype='object')

In [15]:
print(f"Number of null values in DataFrame: {results_df.isna().sum().sum()}")
results_df.head()

Number of null values in DataFrame: 0


Unnamed: 0,original_index,document_id,title,task,chapter,generated,temperature
6,48800,15155,4.2 Monitorizarea unor emițători radio VLF în ...,non_ai_doctorat_summary,O altă campanie de măsurători pe care am desfă...,"Am efectuat o campanie de măsurători, pe parcu...",0.985573
8,48802,15155,4.4 Monitorizarea nivelului de semnal recepțio...,non_ai_doctorat_continue,În perioada aprilie-mai și în perioada iulie-a...,"Această diferență semnificativă, de ordinul a ...",1.148788
5,48804,15155,5.1.2 Echipamentul destinat achiziției de imagini,non_ai_doctorat,Pentru achiziția datelor inițiale am conceput ...,Pentru a obține datele inițiale necesare studi...,0.877582
1,48806,15155,5.2.1 Considerații preliminarii,non_ai_doctorat_summary,"Descărcările electrice din atmosferă (fulgere,...",**Localizarea Fulgerelor: De la Studiul Prelim...,1.192679
2,48808,15155,6.2 Obiective atinse și rezultatele aplicative...,non_ai_doctorat_continue,"În cadrul Proiectul de cercetare științifică, ...","- Un al treilea obiectiv, la fel de important,...",0.942256


In [16]:
# Iterate through the results DataFrame and print original vs rephrased text
for index, row in results_df.iterrows():
    original_index = row['original_index']
    document_id = row['document_id']
    original_chapter = row['chapter']
    generated = row['generated']
    title = row['title']
    task = row['task']
    temperature = row['temperature']

    print(f"Original index: {original_index} ----------------------------------")
    print(f"Text from document ID: {document_id}")
    print(f"Title: {title}")
    print(f"Task: {task}")
    print(f"Temperature: {temperature}")
    print('-'*50)
    print(f"Original: \n{original_chapter}")
    
    if task == "continue":
        text = original_chapter.split('\n')
        text = '\n'.join(text[:len(text) // 2])
        print(f"Used text: \n{text}")
        
    print('|'*50)
    print(f"Generated: \n{generated}")
    print('-'*50)
    print() # Add a blank line for better separation

    # Optional: Write to file (currently commented out)
    # with open("rephrases.txt", "a", encoding="utf-8") as f:
    #     f.write(f"Text from document ID: {document_id}\n")
    #     f.write(f"Task: {task}\n")
    #     f.write('-'*50 + '\n')
    #     f.write(f"Original: \n{original_paragraph}\n")
    #     f.write('|'*50 + '\n')
    #     f.write(f"Generated: \n{rephrased_paragraph}\n")
    #     f.write('-'*50 + '\n\n')

Original index: 48800 ----------------------------------
Text from document ID: 15155
Title: 4.2 Monitorizarea unor emițători radio VLF în vederea punerii în evidență a perturbațiilor subite ale ionosferei, produse de activitatea solară
Task: non_ai_doctorat_summary
Temperature: 0.9855725440145292
--------------------------------------------------
Original: 
O altă campanie de măsurători pe care am desfășurat-o timp de mai multe luni a fost dedicată punerii în evidență a perturbațiilor subite ale ionosferei, produse de activitatea solară.

Soarele este o stea variabilă. Această variabilitate se manifestă printr-o intensificare a activității solare urmată de o perioadă de acalmie. Analizând observațiile astronomice făcute timp de câteva sute de ani se poate constata o ciclicitate a activității solare cu o perioadă de aproximativ 11 ani.

Activitatea solară este resimțită în întreg sistemul solar prin degajarea continuă a unor cantități imense de materie și energie. Emisia radiativă comu

In [37]:
# count how many nan are in the generated row
print(f"Number of NaN values in 'generated' column: {results_df['generated'].isna().sum()}")

Number of NaN values in 'generated' column: 75


# old code

In [12]:
start_index = 11050
end_index = 11060 #11549


In [13]:
# import random
# from nltk.tokenize import sent_tokenize

# class RephraserHelper:
#     def __init__(self, rephraser, number_threshold, length_threshold, sentences_threshold, n_sentences):
#         self.rephraser = rephraser
#         self.number_threshold = number_threshold
#         self.length_threshold = length_threshold
#         self.sentences_threshold = sentences_threshold
#         self.n_sentences = n_sentences

#     def filter_out(self, to_be_rephrased): 
#         # If the text is too short, don't rephrase
#         if len(to_be_rephrased) < self.length_threshold:
#             return True
        
#         # If the text contains a lot of numbers, don't rephrase
#         if sum(1 for char in to_be_rephrased if char.isdigit()) > self.number_threshold:
#             print("Numbers", sum(1 for char in to_be_rephrased if char.isdigit()))
#             return True
        
#         return False

#     def reprphrase(self, file_path, n_times):
#         with open(file_path, "r", encoding="utf-8") as file:
#             text = file.read()
            
#         sentences = sent_tokenize(text)
        
#         # If the text is too short, don't rephrase
#         if len(sentences) < self.sentences_threshold:
#             return 
        
#         cnt = 0
#         for _ in range(n_times):
#             start = random.randint(0, len(sentences) - self.n_sentences)
            
#             to_be_rephrased = " ".join(sentences[start : start + self.n_sentences])

#             if self.filter_out(to_be_rephrased):
#                 print("Bad text:", to_be_rephrased)
#                 continue
            
#             reprased_part = self.rephraser.reprase(text, to_be_rephrased)
#             # print(reprased_part)
#             # reprased_text = text.replace(to_be_rephrased, reprased_part)
            
#             ans_text = "Original text: " + to_be_rephrased + "\nRephrased text: " + reprased_part
            
#             # with open(file_path[:-3] + "_sample_.txt", 'a', encoding='utf-8') as file:
#             #     file.write(ans_text + "\n\n")
            
#             print(ans_text)
#             cnt += 1
