In [None]:
#install Google’s Gemini Python SDK to use Gemini API
!pip install google-generativeai

#import libraries
import os #to work with file paths
import glob #finds files by looking for patterns (*.txt)
import time  #for rate limiting
from google.colab import drive
import google.generativeai as genai #client library to interact with Gemini API
from pathlib import Path

#mount Google Drive to access txt files
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully!")

#configuration of variables
#SECURITY: the API key must never be written into the notebook itself, because anyone
#who can read the file (or its history) can use it. Any key that was previously
#embedded here should be treated as compromised and revoked.
#The key is read from the GEMINI_API_KEY environment variable or, if that is not set,
#from the Colab "Secrets" panel (a secret named GEMINI_API_KEY with notebook access enabled).
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_API_KEY:
    from google.colab import userdata #Colab secrets store; expected to raise if the secret is missing or access is not granted
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
#NOTE(review): confirm that this model name is still served by the API before running
MODEL_NAME = 'gemini-1.5-flash'  #for better free tier compatibility but lower accuracy
INPUT_FOLDER = '/content/drive/MyDrive/API_experiment' #folder containing the .txt files to encode
OUTPUT_FOLDER_NAME = 'AI_outputs' #name of the subfolder (inside INPUT_FOLDER) that receives the XML files

#configure Gemini (which API key to use for all requests)
genai.configure(api_key=GEMINI_API_KEY)

#initialize the model
model = genai.GenerativeModel(MODEL_NAME)

#definition of the prompt template
#The template contains a single placeholder, {text_content}, which process_text_file
#replaces with the text of each input file via str.format().
#If literal curly braces are ever added to the template they must be doubled,
#otherwise str.format() will try to interpret them as placeholders.
PROMPT_TEMPLATE = """### GENERAL INSTRUCTIONS ###
You are an expert in text encoding. Your task is to encode the text inside the txt files according to the Text Encoding Initiative (TEI) P5 guidelines: https://tei-c.org/release/doc/tei-p5-doc/en/html/index.html.

### DETAILED INSTRUCTIONS ###
1. Classify the genre of the input text (ex. drama, poetry, letter, legal text, etc.) and encode it using the appropriate TEI tags for the identified genre.
2. Identify and encode semantic elements, such as but not limited to: quotes, measures, notes, dates, numbers.
3. Perform Named Entity Recognition (NER) for people, places and organizations.
4. Encode only the content of the <text> element.
5. Preserve the original language and structure of the input text.
6. Output a well-formed, properly indented encoding in a downloadable xml file.

### TEXT TO ENCODE ###
{text_content}"""

#definition of the processing function (and of its private helper)
def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if it has one.

    A reply that starts with a ``` line (optionally followed by a language tag
    such as xml) would not be well-formed XML once saved. Text that does not
    start with a fence is returned unchanged.
    """
    trimmed = text.strip()
    if not trimmed.startswith("```"):
        return text #no fence: leave the model output untouched
    lines = trimmed.splitlines()[1:] #drop the opening fence line (``` or ```xml)
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1] #drop the closing fence line
    return "\n".join(lines) + "\n"


def process_text_file(file_path, output_folder):
    """Process a single text file and generate TEI XML output.

    The file is read as UTF-8, inserted into PROMPT_TEMPLATE, sent to the
    Gemini model, and the reply is saved as AI_<original name>.xml inside
    output_folder.

    Args:
        file_path: path of the .txt file to encode.
        output_folder: existing folder in which the XML file is written.

    Returns:
        True if the XML file was written, False if anything went wrong
        (the error is printed, never raised, so that a batch can continue).
    """
    try:
        #read the input file (the with statement closes the file automatically)
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()

        print(f"Processing: {os.path.basename(file_path)}")
        print(f"Text length: {len(text_content)} characters")

        #an empty file would only waste an API request and produce a meaningless encoding
        if not text_content.strip():
            raise ValueError("input file is empty")

        #insert the file's text into the placeholder of the template
        full_prompt = PROMPT_TEMPLATE.format(text_content=text_content)

        #send request to Gemini API
        response = model.generate_content(
            full_prompt,
            #fine tuning how Gemini generates text
            generation_config=genai.types.GenerationConfig(
                temperature=0, #lowest setting: an encoding needs accuracy and a stable structure, not creativity
                max_output_tokens=8192,  #upper bound on the reply length; assumed to be the model's output limit (confirm against the model documentation)
            )
        )

        #extract the response content (the TEI encoding) and remove a Markdown fence around it, if any
        xml_output = _strip_code_fences(response.text)
        if not xml_output.strip():
            raise ValueError("Gemini returned an empty response")

        #saving results (generate output filename from the original txt file)
        input_filename = Path(file_path).stem  #gets filename without extension
        output_filename = f"AI_{input_filename}.xml"
        output_path = os.path.join(output_folder, output_filename) #full path of the new file inside the output folder

        #save the XML output
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(xml_output)

        print(f"Successfully processed and saved: {output_filename}")
        return True

    except Exception as e:
        #per-file boundary: report the failure and let the caller go on with the next file
        print(f"Error processing {os.path.basename(file_path)}: {str(e)}")
        return False

def main():
    """Run the whole batch: find the txt files, encode each one, print a report.

    Reads INPUT_FOLDER and OUTPUT_FOLDER_NAME, creates the output subfolder if
    needed, calls process_text_file for every .txt file found directly in the
    input folder, pauses between requests, and finally prints how many files
    succeeded or failed together with the XML files present in the output folder.
    Returns early (after printing a message) if the input folder is missing or
    contains no .txt files.
    """
    rule = "=" * 60
    print(rule)
    print("TEI XML Encoding Script Starting... (Using Gemini Flash API)")
    print(rule)

    #nothing can be done without the input folder
    if not os.path.exists(INPUT_FOLDER):
        print(f"Error: Input folder '{INPUT_FOLDER}' does not exist!")
        return

    #the results go into a subfolder of the input folder (created if missing)
    output_folder = os.path.join(INPUT_FOLDER, OUTPUT_FOLDER_NAME)
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder created/confirmed: {output_folder}")

    #collect the txt files located directly in the input folder
    txt_files = glob.glob(os.path.join(INPUT_FOLDER, "*.txt"))
    if not txt_files:
        print(f"No .txt files found in {INPUT_FOLDER}")
        return

    print(f"Found {len(txt_files)} txt file(s):")
    for found_path in txt_files:
        print(f"  - {os.path.basename(found_path)}")

    print("\n" + rule)
    print("Starting processing...")
    print(rule)

    #process the files one by one, remembering the outcome of each
    outcomes = []
    last_position = len(txt_files) - 1
    for position, txt_path in enumerate(txt_files):
        outcomes.append(process_text_file(txt_path, output_folder))
        print("-" * 40)

        #pause for 10 seconds after every file except the last one,
        #to reduce the chance of hitting the API rate limits
        if position < last_position:
            print("Waiting 10 seconds before next file...")
            time.sleep(10)

    successful = sum(1 for ok in outcomes if ok)
    failed = len(outcomes) - successful

    #summary
    print("\n" + rule)
    print("PROCESSING COMPLETE!")
    print(rule)
    print(f"Successfully processed: {successful} files")
    print(f"Failed: {failed} files")
    print(f"Output files saved in: {output_folder}")

    #list every XML file currently in the output folder
    xml_files = glob.glob(os.path.join(output_folder, "*.xml"))
    if xml_files:
        print("\nGenerated XML files:")
        for xml_path in xml_files:
            print(f"  - {os.path.basename(xml_path)}")

#run the script: main() is called only when this file is executed directly,
#not when it is imported as a module
if __name__ == "__main__":
    main()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!
TEI XML Encoding Script Starting... (Using Gemini Flash API)
Output folder created/confirmed: /content/drive/MyDrive/API_experiment/AI_outputs
Found 3 txt file(s):
  - Letter_frida.txt
  - Manuscript_bufalini.txt
  - Letter_west.txt

Starting processing...
Processing: Letter_frida.txt
Text length: 570 characters
✓ Successfully processed and saved: AI_Letter_frida.xml
----------------------------------------
Waiting 10 seconds before next file...
Processing: Manuscript_bufalini.txt
Text length: 1190 characters
✓ Successfully processed and saved: AI_Manuscript_bufalini.xml
----------------------------------------
Waiting 10 seconds before next file...
Processing: Letter_west.txt
Text length: 2102 characters
✓ Successfully processed and saved: AI_Letter_west.xml
----------------------------------------
