In [None]:
#imports
!pip -q install -U google-generativeai

import os
import re
from typing import List, Dict, Tuple #for type hints
import pandas as pd
import google.generativeai as genai
from google.colab import drive

#definition of folders
#mount Google Drive first: every path below lives under /content/drive
drive.mount('/content/drive')

#BASE_DIR holds one sub-folder per corpus
BASE_DIR = "/content/drive/MyDrive/API_classification_training"
ORIG_DIR = os.path.join(BASE_DIR, "Original_corpus")
TRAIN_DIR = os.path.join(BASE_DIR, "Training_set")

#API configuration
#the key is read from a private Drive file instead of being written in the notebook;
#strip() drops the trailing newline/whitespace of the file, which would otherwise be
#passed verbatim to genai.configure as part of the key
with open('/content/drive/MyDrive/gemini_api_key.txt', encoding='utf-8') as f:
    GEMINI_API_KEY = f.read().strip()

#name of the Gemini model used for every request of this notebook
MODEL_NAME = "gemini-1.5-flash"

#FUNCTIONS
#this function reads a corpus folder (used for both the original and the training corpus)
def read_text_corpus(folder: str) -> List[Tuple[int, str, str]]:
    """Load every numbered ``.txt`` file of *folder*, ordered by its number.

    The number comes from a name shaped like ``Text_<n>.txt``; for any other
    ``.txt`` name the first run of digits is used, and names without digits
    are skipped. Files are decoded as UTF-8 (undecodable bytes are dropped)
    and their content is stripped of surrounding whitespace.

    Returns a list of ``(index, filename, content)`` tuples sorted by index.
    An assertion fails when *folder* is not an existing directory.
    """
    assert os.path.isdir(folder), f"Folder not found: {folder}"

    def numeric_index(name: str):
        #preferred naming scheme first, then any digits found in the name
        hit = re.match(r"Text[_\s-]*(\d+)\.txt$", name, re.IGNORECASE) or re.search(r"(\d+)", name)
        return int(hit.group(1)) if hit else None

    collected = []
    for name in os.listdir(folder):
        #anything that is not a txt file is ignored
        if not name.lower().endswith('.txt'):
            continue
        number = numeric_index(name)
        if number is None:
            continue
        with open(os.path.join(folder, name), 'r', encoding='utf-8', errors='ignore') as handle:
            collected.append((number, name, handle.read().strip()))
    #stable sort: files sharing a number keep their listing order
    return sorted(collected, key=lambda item: item[0])

#this function takes the content of a text file and shortens it to fit within the character limit
def textwrap_truncate(text: str, max_chars: int = 1000) -> str:
    """Shorten *text* to about *max_chars* characters, keeping both of its ends.

    Texts that already fit are returned unchanged. Longer texts are reduced to
    their first ``max_chars // 2`` characters and their last
    ``max_chars - max_chars // 2`` characters, joined by a
    ``[... truncated ...]`` marker line (so the result is longer than
    *max_chars* by the length of that marker).

    A zero or negative *max_chars* is treated as 0: a non-empty text is then
    reduced to the marker alone.
    """
    budget = max(max_chars, 0)
    if len(text) <= budget:
        return text
    #the beginning and the end of a text usually carry the most telling features (ex. "dear Francesco")
    head_len = budget // 2
    tail_len = budget - head_len
    head = text[:head_len]
    #text[-0:] would be the whole text, so an empty tail has to be spelled out
    tail = text[-tail_len:] if tail_len else ""
    return head + "\n[... truncated ...]\n" + tail

def build_classification_prompt(corpus_entries: List[Tuple[int, str, str]]) -> str:
    """Assemble the letter/poem classification prompt for a corpus.

    The prompt is a fixed instruction header followed by one section per
    ``(index, filename, content)`` entry. Each section starts with
    ``<index>. [filename: <name>]`` and carries the content shortened by
    ``textwrap_truncate`` with a 900-character limit. Header and sections are
    separated by blank lines.
    """
    instructions = "".join([
        "You are a classifier. You will receive a corpus of texts, which are part\n",
        "of these textual categories: poem, letter.\n\n",
        "Your task is to classify the corpus, assigning each\n",
        "text to its corresponding category.\n",
        "Return *only* one prediction per line in the format:\n",
        "<number>. <predicted_category>\n\n",
        "Corpus:",
    ])
    sections = [
        f"{idx}. [filename: {fname}]\n{textwrap_truncate(content, 900)}"
        for idx, fname, content in corpus_entries
    ]
    #header, blank line, then the sections separated by blank lines
    return "\n\n".join([instructions, "\n\n".join(sections)])

#"n: label" / "n.: label" - the label is the first word after the colon
_COLON_PREDICTION = re.compile(r"(\d+)\s*\.?\s*:\s*(\w+)")
#"n. label" / "n label" - the whole line is a number followed by one word;
#(?!\d) keeps a bare number such as "12" from being split into index 1 and label "2"
_PLAIN_PREDICTION = re.compile(r"(\d+)(?!\d)\s*\.?\s*(\w+)$")
#punctuation / markdown decoration around a word, ex. "**letter**" or "poem."
_LABEL_DECORATION = re.compile(r"^[\W_]+|[\W_]+$")


def parse_line_predictions(raw: str) -> Dict[int, str]:
    """Parse model output lines of the form 'n. label' or 'n: label'.

    Each non-empty line is read in this order:

    1. ``n: label`` (an optional dot may precede the colon): the first word
       after the colon is the label;
    2. ``n. label`` or ``n label`` with nothing else on the line;
    3. longer answers such as ``1. This looks like a letter.``: the text
       before the first dot must be a number and the last word of the rest,
       stripped of surrounding punctuation, is the label.

    Lines that fit none of these (including ``n.`` with no label) are ignored.

    Returns a dict mapping each number to its lower-cased label; when a
    number appears on several lines the last one wins.
    """
    preds: Dict[int, str] = {}
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        #it takes both "1: letter" and "1. letter"
        m = _COLON_PREDICTION.match(line) or _PLAIN_PREDICTION.match(line)
        if m:
            preds[int(m.group(1))] = m.group(2).lower()
            continue
        #handles longer outputs: even if the output is "1. This looks like a letter" only the last word is kept
        left, dot, right = line.partition(".")
        if not dot or not left.strip().isdigit():
            continue
        #walk backwards so that trailing decoration alone (ex. "***") is skipped
        for token in reversed(right.split()):
            label = _LABEL_DECORATION.sub("", token)
            if label:
                preds[int(left.strip())] = label.lower()
                break
    return preds

#calculating how well the model did in classifying the texts
def compute_accuracy(preds: Dict[int, str], truth: Dict[int, str]) -> float:
    """Return the share of *truth* entries whose prediction matches.

    Labels are compared case-insensitively. An index missing from *preds*
    counts as wrong, and predictions for indices absent from *truth* are not
    considered. The result is 0.0 when *truth* is empty.
    """
    if not truth:
        return 0.0
    hits = sum(
        1
        for key, gold in truth.items()
        if preds.get(key, "").lower() == gold.lower()
    )
    return hits / len(truth)

#it builds a table with the results of a classification, which is later used for the CSV summary
def df_for_results(corpus_entries, truth, preds1):
    """Build the per-text results table for one corpus.

    Args:
        corpus_entries: iterable of ``(index, filename, content)`` tuples.
        truth: dict mapping an index to its gold label.
        preds1: dict mapping an index to its predicted label.

    Returns:
        A DataFrame sorted by ``index`` with the columns ``index``,
        ``filename``, ``text`` (content shortened with ``textwrap_truncate``
        to a 200-character limit), ``ground_truth``, ``first_pred`` and
        ``correct_first``. Missing labels or predictions appear as ``""``.
        ``correct_first`` is True only when a gold label exists and equals
        the prediction ignoring case, the same rule ``compute_accuracy``
        applies. An empty corpus yields an empty table with those columns.
    """
    columns = ["index", "filename", "text", "ground_truth", "first_pred", "correct_first"]
    rows = []
    for idx, fname, content in corpus_entries:
        gold = truth.get(idx, "")
        pred = preds1.get(idx, "")
        rows.append({
            "index": idx,
            "filename": fname,
            "text": textwrap_truncate(content, 200),
            "ground_truth": gold,
            "first_pred": pred,
            #an unlabeled text can never count as correct ("" == "" would say otherwise)
            "correct_first": bool(gold) and pred.lower() == gold.lower(),
        })
    #explicit columns keep sort_values("index") valid when there are no rows
    df = pd.DataFrame(rows, columns=columns).sort_values("index").reset_index(drop=True)
    return df

#initialize a single Gemini chat session, started with an empty history;
#every prompt below is sent through this same `chat` object - presumably so that
#the model still has its earlier answers available when a later prompt refers to
#"your predictions"
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)
chat = model.start_chat(history=[])

#definition of ground truth
#gold labels for the corpus read from ORIG_DIR; each key is the number extracted
#from the file name by read_text_corpus, each value is the expected category
GROUND_TRUTH = {
    1: "letter",
    2: "letter",
    3: "letter",
    4: "poem",
    5: "poem",
}

#gold labels for the corpus read from TRAIN_DIR, keyed in the same way
GROUND_TRUTH_TRAINING = {
    1: "letter",
    2: "poem",
    3: "poem",
    4: "poem",
    5: "letter",
    6: "letter",
    7: "letter",
}

#load the first corpus and compare its size with the number of gold labels
original_entries = read_text_corpus(ORIG_DIR)
n_texts, n_labels = len(original_entries), len(GROUND_TRUTH)
if n_texts == n_labels:
    print(f"✅ Corpus and ground truth match ({n_texts} texts).")
else:
    #only the sizes are compared here, not the individual indices
    print(
        f"⚠️ WARNING: Corpus has {n_texts} texts but ground truth "
        f"defines {n_labels} labels. Results may be inconsistent."
    )


#load the second corpus; an empty folder is only reported, not treated as an error
training_entries = read_text_corpus(TRAIN_DIR)
if len(training_entries) == 0:
    print("WARNING: No files found in Training set.")

#classification of the first corpus
#the prompt lists every text of the original corpus; the answer is printed as received
init_prompt = build_classification_prompt(original_entries)
print("\n=== Sending initial classification prompt to Gemini ===\n")
resp_a = chat.send_message(init_prompt)
print("Gemini predictions (first pass):\n")
print(resp_a.text)

#accuracy computed locally from the parsed answer; the fraction in brackets
#recounts the exact matches so that the raw score is shown next to the ratio
preds_first = parse_line_predictions(resp_a.text)
acc_first = compute_accuracy(preds_first, GROUND_TRUTH)
print(f"\n[Python] First-pass accuracy: {acc_first:.3f} ({sum(preds_first.get(i,'')==GROUND_TRUTH[i] for i in GROUND_TRUTH)}/{len(GROUND_TRUTH)})\n")

#accuracy calculation also by Gemini
#the gold labels are written in the same "<number>. <label>" format requested from the
#model and sent through the same chat session as the first prompt
truth_lines = "\n".join([f"{i}. {lab}" for i, lab in sorted(GROUND_TRUTH.items())])
feedback_prompt = (
    "You are provided with the ground truth for the corpus. Compare it with your predictions and "
    "calculate accuracy as (correct/total). Return only the numeric accuracy and a brief breakdown.\n\n"
    f"Ground truth:\n{truth_lines}"
)
print("=== Asking Gemini to compute accuracy against ground truth ===\n")
resp_b = chat.send_message(feedback_prompt)
#the model's own computation is only printed; it is not parsed or stored
print(resp_b.text)

#second classification: guided prompt for the second corpus
#everything that needs the model's answer (parsing, accuracy, results table) stays inside
#this branch: resp_c only exists when there was something to classify
if training_entries:
    #instructions plus one non-standard letter and one non-standard poem as examples
    guided_prompt = (
        "You are a classifier. You will receive a corpus of texts, which are part\n"
        "of these textual categories: poem, letter.\n\n"
        "Your task is to classify the corpus, assigning each\n"
        "text to its corresponding category.\n"
        "Some letters and poems do not follow the standard structure. I will provide an example\n"
        "of a letter and a poem that have a non standard structure.\n\n"
        "Example 1 (Letter, non-standard):\n"
        "\"Ciascuno stia sottomesso alle autorità costituite; poiché non c'è autorità se non da Dio e quelle che esistono sono stabilite da Dio. Quindi chi si oppone all'autorità, si oppone all'ordine stabilito da Dio. E quelli che si oppongono si attireranno addosso la condanna. I governanti infatti non sono da temere quando si fa il bene, ma quando si fa il male. Vuoi non aver da temere l'autorità? Fa' il bene e ne avrai lode, poiché essa è al servizio di Dio per il tuo bene. Ma se fai il male, allora temi, perché non invano essa porta la spada; è infatti al servizio di Dio per la giusta condanna di chi opera il male. Perciò è necessario stare sottomessi, non solo per timore della punizione, ma anche per ragioni di coscienza. Per questo dunque dovete pagare i tributi, perché quelli che sono dediti a questo compito sono funzionari di Dio. Rendete a ciascuno ciò che gli è dovuto: a chi il tributo, il tributo; a chi le tasse le tasse; a chi il timore il timore; a chi il rispetto, il rispetto.\n\n"
        "Non abbiate alcun debito con nessuno, se non quello di un amore vicendevole; perché chi ama il suo simile ha adempiuto la legge. Infatti il precetto: Non commettere adulterio, non uccidere, non rubare, non desiderare e qualsiasi altro comandamento, si riassume in queste parole: Amerai il prossimo tuo come te stesso. L'amore non fa nessun male al prossimo: pieno compimento della legge è l'amore.\n\n"
        "Questo voi farete, consapevoli del momento: è ormai tempo di svegliarvi dal sonno, perché la nostra salvezza è più vicina ora di quando diventammo credenti. La notte è avanzata, il giorno è vicino. Gettiamo via perciò le opere delle tenebre e indossiamo le armi della luce. Comportiamoci onestamente, come in pieno giorno: non in mezzo a gozzoviglie e ubriachezze, non fra impurità e licenze, non in contese e gelosie. Rivestitevi invece del Signore Gesù Cristo e non seguite la carne nei suoi desideri.\"\n\n"
        "Example 2 (Poem, non-standard):\n"
        "\"Di che reggimento siete\nfratelli?\n\nParola tremante\nnella notte\n\nFoglia appena nata\n\nNell'aria spasimante\ninvolontaria rivolta\ndell'uomo presente alla sua\nfragilità\n\nFratelli\"\n\n"
        "Based on these examples, classify the following corpus. Return *only* one prediction per line in the format:\n"
        "<number>. <predicted_category>\n\n"
    )
    #the standard prompt (header + texts of the Training set) is appended to the guidance
    guided_prompt += build_classification_prompt(training_entries)
    print("\n=== Guided classification on 'Training set' ===\n")
    resp_c = chat.send_message(guided_prompt)
    print("Gemini predictions on Training set:\n")
    print(resp_c.text)

    #parse predictions and compute accuracy for Training set
    preds_training = parse_line_predictions(resp_c.text)
    acc_training = compute_accuracy(preds_training, GROUND_TRUTH_TRAINING)
    print(f"\n[Python] Training-set accuracy: {acc_training:.3f} "
          f"({sum(preds_training.get(i,'')==GROUND_TRUTH_TRAINING[i] for i in GROUND_TRUTH_TRAINING)}/"
          f"{len(GROUND_TRUTH_TRAINING)})\n")

    #build results table for Training set
    result_df_training = df_for_results(training_entries, GROUND_TRUTH_TRAINING, preds_training)
else:
    print("\n[Skipped] No Training set files to classify.\n")
    #nothing was classified: placeholders so that later code can test for it
    #instead of failing on an undefined name
    preds_training = {}
    acc_training = None
    result_df_training = None

#summary
#one table for both corpora; the extra "corpus" column tells the rows apart
result_df_original = df_for_results(original_entries, GROUND_TRUTH, preds_first)
original_tagged = result_df_original.assign(corpus="Original")
combined_df = (
    pd.concat(
        [original_tagged, result_df_training.assign(corpus="Training")],
        ignore_index=True,
    )
    if training_entries
    else original_tagged
)

print("\n=== Tabular summary (head) ===\n")
display(combined_df)

#the same table is saved on Drive, without the DataFrame index
out_csv = "/content/drive/MyDrive/API_classification_training/gemini_classification_summary.csv"
combined_df.to_csv(out_csv, index=False)
print(f"\nSaved CSV to: {out_csv}\n")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.2 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.2 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.2 which is incompatible.[0m[31m
[0mMounted at /content/drive
✅ Corpus and ground truth match (5 texts).

=== Sending initial classification prompt to Gemini ===

Gemini predictions (first pass):

1. letter
2. poem
3. letter
4. poem
5. poem


[Python] First-pass accuracy: 0.800 (4/5)

=== Asking Gemini

Unnamed: 0,index,filename,text,ground_truth,first_pred,correct_first,corpus
0,1,Text_1.txt,Washington. D. C.\nMay 11th /73\nMy Dear Siste...,letter,letter,True,Original
1,2,Text_2.txt,"Diego.\nVerdad es, muy grande que yo no quisie...",letter,poem,False,Original
2,3,Text_3.txt,"Temple, Sep. 28. 1739.\nIf wishes could turn t...",letter,letter,True,Original
3,4,Text_4.txt,"In vain to me the smiling mornings shine,\nAnd...",poem,poem,True,Original
4,5,Text_5.txt,Spesso il male di vivere ho incontrato:\nera i...,poem,poem,True,Original
5,1,Text_1.txt,Mi Diego:\n\nEspejo de la noche.\nTus ojos esp...,letter,poem,False,Training
6,2,Text_2.txt,M'illumino\nd'immenso.,poem,poem,True,Training
7,3,Text_3.txt,"Those hours, that with gentle work did frame\n...",poem,poem,True,Training
8,4,Text_4.txt,Erano i capei d’oro a l’aura sparsi\nche ’n mi...,poem,poem,True,Training
9,5,Text_5.txt,NEULINGGASSE 26/14\n1030 WIEN\nTEL. 73 94 015\...,letter,letter,True,Training



Saved CSV to: /content/drive/MyDrive/API_classification_training/gemini_classification_summary.csv

