In [None]:
!pip install -U \
    llama-index \
    llama-index-llms-gemini \
    google-generativeai \
    pandas \
    tenacity \
    nltk \
    spacy \
    scikit-learn \
    matplotlib \
    seaborn

Collecting llama-index
  Downloading llama_index-0.14.13-py3-none-any.whl.metadata (13 kB)
Collecting llama-index-llms-gemini
  Downloading llama_index_llms_gemini-0.6.2-py3-none-any.whl.metadata (690 bytes)
Collecting pandas
  Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting llama-index-cli<0.6,>=0.5.0 (from llama-index)
  Downloading llama_index_cli-0.

In [None]:
import os
import time
import pandas as pd
from llama_index.llms.gemini import Gemini


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


In [None]:
# Keep the credential out of the notebook file: use the GOOGLE_API_KEY
# environment variable when it is set, otherwise prompt for the key without
# echoing it to the screen.
from getpass import getpass

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

In [None]:
# Publish the key through the process environment under GOOGLE_API_KEY.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
# Disabled alternative, kept for reference.
# NOTE(review): `genai` is not imported in any visible cell.
# genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Build the LLM client; generate_text() sends every prompt through
# `model.complete`.
# NOTE(review): the recorded outputs show a warning pointing at this
# constructor call, and the import cell reports that `google.generativeai`
# is no longer supported — presumably this integration is deprecated; confirm
# and plan a migration.
model = Gemini(
    model="gemini-3-flash-preview",
    api_key=GOOGLE_API_KEY
)

# Printed as soon as the constructor returns.
# NOTE(review): this presumably does not prove the key is valid — confirm.
print("Gemini initialized successfully.")

  model = Gemini(


Gemini initialized successfully.


In [None]:
# Themes that both generation loops iterate over; each string is substituted
# into the prompt templates as {topic} and stored in the "topic" column.
TOPICS = [
    "Individual vs Society",
    "Moral Judgment and Interpretation of Character",
    "Isolation, Withdrawal, and Inner Conflict"
]

# Number of accepted paragraphs each generation loop gathers per topic.
PARAGRAPHS_PER_TOPIC = 20
# Paragraphs requested from the model per call (the prompts' {n} placeholder).
PARAS_PER_CALL = 4

In [None]:
# Template for the "ai_generic" class. Filled in with str.format using
# {n} (paragraphs per call) and {topic}; the model is asked to separate
# paragraphs with the literal marker <PARA>, which split_paragraphs() splits on.
GENERIC_BATCH_PROMPT = """
Write {n} independent paragraphs, each 100–200 words long, on the topic:

"{topic}"

Requirements:
- Neutral literary prose
- No headings or lists
- Separate paragraphs using <PARA>

Output only the paragraphs.
"""

# Template for the "ai_styled" class (target author: Jane Austen). Uses the
# same {n} / {topic} placeholders and the same <PARA> separator as the
# generic template.
AUSTEN_BATCH_PROMPT = """
Write {n} independent paragraphs, each 100–200 words long, on the topic:

"{topic}"

Write in a style inspired by Jane Austen:
- refined sentence structure
- subtle irony
- social observation

Constraints:
- No references to specific works or characters
- Separate paragraphs using <PARA>

Output only the paragraphs.
"""

In [None]:
# Checkpoint CSV for the generic ("Class 2") generation loop.
CLASS2_FILE = "ai_generic_checkpoint.csv"
# Checkpoint CSV for the Austen-styled ("Class 3") generation loop.
CLASS3_FILE = "ai_styled_austen_checkpoint.csv"

In [None]:
import os
import pandas as pd
import time

def load_existing(file):
    """Return the rows stored in the CSV at ``file`` as a list of dicts.

    A path that does not exist yields an empty list, so a first run and a
    resumed run can be handled the same way by the caller.
    """
    if not os.path.exists(file):
        return []
    checkpoint = pd.read_csv(file)
    return checkpoint.to_dict(orient="records")

def save_checkpoint(data, file):
    """Write all rows in ``data`` to ``file`` as CSV and report the row count.

    The file is rewritten in full on every call and no index column is
    emitted.
    """
    frame = pd.DataFrame(data)
    frame.to_csv(file, index=False)
    print(f"[Checkpoint saved → {file}]  Total rows: {len(data)}")

def split_paragraphs(text):
    """Split ``text`` on the ``<PARA>`` marker into trimmed, non-empty pieces."""
    paragraphs = []
    for chunk in text.split("<PARA>"):
        chunk = chunk.strip()
        if chunk:
            paragraphs.append(chunk)
    return paragraphs

def valid_length(text, min_words=100, max_words=200):
    """Tell whether ``text`` has between ``min_words`` and ``max_words`` tokens.

    Tokens are whitespace-separated; both bounds are inclusive.
    """
    n_words = len(text.split())
    if n_words < min_words:
        return False
    return n_words <= max_words

In [None]:
def generate_text(prompt: str) -> str:
    """Send ``prompt`` to the module-level ``model`` and return its text.

    The response's ``text`` attribute is used when it is present and
    non-empty. Otherwise the ``text`` entries of the first candidate's parts
    in ``response.raw`` are joined with newlines.

    Args:
        prompt: Fully formatted prompt to complete.

    Returns:
        The generated text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If neither source yields usable text. When reading
            ``response.raw`` itself failed, that error is attached as
            ``__cause__`` instead of being discarded.
    """
    response = model.complete(prompt)

    text = getattr(response, "text", None)
    if text:
        return text.strip()

    fallback_error = None
    raw = getattr(response, "raw", None)
    if raw:
        try:
            parts = raw["candidates"][0]["content"]["parts"]
            combined = "\n".join(p["text"] for p in parts if "text" in p).strip()
            if combined:
                return combined
        except Exception as exc:
            # The raw payload did not have the expected layout. Keep the
            # error so the RuntimeError below shows why the fallback failed.
            fallback_error = exc

    raise RuntimeError("Gemini returned no usable text") from fallback_error

In [None]:
class2_data = load_existing(CLASS2_FILE)

# Give up on a topic after this many consecutive batches that contributed no
# paragraph of acceptable length. Without a cap, a model that keeps missing
# the word-count window would make the loop below call the API forever.
MAX_EMPTY_BATCHES = 5

for topic in TOPICS:
    # Resume support: rows reloaded from the checkpoint count towards the target.
    collected = sum(1 for r in class2_data if r["topic"] == topic)
    print(f"\n[Class 2] Topic: {topic} | Already collected: {collected}")
    empty_batches = 0

    while collected < PARAGRAPHS_PER_TOPIC:
        prompt = GENERIC_BATCH_PROMPT.format(
            topic=topic,
            n=PARAS_PER_CALL
        )

        try:
            text = generate_text(prompt)
        except Exception as e:
            print("Generation failed, stopping for now:", e)
            # Leaves only this topic's loop; the next topic is still attempted.
            break

        paras = split_paragraphs(text)

        print(f"\n--- Generated batch for '{topic}' ---")

        accepted = 0
        for p in paras:
            if valid_length(p):
                # Show only a preview so the cell output stays readable.
                print("\n[PARA]\n", p[:300], "...\n")
                class2_data.append({
                    "text": p,
                    "class": "ai_generic",
                    "topic": topic,
                    "target_author": None
                })
                collected += 1
                accepted += 1

            if collected >= PARAGRAPHS_PER_TOPIC:
                break

        # Persist after every batch so an interrupted run can resume.
        save_checkpoint(class2_data, CLASS2_FILE)

        if accepted:
            empty_batches = 0
        else:
            empty_batches += 1
            if empty_batches >= MAX_EMPTY_BATCHES:
                print(
                    f"No usable paragraphs in {MAX_EMPTY_BATCHES} consecutive "
                    f"batches for '{topic}', moving on."
                )
                break

        # Pause between API calls.
        time.sleep(5)


[Class 2] Topic: Individual vs Society | Already collected: 20

[Class 2] Topic: Moral Judgment and Interpretation of Character | Already collected: 8

--- Generated batch for 'Moral Judgment and Interpretation of Character' ---

[PARA]
 When assessing the moral fiber of a literary figure, the observer often finds themselves caught between the character’s stated intentions and the tangible consequences of their actions. This discrepancy creates a fertile ground for interpretation, as a single gesture may be viewed as either a noble  ...


[PARA]
 The allure of complex narratives often lies in the deliberate ambiguity of their protagonists, where moral clarity is sacrificed for psychological realism. When an author refuses to provide a clear ethical compass, the burden of interpretation shifts entirely to the audience, transforming the act of ...


[PARA]
 Interpretation of character is inextricably linked to the cultural and temporal context in which a work is received, as the virtues

In [None]:
class3_data = load_existing(CLASS3_FILE)

# Give up on a topic after this many consecutive batches that contributed no
# paragraph of acceptable length. Without a cap, a model that keeps missing
# the word-count window would make the loop below call the API forever.
MAX_EMPTY_BATCHES = 5

for topic in TOPICS:
    # Resume support: rows reloaded from the checkpoint count towards the target.
    collected = sum(1 for r in class3_data if r["topic"] == topic)
    print(f"\n[Class 3 – Austen] Topic: {topic} | Already collected: {collected}")
    empty_batches = 0

    while collected < PARAGRAPHS_PER_TOPIC:
        prompt = AUSTEN_BATCH_PROMPT.format(
            topic=topic,
            n=PARAS_PER_CALL
        )

        try:
            text = generate_text(prompt)
        except Exception as e:
            print("Generation failed, stopping for now:", e)
            # Leaves only this topic's loop; the next topic is still attempted.
            break

        paras = split_paragraphs(text)

        print(f"\n--- Generated Austen batch for '{topic}' ---")

        accepted = 0
        for p in paras:
            if valid_length(p):
                # Show only a preview so the cell output stays readable.
                print("\n[PARA]\n", p[:300], "...\n")
                class3_data.append({
                    "text": p,
                    "class": "ai_styled",
                    "topic": topic,
                    "target_author": "Jane Austen"
                })
                collected += 1
                accepted += 1

            if collected >= PARAGRAPHS_PER_TOPIC:
                break

        # Persist after every batch so an interrupted run can resume.
        save_checkpoint(class3_data, CLASS3_FILE)

        if accepted:
            empty_batches = 0
        else:
            empty_batches += 1
            if empty_batches >= MAX_EMPTY_BATCHES:
                print(
                    f"No usable paragraphs in {MAX_EMPTY_BATCHES} consecutive "
                    f"batches for '{topic}', moving on."
                )
                break

        # Pause between API calls.
        time.sleep(5)


[Class 3 – Austen] Topic: Individual vs Society | Already collected: 0

--- Generated Austen batch for 'Individual vs Society' ---

[PARA]
 The individual who dares to possess a mind of their own often finds it a most inconvenient possession. Society, in its infinite wisdom, prefers a uniform smoothness of character, much like a well-rolled gravel path where no pebble is permitted to stand higher than its neighbor. To have a particular  ...


[PARA]
 There is no tribunal so vigilant, nor so entirely convinced of its own infallibility, as a small circle of one’s acquaintances. A person may imagine their private thoughts to be their own, yet they shall soon discover that every motive has been weighed, measured, and found wanting by the collective  ...


[PARA]
 The struggle between personal inclination and social obligation is a contest in which the latter, being better armed with the weapons of tradition and censure, almost invariably triumphs. We are taught from the cradle that our fi

In [None]:
import pandas as pd
import re
# Reload the Austen-styled checkpoint from disk for post-processing.
# NOTE(review): this literal is the same path as CLASS3_FILE; the later save
# cell writes the result to "ai_generic_final.csv" — confirm which dataset
# this post-processing pass is meant to handle.
df = pd.read_csv("ai_styled_austen_checkpoint.csv")

print("Initial size:", len(df))

Initial size: 60


In [None]:
def clean_text_light(text):
    """Lightly normalise one paragraph of generated text.

    Any leftover ``<PARA>`` marker becomes a space, every run of whitespace
    collapses to a single space, and leading/trailing whitespace is removed.
    """
    without_markers = text.replace("<PARA>", " ")
    return re.sub(r"\s+", " ", without_markers).strip()

# Normalise every paragraph in place (overwrites the "text" column).
df["text"] = df["text"].apply(clean_text_light)

In [None]:
def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

# Attach a whitespace-token count to every row.
word_counts = df["text"].map(word_count)
df["word_count"] = word_counts

# Keep only rows whose count lies in [100, 200]; `between` is inclusive on
# both ends by default.
within_bounds = word_counts.between(100, 200)
df = df[within_bounds]

print("After length filtering:", len(df))

After length filtering: 60


In [None]:
# Sanity checks: number of rows per topic, then summary statistics of the
# per-paragraph word counts.
print(df.groupby("topic").size())
print(df["word_count"].describe())

topic
Individual vs Society                             20
Isolation, Withdrawal, and Inner Conflict         20
Moral Judgment and Interpretation of Character    20
dtype: int64
count     60.000000
mean     169.866667
std        9.204764
min      147.000000
25%      163.000000
50%      170.500000
75%      176.000000
max      188.000000
Name: word_count, dtype: float64


In [None]:
# Drop the helper column so the exported file contains only the original
# checkpoint columns.
df = df.drop(columns=["word_count"])

# NOTE(review): `df` is read from "ai_styled_austen_checkpoint.csv" in an
# earlier cell, yet it is written to a file named "ai_generic_final.csv" —
# confirm the intended output name (the download cell uses the same name).
df.to_csv("ai_generic_final.csv", index=False)

print("Saved ai_generic_final.csv")

Saved ai_generic_final.csv


In [None]:
# Colab-only helper: `google.colab` is available only inside a Colab runtime.
from google.colab import files
# Send the exported CSV to the browser as a download.
files.download("ai_generic_final.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>