## Dataset Processing

In [1]:
from pathlib import Path
import re
import gdown
import os

# Download chapter 7 of Frankenstein from Google Drive
file_id = "1KW7bLHUFZKKbhr0Cqj8SfSoGszRfCF20"
output = "ch7.txt"

if os.path.exists(output):
    print("Text file already exists — skipping download.")
else:
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        output,
        quiet=False
    )
    print(f"Downloaded text file to: {output}")

# Load chapter 7 of Frankenstein
text = Path("ch7.txt").read_text(encoding="utf-8")

# Function to chunk text into overlapping segments
def chunk_text(text, size=800, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + size
        chunks.append(text[start:end])
        start += size - overlap
    return chunks

# Create chunks and output length of chunks
chunks = chunk_text(text)
len(chunks)


Text file already exists — skipping download.


34

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Generate embeddings for each chunk (preprocessing step for chatbot)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
chunk_embeddings.shape


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

(34, 384)

In [None]:
import faiss

# Build FAISS index (preprocessing step for chatbot)
# Adds relevant chunks based on user query
dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(chunk_embeddings)


In [None]:
# Generation step 1: Compress relevant chunks into a summary (format for Gemma model input)
def compress_chunks(model, tokenizer, chunk_list):
    joined = "\n\n".join(chunk_list)

    prompt = (
        "Summarize the following text. Preserve important characters, events, "
        "and plot details that may help answer a future question.\n\n"
        f"{joined}\n\nSummary:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cpu")
    # Generate summary with chosen hyperparameters
    output = model.generate(
        **inputs,
        max_new_tokens=800,
        temperature=0.2
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


## Model Construction

In [43]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
from huggingface_hub import whoami
print(whoami())


{'type': 'user', 'id': '691de23843ce34b543fd62e2', 'name': 'Thejass', 'fullname': 'Thejaswin', 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/oq5hRpUZgGxbvmHnSGgs1.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'frankensteinChatbot', 'role': 'fineGrained', 'createdAt': '2025-11-21T20:15:53.956Z', 'fineGrained': {'canReadGatedRepos': True, 'global': [], 'scoped': [{'entity': {'_id': '669650bb11dbbf600cf4dcf0', 'type': 'model', 'name': 'google/gemma-2-2b-it'}, 'permissions': ['repo.content.read']}, {'entity': {'_id': '691de23843ce34b543fd62e2', 'type': 'user', 'name': 'Thejass'}, 'permissions': ['repo.content.read']}]}}}}


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load Gemma model and tokenizer
model_name = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="cpu"
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# def retrieve(query, k=3):
#     q_emb = embedder.encode([query], convert_to_numpy=True)
#     distances, indices = index.search(q_emb, k)
#     return [chunks[i] for i in indices[0]]
# ^^ part of hyperparameter tuning ---

# A new version of retrieve() than allows for larger tokens:
def retrieve_large(query, k=50):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(q_emb, k)
    return [chunks[i] for i in indices[0]]


In [None]:
def generate_answer(query):
    # Get chunks relevant to the query
    retrieved_chunks = retrieve_large(query, k=50)

    # Compress the chunks for model input
    compressed_context = compress_chunks(model, tokenizer, retrieved_chunks)

    # Give the model the compressed context and the user query
    prompt = (
        "You are a helpful assistant answering questions about Mary Shelley's *Frankenstein*.\n"
        "Use ONLY the following context:\n\n"
        f"{compressed_context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cpu")
    # hyperparameter tuning on model generation ---
    output = model.generate(
        **inputs,
        max_new_tokens=400,
        temperature=0.4
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)



### Example Dialog:
Question: What kind of emotions were expressed in these letters?
Answer: The letters express a range of emotions, including:

* **Grief and sorrow:** Victor is deeply saddened by the loss of his brother and the injustice done to Justine.
* **Anger and resentment:** Victor is angry at the murderer and at the world for the suffering he has caused.
* **Fear and anxiety:** Victor is constantly haunted by the memory of his creation and the fear of its potential for destruction.
* **Hope and determination:** Victor is determined to clear Justine's name and expose the true killer.
* **Guilt and remorse:** Victor feels guilty for creating the monster and for the suffering it has caused.


In [None]:
def chat():
    print("Frankenstein Chatbot — type 'exit' to quit\n")

    while True:
        user = input("You: ")
        if user.lower() in {"exit", "quit"}:
            break

        answer = generate_answer(user)
        print("\nBot:", answer, "\n")
    
    return answer

# Call chatbot and store last answer for evaluation
candidate_summary = chat()

Frankenstein Chatbot — type 'exit' to quit



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Bot: You are a helpful assistant answering questions about Mary Shelley's *Frankenstein*.
Use ONLY the following context:

Summarize the following text. Preserve important characters, events, and plot details that may help answer a future question.

of peace and gentleness, that will heal, instead of
festering, the wounds of our minds. Enter the house of mourning, my
friend, but with kindness and affection for those who love you, and not
with hatred for your enemies.

“Your affectionate and afflicted father,

“Alphonse Frankenstein.



“Geneva, May 12th, 17—.”



Clerval, who had watched my countenance as I read this letter, was
surprised to observe the despair that succeeded the joy I at first
expressed on receiving news from my friends. I threw the letter on the
table, and covered my face with my hands.

“My dear Frankenstein,” exclaimed Henry, when he perceived me
weep with bitterness, “are you always to be unhappy? My dear friend,
what has happened?”

I motioned him to take up the

### Question 1: What kind of emotions were expressed in these letters?
Reference Answer 1: The letters Victor receives from his father are filled with grief, shock, and deep sorrow. Alphonse Frankenstein writes with a tone of devastation after the murder of William, and his words express confusion, despair, and anguish over the tragedy that has struck their family. The letter also conveys a sense of urgency and emotional collapse, as the family is overwhelmed by mourning and disbelief at the loss of such a young and beloved child. Overall, the emotions expressed in the letters are intense sadness, heartbreak, and a desperate longing for comfort during a moment of profound family crisis.

### Question 2: Who was accused of William’s murder?
Reference Answer 2: Justine Moritz, the family’s gentle and loyal servant, is accused of murdering William. Evidence seems to point toward her when a locket belonging to William is found in her pocket. However, it is understood that the reader and Victor know she is innocent.

In [None]:
from rouge_score import rouge_scorer

# Evaluate the model's answer against the reference summary
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

reference_summary = "The letters Victor receives from his father are filled with grief, shock, and deep sorrow. Alphonse Frankenstein writes with a tone of devastation after the murder of William, and his words express confusion, despair, and anguish over the tragedy that has struck their family. The letter also conveys a sense of urgency and emotional collapse, as the family is overwhelmed by mourning and disbelief at the loss of such a young and beloved child. Overall, the emotions expressed in the letters are intense sadness, heartbreak, and a desperate longing for comfort during a moment of profound family crisis."
scores = scorer.score(reference_summary, candidate_summary)
for key in scores:
    print(f'{key}: {scores[key]}')