In [76]:
import pathway as pw
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import wandb

In [77]:
wandb.init(project="backstory_consistency", name="track_a_run", mode="offline")
print("W&B initialized in offline mode. You can later sync if you want.")

  | |_| | '_ \/ _` / _` |  _/ -_)


[34m[1mwandb[0m: Detected [openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


W&B initialized in offline mode. You can later sync if you want.


In [78]:
with open("The Count of Monte Cristo.txt", "r", encoding="utf-8") as f:
    monte_content = f.read()

with open("In search of the castaways.txt", "r", encoding="utf-8") as f:
    castaways_content = f.read()

In [79]:
class NovelSchema(pw.Schema):
    book: str
    text: str

monte = pw.debug.table_from_rows(
    rows=[("The Count of Monte Cristo", monte_content)],
    schema=NovelSchema
)

castaways = pw.debug.table_from_rows(
    rows=[("In Search of the Castaways", castaways_content)],
    schema=NovelSchema
)

novels = monte.concat(castaways)

Occurred here:
    Line: monte = pw.debug.table_from_rows(
    File: /tmp/ipython-input-2646755783.py:5
Occurred here:
    Line: castaways = pw.debug.table_from_rows(
    File: /tmp/ipython-input-2646755783.py:10
Occurred here:
    Line: novels = monte.concat(castaways)
    File: /tmp/ipython-input-2646755783.py:15


In [82]:
def chunk_text(text, chunk_size=500):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

chunks = []
for book, text in [
    ("The Count of Monte Cristo", monte_content),
    ("In Search of the Castaways", castaways_content),
]:
    for c in chunk_text(text):
        chunks.append({"book": book, "chunk": c})

In [83]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode([c["chunk"] for c in chunks], show_progress_bar=True).astype("float32")

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

In [84]:
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [85]:
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test (1).csv")

In [97]:
train_df.head()

Unnamed: 0,id,book_name,char,caption,content,label,true_label
0,46,In Search of the Castaways,Thalcave,,Thalcave’s people faded as colonists advanced;...,consistent,1
1,137,The Count of Monte Cristo,Faria,The Origin of His Connection with the Count of...,"Suspected again in 1815, he was re-arrested an...",contradict,0
2,74,In Search of the Castaways,Kai-Koumou,,Before each fight he studied the crack-pattern...,consistent,1
3,109,The Count of Monte Cristo,Noirtier,The Complexity of Family and Personal Life,Villefort’s drift toward the royalists disappo...,contradict,0
4,104,The Count of Monte Cristo,Noirtier,Involvement and Turning Point in the French Re...,His parents were targeted in a reprisal for su...,consistent,1


In [98]:
test_df.head()

Unnamed: 0,id,book_name,char,caption,content
0,95,The Count of Monte Cristo,Noirtier,The Fatal Decision of the Hundred Days,Learning that Villefort meant to denounce him ...
1,136,The Count of Monte Cristo,Faria,Escape and Secret Life,From 1800 onward he lived quietly on a small i...
2,59,In Search of the Castaways,Thalcave,,"Posing as a relay-station hand, he slipped cap..."
3,60,In Search of the Castaways,Thalcave,,First rescue: in 1852 an avalanche buried a si...
4,124,The Count of Monte Cristo,Faria,Foreshadowing of Relationships,On the Marseille quay he noticed young Caderou...


In [99]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [100]:
test_df.head()

Unnamed: 0,id,book_name,char,caption,content
0,95,The Count of Monte Cristo,Noirtier,The Fatal Decision of the Hundred Days,Learning that Villefort meant to denounce him ...
1,136,The Count of Monte Cristo,Faria,Escape and Secret Life,From 1800 onward he lived quietly on a small i...
4,124,The Count of Monte Cristo,Faria,Foreshadowing of Relationships,On the Marseille quay he noticed young Caderou...
5,111,The Count of Monte Cristo,Noirtier,Wisdom and Influence in the Post-Revolution Era,Though bodily strength ebbed he still pulled s...
6,135,The Count of Monte Cristo,Faria,Secret Society and Political Struggle,A failed 1796 coup landed him in a Roman priso...


In [86]:
def retrieve_evidence(book_name, query, k=5):
    q_emb = embed_model.encode([query]).astype("float32")
    _, idxs = index.search(q_emb, k)

    evidence = []
    for i in idxs[0]:
        if chunks[i]["book"].lower() in book_name.lower():
            evidence.append(chunks[i]["chunk"])
    return evidence

In [87]:
generator = pipeline("text-generation", model="gpt2-medium")  # Free local model

def judge_consistency(backstory, evidence):
    prompt = f"""
Backstory:
{backstory}

Evidence from novel:
{' '.join(evidence)}

Is the backstory consistent with the novel?
Answer only: Consistent or Contradict.
"""
    output = generator(prompt, max_length=50, do_sample=False)
    answer = output[0]['generated_text'].splitlines()[-1].strip()
    if "consistent" in answer.lower():
        return "Consistent"
    else:
        return "Contradict"

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [88]:
predictions = []

for _, row in test_df.iterrows():
    evidence = retrieve_evidence(row["book_name"], row["content"])
    decision = judge_consistency(row["content"], evidence)
    label = 1 if decision == "Consistent" else 0
    predictions.append(label)
    # Log each row prediction offline to W&B
    wandb.log({"row_id": row["id"], "prediction": decision})

test_df["label"] = predictions
test_df.to_csv("submission.csv", index=False)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take