# Animal Verification MVP

This notebook demonstrates a pipeline that:
1. Extracts animal mentions from text using NER (with negation handling)
2. Classifies animals in images
3. Verifies whether the text claim matches the image

The goal is to detect animal claims in user text and check their correctness against the image.

In [None]:
# support imports
import sys
from pathlib import Path

# The notebook's working directory is expected to sit one level below the project
# root, so the parent of the current directory is put on the import path.
project_root = Path().resolve().parent

# Guard the append so that re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

## 1️⃣ Generate training data for the NER model

In [None]:
# Run the NER data-generation script as a shell command (IPython `!` syntax).
# The script path is relative to the notebook's current working directory.
# NOTE(review): presumably this produces the ../data/ner/*.json files loaded in the
# next section — confirm against the script.
# NOTE(review): `python` is resolved through the shell PATH, which may not be the
# kernel's interpreter — confirm the environment.
!python ../utils/generate_ner_data.py

## 2️⃣ Data Exploration (NER)

In [None]:
import json

def _load_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Training and validation splits of the NER dataset
train_data = _load_json("../data/ner/train.json")
val_data = _load_json("../data/ner/val.json")

# Peek at the first training sample
train_data[0]

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

# Sentence Length Histogram
# One entry per training sample: the number of tokens in that sentence.
sentence_lengths = [len(sample["tokens"]) for sample in train_data]

# Unit-width bin edges running from 0 to one past the longest sentence.
unit_bins = range(max(sentence_lengths) + 2)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(sentence_lengths, bins=unit_bins, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Sentence Lengths")
ax.set_xlabel("Number of Tokens")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Bar Chart: Samples per Animal
# Tally every token whose tag is "B-ANIMAL" across the whole training set.
animal_counts = Counter(
    token
    for sample in train_data
    for token, tag in zip(sample["tokens"], sample["ner_tags"])
    if tag == "B-ANIMAL"
)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(
    animal_counts.keys(),
    animal_counts.values(),
    color="lightgreen",
    edgecolor="black",
)
ax.set_title("Number of Samples per Animal")
ax.set_xlabel("Animal")
ax.set_ylabel("Count")
# Tilt the animal names so neighbouring labels do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pie Chart: Sentence Types
# Every training sentence is placed in exactly one bucket:
#   "positive" - contains a "B-ANIMAL" tag and no negation word
#   "negation" - contains a "B-ANIMAL" tag and at least one negation word
#   "negative" - contains no "B-ANIMAL" tag at all
# NOTE(review): matching "n't" only works if the tokenizer emits it as its own
# token — confirm against the data generator.
NEGATION_WORDS = {"not", "no", "never", "n't", "without", "none", "nothing"}

# Colors are bound to category names (and the dict order fixes the wedge order).
# Passing a bare color list alongside Counter insertion order made the
# color-to-category pairing depend on which sentence type happened to come first
# in the data.
TYPE_COLORS = {
    "positive": "lightgreen",
    "negation": "salmon",
    "negative": "lightblue",
}

types_counts = Counter()
for sample in train_data:
    if "B-ANIMAL" not in sample["ner_tags"]:
        types_counts["negative"] += 1
    elif any(word.lower() in NEGATION_WORDS for word in sample["tokens"]):
        types_counts["negation"] += 1
    else:
        types_counts["positive"] += 1

# Only draw categories that actually occur, as the original chart did.
shown_types = [name for name in TYPE_COLORS if types_counts[name] > 0]

plt.figure(figsize=(6, 6))
plt.pie(
    [types_counts[name] for name in shown_types],
    labels=shown_types,
    autopct="%1.1f%%",
    colors=[TYPE_COLORS[name] for name in shown_types],
)
plt.title("Distribution of Sentence Types")
plt.show()

## 3️⃣ Token & Label Visualization

In [None]:
# Print the (token, tag) pairs of the first five training samples, one sample per line
for sample in train_data[:5]:
    token_tag_pairs = list(zip(sample["tokens"], sample["ner_tags"]))
    print(token_tag_pairs)

## 4️⃣ NER Model Training Overview

In [None]:
# Launch the NER training script as a shell command (IPython `!` syntax); the path
# is relative to the notebook's current working directory.
# NOTE(review): presumably it trains on the ../data/ner files explored above and may
# take a while to finish — confirm against the script.
!python ../ner/train.py

## 5️⃣ Image Classification Overview

In [None]:
# Run the vision-data download script as a shell command (IPython `!` syntax); the
# path is relative to the notebook's current working directory.
# NOTE(review): presumably this populates ../data/animals, which the next cell
# browses — confirm against the script.
!python ../utils/download_vision_data.py

In [None]:
import os

from matplotlib import pyplot as plt
import matplotlib.image as mpimg

# Preview one sample image for each of (up to) the first five class folders.
IMAGE_DIR = "../data/animals"

# os.listdir returns entries in arbitrary order, so sort for a reproducible
# preview. Hidden entries and stray files are skipped; only directories count as
# classes.
classes = sorted(
    d
    for d in os.listdir(IMAGE_DIR)
    if not d.startswith(".") and os.path.isdir(os.path.join(IMAGE_DIR, d))
)[:5]  # show first 5 classes
fig, axs = plt.subplots(1, 5, figsize=(15, 3))

# Hide the frame of every panel up front so unused panels stay blank when fewer
# than five classes are available.
for ax in axs:
    ax.axis("off")

for ax, cls in zip(axs, classes):
    cls_path = os.path.join(IMAGE_DIR, cls)
    # Take the alphabetically first visible file so the preview is stable and
    # hidden files such as .DS_Store are never passed to the image reader.
    img_files = sorted(f for f in os.listdir(cls_path) if not f.startswith("."))
    ax.set_title(cls)
    if not img_files:
        # Empty class folder: keep the titled blank panel instead of crashing.
        continue
    img = mpimg.imread(os.path.join(cls_path, img_files[0]))
    ax.imshow(img)
plt.show()

## 6️⃣ Train vision model

In [None]:
# Launch the vision-model training script as a shell command (IPython `!` syntax);
# the path is relative to the notebook's current working directory.
# NOTE(review): presumably it trains on the images under ../data/animals and may
# take a while to finish — confirm against the script.
!python ../vision/train.py

## 7️⃣ Pipeline Demonstration

In [None]:
from pipeline.pipeline import verify_text_image_claim

# Demo inputs as (claim text, image path) pairs; paths are relative to the
# notebook's working directory.
demo_cases = [
    # Positive claim
    ("There is a horse in the picture.", "../test_images/horse/0001.jpeg"),
    # Negated claims
    ("not a dog", "../test_images/cow/0001.jpeg"),
    ("I don't think it's a sheep", "../test_images/dog/0001.jpeg"),
    # Another example
    ("Look, a cat over there!", "../test_images/cat/0001.jpeg"),
]

# A notebook cell only echoes the value of its last expression, so calling the
# function four times in a row would hide the first three results. Print each
# outcome explicitly instead.
# NOTE(review): assumes verify_text_image_claim returns its verdict; if it only
# prints and returns None, drop the result from the message.
for claim_text, image_path in demo_cases:
    result = verify_text_image_claim(claim_text, image_path)
    print(f"{claim_text!r} vs {image_path} -> {result}")