In [7]:
import ast
import json
import os
from pathlib import Path

import pandas as pd

# Working directory of the kernel at the time this cell runs.
project_root = Path.cwd()

# Single root folder that holds both datasets. It defaults to the original
# machine-specific location, but can be redirected by setting the
# RAVSG_DATA_DIR environment variable before starting the kernel, so the
# notebook can run on another machine without editing this cell.
data_root = Path(
    os.environ.get(
        "RAVSG_DATA_DIR",
        "/home/ashish/Desktop/202418007/RAVSG/backend/data",
    )
)

# HuggingFace dataset: images plus a JSON file of captions.
hf_root = data_root / "HF"
hf_image_dir = hf_root / "images"
hf_caption_file = hf_root / "captions.json"

# Kaggle dataset: images plus a CSV file of captions.
kaggle_root = data_root / "Kaggle"
kaggle_image_dir = kaggle_root / "images"
kaggle_caption_file = kaggle_root / "captions_large.csv"


# Read the HuggingFace captions file; it decodes to a dict that maps an
# image file name to its caption.
with open(hf_caption_file, encoding="utf-8") as caption_fh:
    hf_data = json.load(caption_fh)

# Keep only the entries whose image is actually present on disk, storing the
# full image path as a string next to its caption.
hf_rows = [
    {"image": str(hf_image_dir / name), "caption": text}
    for name, text in hf_data.items()
    if (hf_image_dir / name).exists()
]

df_hf = pd.DataFrame(hf_rows)
print("HuggingFace rows:", len(df_hf))

# Load the Kaggle captions table; the loop below relies on its "image" and
# "caption" columns.
df_kaggle_raw = pd.read_csv(kaggle_caption_file)
kaggle_rows = []

# Only two columns are needed, so iterate them directly instead of building a
# Series per row with iterrows().
for image_repr, caption in zip(df_kaggle_raw["image"], df_kaggle_raw["caption"]):
    # The "image" cell is the text form of a Python dict with a "path" key;
    # literal_eval parses it without executing arbitrary code.
    img_dict = ast.literal_eval(image_repr)
    image_name = Path(Path(img_dict["path"]).name)
    # Fix the extension mismatch: the CSV references .png names while the local
    # copies are looked up as .jpg. Only the final suffix is swapped, so a
    # ".png" occurring elsewhere in the file name is left untouched.
    if image_name.suffix == ".png":
        image_name = image_name.with_suffix(".jpg")
    local_path = kaggle_image_dir / image_name
    # Rows whose image is not present locally are skipped.
    if local_path.exists():
        kaggle_rows.append({
            "image": str(local_path),
            "caption": caption
        })

df_kaggle = pd.DataFrame(kaggle_rows)
print("Kaggle rows:", len(df_kaggle))


# Stack the two per-source frames (HuggingFace first, then Kaggle) and
# renumber the index from zero so the merged rows get one continuous range.
source_frames = [df_hf, df_kaggle]
df = pd.concat(source_frames, ignore_index=True)
print("Total merged rows:", len(df))

# Text preview of the first five merged rows.
preview = df.head()
print(preview)

HuggingFace rows: 100
Kaggle rows: 973
Total merged rows: 1073
                                               image  \
0  /home/ashish/Desktop/202418007/RAVSG/backend/d...   
1  /home/ashish/Desktop/202418007/RAVSG/backend/d...   
2  /home/ashish/Desktop/202418007/RAVSG/backend/d...   
3  /home/ashish/Desktop/202418007/RAVSG/backend/d...   
4  /home/ashish/Desktop/202418007/RAVSG/backend/d...   

                                             caption  
0  The image is a Ghibli-style cartoon of a young...  
1  The image features a character from the movie ...  
2  The image features a young woman with reddish-...  
3  The image is a close-up of Howl from the Studi...  
4  The image is a full-body shot of a young woman...  


In [13]:
import os
import sys

# Absolute path of the parent of the current working directory.
# Note: this rebinds `project_root` as a str (an earlier cell bound it to
# Path.cwd()); it is kept as a str because sys.path entries are strings.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make packages under the project root importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
if project_root not in sys.path:
    sys.path.append(project_root)

# Persist the merged image/caption table without the index column.
merged_csv_path = os.path.join(project_root, "data", "merged_dataset.csv")
df.to_csv(merged_csv_path, index=False)

In [16]:
from core.rag_engine import RAGEngine

# Create the project's RAG engine; it is constructed with no arguments here.
rag = RAGEngine()
# Run the engine's CSV-based build, passing batch_size=64.
# NOTE(review): no CSV path is passed, so the engine presumably reads a default
# location (likely the merged_dataset.csv written earlier in this notebook) —
# confirm against core.rag_engine.
rag.build_from_csv(batch_size=64)

Building RAG: 100%|██████████| 17/17 [00:04<00:00,  3.78it/s]
