## **1. Init**

In [None]:
import json

INPUT_FOLDER = "." # TODO: Fill the input path

train_file = f"{INPUT_FOLDER}/train_data.json" # TODO: Change the train file if needed
train_data = []

# The file is read as JSON Lines: every line is parsed as its own JSON document.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default text encoding (which is not UTF-8 on every OS).
with open(train_file, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_obj = json.loads(line)
            train_data.append(json_obj)
        except json.JSONDecodeError:
            # Best effort: report the unparsable line and keep loading the rest.
            print(f"Skipping invalid JSON: {line.strip()}")

## **2. Pre-processing data**

### **Delete unrelated data**

In [None]:
# Strip the fields that are not used later on. The records are edited in place,
# so `modified_train_data` holds the very same objects as `train_data`.
modified_train_data = []

for record in train_data:
    if 'articles' in record:
        for article in record['articles']:
            for unused_key in ('article_url', 'entity_list', 'caption_modified'):
                if unused_key in article:
                    del article[unused_key]

    if 'maskrcnn_bboxes' in record:
        del record['maskrcnn_bboxes']

    modified_train_data.append(record)

### **Normalize data**

In [None]:
import re

# Remove every character that is not an ASCII letter, a digit, '.' or whitespace
# from each caption (captions are rewritten in place).
for record in modified_train_data:
    if 'articles' not in record:
        continue
    for article in record['articles']:
        if 'caption' in article:
            article['caption'] = re.sub(r'[^a-zA-Z0-9.\s]', '', article['caption'])

### **Remove duplicate data**

In [None]:
# Within each record keep only the first article for every distinct caption.
# Articles that have no 'caption' key are dropped as well.
for record in modified_train_data:
    if 'articles' not in record:
        continue

    seen_captions = set()
    kept_articles = []
    for article in record['articles']:
        if 'caption' not in article:
            continue
        caption_text = article['caption']
        if caption_text in seen_captions:
            continue
        seen_captions.add(caption_text)
        kept_articles.append(article)

    record['articles'] = kept_articles

In [None]:
# One image path per record, in record order.
all_image_path = [record['img_local_path'] for record in modified_train_data]

In [None]:
# all_captions[k] is the list of captions of record k (empty when the record
# has no 'articles' key or none of its articles carries a 'caption').
all_captions = [
    [
        article['caption']
        for article in (record['articles'] if 'articles' in record else ())
        if 'caption' in article
    ]
    for record in modified_train_data
]



### **Create pairs and labels**

In [None]:
# Build [image_path, caption, label] rows plus a parallel list of labels.
#   NOOC: the image together with one of its own captions.
#   OOC:  the image together with a caption taken from the next record
#         (the last record wraps around to the first one).
pairs = []
labels = []

for i, item in enumerate(modified_train_data):
    image_path = item['img_local_path']
    # Captions collected for this record earlier; using all_captions avoids
    # indexing item['articles'] directly, which fails for records without it.
    own_captions = all_captions[i]

    for caption in own_captions:
        pairs.append([image_path, caption, "NOOC"])
        labels.append("NOOC")

    next_index = (i + 1) % len(modified_train_data)
    own_caption_set = set(own_captions)
    for caption in all_captions[next_index]:
        if caption in own_caption_set:
            # This caption was just emitted as NOOC for the same image; labelling
            # it OOC too would produce contradictory rows. This also covers a
            # single-record dataset, where next_index == i.
            continue
        pairs.append([image_path, caption, "OOC"])
        labels.append("OOC")

### **Add index to pairs**

In [None]:
# Prefix every [image_path, caption, label] row with its position in `pairs`.
complete_pairs = [
    [position, entry[0], entry[1], entry[2]]
    for position, entry in enumerate(pairs)
]

In [None]:
def save_file(pair_list, json_file_path):
    """Write ``pair_list`` as JSON to ``json_file_path`` and print a confirmation.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(json_file_path, 'w') as out_handle:
        out_handle.write(json.dumps(pair_list))

    print(f"Saved to {json_file_path}")

In [None]:
# Persist the indexed rows; section 3 reads this file back in.
save_file(pair_list=complete_pairs, json_file_path="pairs.json")

## **3. Generate more captions**

### **Utils**

In [None]:
import json

def read_file(json_file_path):
    """Parse the JSON document stored at ``json_file_path`` and return the result."""
    with open(json_file_path, 'r') as source:
        return json.load(source)

In [None]:
# NOTE(review): same helper as the `save_file` defined in section 2; presumably
# repeated so that section 3 can be run on its own - confirm before removing either.
def save_file(pair_list, json_file_path):
    """Write ``pair_list`` as JSON to ``json_file_path`` and print a confirmation.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(json_file_path, 'w') as out_handle:
        out_handle.write(json.dumps(pair_list))

    print(f"Saved to {json_file_path}")

In [None]:
def remove_duplicate_sublists(input_list):
    """Return the sub-lists of ``input_list`` without duplicates, first occurrence wins.

    Two sub-lists are duplicates when their last three elements are equal;
    whatever precedes those three elements is ignored for the comparison.
    The original order is preserved and the kept sub-lists are not copied.
    """
    first_by_key = {}
    for entry in input_list:
        # setdefault only stores the entry when the key has not been seen yet.
        first_by_key.setdefault(tuple(entry[-3:]), entry)

    return list(first_by_key.values())

In [None]:
!pip install transformers torch

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### **3.1 Generate more NOOC pairs**

#### **ChatGPT paraphraser on T5 base**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the paraphrasing checkpoint (tokenizer + seq2seq model) and move the model to `device`.
# NOTE: the names `tokenizer` and `model` are rebound to GPT-Neo in section 3.2, and
# `paraphrase` looks them up at call time - re-run this cell before calling
# `paraphrase` again once section 3.2 has been executed.
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

In [None]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=2,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level ``tokenizer`` and ``model``.

    The text is prefixed with ``paraphrase: `` and tokenised with
    ``truncation=True`` and ``max_length``; every other argument is forwarded
    unchanged to ``model.generate``.

    Returns:
        The list produced by ``tokenizer.batch_decode`` for the generated
        sequences, decoded with ``skip_special_tokens=True``.
    """
    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    input_ids = encoded.input_ids.to(device)

    generation_options = {
        "temperature": temperature,
        "repetition_penalty": repetition_penalty,
        "num_return_sequences": num_return_sequences,
        "no_repeat_ngram_size": no_repeat_ngram_size,
        "num_beams": num_beams,
        "num_beam_groups": num_beam_groups,
        "max_length": max_length,
        "diversity_penalty": diversity_penalty,
    }
    generated = model.generate(input_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

#### **Implement in pairs**

In [None]:
# Reload the indexed [index, image_path, caption, label] rows saved in section 2.
pairs = read_file(json_file_path="pairs.json")

In [None]:
# Augment the NOOC rows: every NOOC row is kept and followed by one extra row
# per paraphrase of its caption (same index, image and label). OOC rows are
# carried over as they are; rows with any other label are left out.
# NOTE(review): the paraphrases are not passed through the caption
# normalisation regex applied in section 2 - confirm that this is intended.
more_nooc_pairs = []

for item in pairs:
    index, image_path, original_caption, label = item[0], item[1], item[2], item[3]

    if label == 'NOOC':
        more_nooc_pairs.append(item)
        more_nooc_pairs.extend(
            [index, image_path, paraphrased, label]
            for paraphrased in paraphrase(original_caption)
        )
    elif label == 'OOC':
        more_nooc_pairs.append([index, image_path, original_caption, label])

In [None]:
# Drop rows whose (image_path, caption, label) triple already occurred, then save.
non_duplicated_more_nooc_pairs = remove_duplicate_sublists(input_list=more_nooc_pairs)
save_file(pair_list=non_duplicated_more_nooc_pairs, json_file_path="more_nooc_pairs.json")

### **3.2 Generate more OOC pairs**

#### **GPT-Neo: generate a paragraph from each caption**

In [None]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the GPT-Neo 1.3B tokenizer and causal language model and move the model to `device`.
# NOTE: this rebinds the module-level `tokenizer` and `model` that `paraphrase`
# (section 3.1) also reads, so after this cell `paraphrase` would run against
# GPT-Neo unless the paraphraser loading cell is executed again.
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to(device)

In [None]:
def generate_paragraph(prompt, max_length=100, temperature=0.9):
    """Sample a text continuation for ``prompt`` with the module-level ``tokenizer``/``model``.

    Args:
        prompt: Text the generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The first decoded sequence as a plain string. Special tokens such as
        ``<|endoftext|>`` are stripped while decoding, so they cannot leak into
        the sentences extracted from the text afterwards.
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=temperature,
        max_length=max_length,
        # The EOS id is passed as the padding id, as in the original call.
        pad_token_id=tokenizer.eos_token_id,
    )

    # skip_special_tokens=True keeps markers like <|endoftext|> out of the text;
    # without it a later punctuation strip turns them into the word "endoftext".
    return tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]

In [None]:
import re

def preprocess_paragraph(paragraph, min_length=35):
    """Split generated text into cleaned sentences that are long enough to keep.

    Steps:
      1. Remove every character that is not an ASCII letter, a digit, '.' or whitespace.
      2. Split after a period followed by a space and an upper-case letter, and
         at blank lines (a period directly before the blank line is consumed too).
      3. Strip surrounding whitespace and keep sentences with at least
         ``min_length`` characters.

    Args:
        paragraph: Raw text to split.
        min_length: Minimum length of a stripped sentence to be kept.

    Returns:
        List of sentences in their original order.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9.\s]', '', paragraph)
    # `\.?` instead of a bare `.`: the unescaped dot matched ANY character, so a
    # paragraph that did not end in a period lost its last letter at a blank line.
    pieces = re.split(r'\.\ (?=[A-Z])|\.?\n\n', cleaned)
    # Measure the length after stripping so padding whitespace does not count.
    stripped_pieces = (piece.strip() for piece in pieces)
    return [sentence for sentence in stripped_pieces if len(sentence) >= min_length]

#### **Implement in pairs**

In [None]:
# Continue from the rows saved at the end of section 3.1.
pairs = read_file(json_file_path="more_nooc_pairs.json")

In [None]:
# Augment the OOC rows: every OOC row is kept and followed by one extra row per
# sentence extracted from the paragraph generated from its caption (same index,
# image and label). NOOC rows are carried over as they are; rows with any other
# label are left out.
more_ooc_pairs = []

for item in pairs:
    index, image_path, original_caption, label = item[0], item[1], item[2], item[3]

    if label == 'OOC':
        more_ooc_pairs.append(item)

        generated_sentences = preprocess_paragraph(generate_paragraph(original_caption))
        more_ooc_pairs.extend(
            [index, image_path, sentence, label]
            for sentence in generated_sentences
        )
    elif label == 'NOOC':
        more_ooc_pairs.append([index, image_path, original_caption, label])

In [None]:
# Drop rows whose (image_path, caption, label) triple already occurred, then save.
non_duplicated_more_ooc_pairs = remove_duplicate_sublists(input_list=more_ooc_pairs)
save_file(pair_list=non_duplicated_more_ooc_pairs, json_file_path="more_ooc_nooc_pairs.json")