In [55]:
import numpy as np
import pandas as pd
import json
import os

# Merge Questions & Annotations JSON files
We will transform JSON files to CSV & only keep relevant information. Then merge the questions & annotations JSON files for both train & validation.

**NOTE**: Run this section for both train & validation JSON files. Comment the train2014 file paths & uncomment the val2014 file paths to run for validation JSON files.

In [10]:
annotations_file_path = "./data/train/v2_mscoco_train2014_annotations.json"
questions_file_path = "./data/train/v2_OpenEnded_mscoco_train2014_questions.json"
# annotations_file_path = "./data/val/v2_mscoco_val2014_annotations.json"
# questions_file_path = "./data/val/v2_OpenEnded_mscoco_val2014_questions.json"

merge_file_path = "./data/train/merged.csv"
one_word_answers_file_path = "./data/train/one_word_answers.csv"
# merge_file_path = "./data/val/merged.csv"
# one_word_answers_file_path = "./data/val/one_word_answers.csv"

## Transform annotations file

In [11]:
# Load the JSON file
with open(annotations_file_path, "r") as file:
    data = json.load(file)

# Print the loaded data
print(data["annotations"][0])

{'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000}


In [12]:
annotations_df = pd.DataFrame(data["annotations"])

In [13]:
annotations_df.head()

Unnamed: 0,question_type,multiple_choice_answer,answers,image_id,answer_type,question_id
0,what is this,net,"[{'answer': 'net', 'answer_confidence': 'maybe...",458752,other,458752000
1,what,pitcher,"[{'answer': 'pitcher', 'answer_confidence': 'y...",458752,other,458752001
2,what color is the,orange,"[{'answer': 'orange', 'answer_confidence': 'ye...",458752,other,458752002
3,is this,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",458752,yes/no,458752003
4,what color is the,white,"[{'answer': 'white', 'answer_confidence': 'yes...",262146,other,262146000


In [14]:
answers_df = pd.json_normalize(annotations_df["answers"])

# Concatenate the normalized 'answers' DataFrame with the original 'annotations_df' DataFrame
annotations_df = pd.concat([annotations_df, answers_df], axis=1)

# Drop the original 'answers' column
annotations_df = annotations_df.drop("answers", axis=1)

In [15]:
annotations_df.head()

Unnamed: 0,question_type,multiple_choice_answer,image_id,answer_type,question_id,0,1,2,3,4,5,6,7,8,9
0,what is this,net,458752,other,458752000,"{'answer': 'net', 'answer_confidence': 'maybe'...","{'answer': 'net', 'answer_confidence': 'yes', ...","{'answer': 'net', 'answer_confidence': 'yes', ...","{'answer': 'netting', 'answer_confidence': 'ye...","{'answer': 'net', 'answer_confidence': 'yes', ...","{'answer': 'net', 'answer_confidence': 'yes', ...","{'answer': 'mesh', 'answer_confidence': 'maybe...","{'answer': 'net', 'answer_confidence': 'yes', ...","{'answer': 'net', 'answer_confidence': 'yes', ...","{'answer': 'net', 'answer_confidence': 'yes', ..."
1,what,pitcher,458752,other,458752001,"{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'catcher', 'answer_confidence': 'no...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye...","{'answer': 'pitcher', 'answer_confidence': 'ye..."
2,what color is the,orange,458752,other,458752002,"{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'may...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes...","{'answer': 'orange', 'answer_confidence': 'yes..."
3,is this,yes,458752,yes/no,458752003,"{'answer': 'yes', 'answer_confidence': 'yes', ...","{'answer': 'yes', 'answer_confidence': 'yes', ...","{'answer': 'yes', 'answer_confidence': 'maybe'...","{'answer': 'yes', 'answer_confidence': 'yes', ...","{'answer': 'yes', 'answer_confidence': 'maybe'...","{'answer': 'no', 'answer_confidence': 'maybe',...","{'answer': 'yes', 'answer_confidence': 'yes', ...","{'answer': 'yes', 'answer_confidence': 'yes', ...","{'answer': 'yes', 'answer_confidence': 'yes', ...","{'answer': 'yes', 'answer_confidence': 'maybe'..."
4,what color is the,white,262146,other,262146000,"{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'...","{'answer': 'white', 'answer_confidence': 'yes'..."


In [16]:
for i in range(10):
    print(annotations_df.iloc[0][i])

{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}
{'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}
{'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}
{'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}


In [17]:
annotations_df.iloc[0][0]["answer"]

'net'

## Only pick first answer (0th one) from top 10 answers

In [18]:
annotations_df["answer"] = annotations_df.apply(lambda row: row[0]["answer"], axis=1)

## Only keep important columns: `image_id`, `question_id`, `answer`

We will merge `annotations_df` with similar processed `questions_df` to get the final dataframe.

In [19]:
annotations_df = annotations_df[["image_id", "question_id", "answer"]]

In [21]:
annotations_df.head()

Unnamed: 0,image_id,question_id,answer
0,458752,458752000,net
1,458752,458752001,pitcher
2,458752,458752002,orange
3,458752,458752003,yes
4,262146,262146000,white


## Transform questions file

In [22]:
# Load the JSON file
with open(questions_file_path, "r") as file:
    data = json.load(file)

# Print the loaded data
print(data["questions"][0])

{'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000}


In [23]:
questions_df = pd.DataFrame(data["questions"])

In [24]:
questions_df.head()

Unnamed: 0,image_id,question,question_id
0,458752,What is this photo taken looking through?,458752000
1,458752,What position is this man playing?,458752001
2,458752,What color is the players shirt?,458752002
3,458752,Is this man a professional baseball player?,458752003
4,262146,What color is the snow?,262146000


In [25]:
questions_df.drop("image_id", axis=1, inplace=True)

In [26]:
questions_df.head()

Unnamed: 0,question,question_id
0,What is this photo taken looking through?,458752000
1,What position is this man playing?,458752001
2,What color is the players shirt?,458752002
3,Is this man a professional baseball player?,458752003
4,What color is the snow?,262146000


## Merge `annotations_df` & `questions_df` on `question_id`

In [27]:
len(annotations_df), len(questions_df)

(443757, 443757)

In [28]:
merged_df = pd.merge(annotations_df, questions_df, on="question_id")

In [29]:
merged_df.head()

Unnamed: 0,image_id,question_id,answer,question
0,458752,458752000,net,What is this photo taken looking through?
1,458752,458752001,pitcher,What position is this man playing?
2,458752,458752002,orange,What color is the players shirt?
3,458752,458752003,yes,Is this man a professional baseball player?
4,262146,262146000,white,What color is the snow?


## Save `merged.csv` file

In [30]:
merged_df.to_csv(merge_file_path, index=False)

# Understanding the question & answers

In [31]:
def is_answer_one_word(answer):
    return len(answer.split()) == 1

## Only some 40,000 questions in train & 20,000 questions in eval have more than one word answers
This is really helpful as our simple BERT + ViT model is able to do classification on some top 1000 answers which are one word. It is not a generative model.

In [33]:
count_one_word = merged_df["answer"].apply(is_answer_one_word).sum()
print("Total number of questions:", len(merged_df))
print("Number of questions with one-word answers:", count_one_word)

Total number of questions: 443757
Number of questions with one-word answers: 397022


# Filter our data from `merged_df` to only keep questions with one word answers

In [34]:
one_word_df = merged_df[merged_df["answer"].apply(is_answer_one_word)]

## Save `one_word_answers.csv` file

In [35]:
one_word_df.to_csv(one_word_answers_file_path, index=False)

# Keep top 1000 answers

As discussed earlier, our BERT + ViT is a simple model that is not generative, and can only do classification on some top 1000 answers. 

**NOTE**: We will generate the answer space from the `one_word_answers.csv` file in a later section: **Generate Answer Space**

In [36]:
train_one_word_answers_file_path = "./data/train/one_word_answers.csv"
val_one_word_answers_file_path = "./data/val/one_word_answers.csv"

filtered_train_file_path = "./data/train/filtered.csv"
filtered_val_file_path = "./data/val/filtered.csv"

In [37]:
train_df = pd.read_csv(train_one_word_answers_file_path)
val_df = pd.read_csv(val_one_word_answers_file_path)

In [38]:
train_df.head()

Unnamed: 0,image_id,question_id,answer,question
0,458752,458752000,net,What is this photo taken looking through?
1,458752,458752001,pitcher,What position is this man playing?
2,458752,458752002,orange,What color is the players shirt?
3,458752,458752003,yes,Is this man a professional baseball player?
4,262146,262146000,white,What color is the snow?


In [39]:
val_df.head()

Unnamed: 0,image_id,question_id,answer,question
0,262148,262148000,down,Where is he looking?
1,262148,262148001,spectating,What are the people in the background doing?
2,262148,262148002,table,What is he on top of?
3,393225,393225000,foodiebakercom,What website copyrighted the picture?
4,393225,393225001,no,Is this a creamy soup?


## Use a dict to store answers & their counts

**NOTE**: we have already preprocessed the answers, so we are sure that we will only get one word answers in lowercase.

In [40]:
answers = dict()

In [41]:
for answer in train_df["answer"]:
    answers[answer] = answers.get(answer, 0) + 1

for answer in val_df["answer"]:
    answers[answer] = answers.get(answer, 0) + 1

In [45]:
print("Total number of answers:", len(train_df) + len(val_df))
print("Total number of unique answers:", len(answers.keys()))

Total number of answers: 588250
Total number of unique answers: 15733


## Get the top 1000 answers

In [46]:
top_answers = sorted(answers.items(), key=lambda x: x[1], reverse=True)[:1000]
top_answers = [answer for answer, _ in top_answers]

In [49]:
print("Top 10th answer:", top_answers[10])
print("Top 100th answer:", top_answers[100])

Top 10th answer: 4
Top 100th answer: bird


In [50]:
print("Top 10th answer frequency:", answers[top_answers[10]])
print("Top 100th answer frequency:", answers[top_answers[100]])

Top 10th answer frequency: 6417
Top 100th answer frequency: 455


## Filter our `train_df` & `val_df` to only keep questions with answers in the top 1000 answers

In [51]:
print("Shape of train_df before filtering:", train_df.shape)
print("Shape of val_df before filtering:", val_df.shape)

Shape of train_df before filtering: (397022, 4)
Shape of val_df before filtering: (191228, 4)


In [52]:
train_df = train_df[train_df["answer"].isin(top_answers)]
val_df = val_df[val_df["answer"].isin(top_answers)]

In [53]:
print("Shape of train_df after filtering:", train_df.shape)
print("Shape of val_df after filtering:", val_df.shape)

Shape of train_df after filtering: (361704, 4)
Shape of val_df after filtering: (173782, 4)


## Save `filtered.csv` files for train & val

In [54]:
train_df.to_csv(filtered_train_file_path, index=False)
val_df.to_csv(filtered_val_file_path, index=False)

# Sample Data to only use 25% of total data

**NOTE**: Run this section for both train & validation filtered.csv files. Comment the train file paths & uncomment the val file paths to run for validation csv files.

In [56]:
filtered_file_path = "./data/train/filtered.csv"
trimmed_file_path = "./data/train/trimmed.csv"
files_path = "./data/train/train2014"
rest_images_path = "./data/train/rest_images"

# filtered_file_path = "./data/val/filtered.csv"
# trimmed_file_path = "./data/val/trimmed.csv"
# files_path = "./data/val/val2014"
# rest_images_path = "./data/val/rest_images"

In [57]:
df = pd.read_csv(filtered_file_path)

In [58]:
df.head()

Unnamed: 0,image_id,question_id,answer,question
0,458752,458752000,net,What is this photo taken looking through?
1,458752,458752001,pitcher,What position is this man playing?
2,458752,458752002,orange,What color is the players shirt?
3,458752,458752003,yes,Is this man a professional baseball player?
4,262146,262146000,white,What color is the snow?


In [59]:
image_ids = df["image_id"].unique()

In [61]:
print("Total number of questions:", len(df))
print("Number of unique images corresponding to those questions:", len(image_ids))

Total number of questions: 361704
Number of unique images corresponding to those questions: 82329


## Shuffle data on `image_id` & pick first 25% of data

$Method_1$: 
- We could randomly shuffle the original data & pick the first 25% of data. However, we might end up with more questions of one type & less of another type. 
- Example: We might end up with more questions related to `yes/no` type than `what/how` type.
- Then, our model might not be able to generalize well on the validation set.

$Method_2$:
- We randomly shuffle list of `image_id` & pick first 25% of `image_ids`.
- We know that each `image_id` has almost equal number of questions of different types.
- In this way, we can ensure that we have an equal mix of different types of questions in our training data.


**NOTE**: We will use $Method_2$ to sample data.

In [62]:
np.random.shuffle(image_ids)

In [63]:
new_data_size = int(len(image_ids) * 0.25)
print("Total images:", len(image_ids))
print("New sampled data size:", new_data_size)

Total images: 82329
New sampled data size: 20582


In [64]:
image_ids = set(image_ids[:new_data_size])

## Trim our data to only keep questions with answers in the sampled `image_ids`

In [65]:
print(f"Original data size: {len(df)}")
df = df[df["image_id"].isin(image_ids)]
print(f"New data size: {len(df)}")

Original data size: 361704
New data size: 90679


## Save `trimmed.csv` file

In [66]:
df.to_csv(trimmed_file_path, index=False)

# Generate Answer Space

In [67]:
trimmed_file_path_train = "./data/train/trimmed.csv"
trimmed_file_path_val = "./data/val/trimmed.csv"

ans_space_file_path = "./data/answer_space.txt"

In [68]:
train_df = pd.read_csv(trimmed_file_path_train)
val_df = pd.read_csv(trimmed_file_path_val)

In [69]:
train_df.head()

Unnamed: 0,image_id,question_id,answer,question
0,458752,458752000,net,What is this photo taken looking through?
1,458752,458752001,pitcher,What position is this man playing?
2,458752,458752002,orange,What color is the players shirt?
3,458752,458752003,yes,Is this man a professional baseball player?
4,262146,262146000,white,What color is the snow?


In [70]:
val_df.head()

Unnamed: 0,image_id,question_id,answer,question
0,393243,393243000,yes,Will this kid leave the powdered sugar on his ...
1,393243,393243001,donut,What is the child eating?
2,393243,393243002,no,Is this person wearing a tie?
3,393243,393243003,blonde,What color is the kids hair?
4,131108,131108000,no,Is this photo color?


## Use both train & val trimmed csv files to generate answer space

**NOTE**: answers can be common in both train & val datasets. So, we will generate answer space from both datasets by storing them in a set first.

In [71]:
answers = set()

In [72]:
for answer in train_df["answer"]:
    answers.add(answer)

for answer in val_df["answer"]:
    answers.add(answer)

In [73]:
with open(ans_space_file_path, "w") as f:
    for answer in answers:
        f.write(str(answer) + "\n")

print("Total number of unique answers:", len(answers))

Total number of unique answers: 1000


# Data Cleaning

In [76]:
train = "./data/train/trimmed.csv"
val = "./data/val/trimmed.csv"

In [81]:
train_df = pd.read_csv(train)
val_df = pd.read_csv(val)

## Remove NULL entries

In [82]:
print("Number of questions in train with no answers:", train_df["answer"].isnull().sum())
print("Number of questions in val with no answers:", val_df["answer"].isnull().sum())

Number of questions in train with no answers: 6
Number of questions in val with no answers: 2


In [83]:
print("Before removing None entries in train_df & val_df", train_df.shape, val_df.shape)
train_df = train_df.dropna(subset=["answer"])
val_df = val_df.dropna(subset=["answer"])
print("After removing None entries in train_df & val_df", train_df.shape, val_df.shape)

Before removing None entries in train_df & val_df (90679, 4) (43285, 4)
After removing None entries in train_df & val_df (90673, 4) (43283, 4)


## Save updated files

In [None]:
train_df.to_csv(train, index=False)
val_df.to_csv(val, index=False)