In [10]:
import os
import json

# Directory layout: raw VQA JSON lives under <data_dir>/Text and the trimmed
# copies are written to <data_dir>/Preprocessed.
data_dir = 'vqa_data'
text_dir, preprocessed_dir = (
    os.path.join(data_dir, sub_dir) for sub_dir in ('Text', 'Preprocessed')
)

# Make sure the output folder is present before anything is written into it.
os.makedirs(preprocessed_dir, exist_ok=True)

def _extract_vqa_fields(data):
    """Return the trimmed payload for one loaded VQA JSON object.

    A top-level 'questions' key yields ``{'questions': [...]}`` with only
    ``question_id``, ``image_id`` and ``question`` kept per entry.  Otherwise
    a top-level 'annotations' key yields ``{'annotations': [...]}`` with only
    ``question_id``, ``image_id`` and ``answers`` kept.  Returns ``None`` when
    neither key is present.
    """
    if 'questions' in data:
        return {
            'questions': [
                {
                    'question_id': item['question_id'],
                    'image_id': item['image_id'],
                    'question': item['question'],
                }
                for item in data['questions']
            ]
        }
    if 'annotations' in data:
        return {
            'annotations': [
                {
                    'question_id': item['question_id'],
                    'image_id': item['image_id'],
                    'answers': item['answers'],
                }
                for item in data['annotations']
            ]
        }
    return None


def preprocess_vqa_data(input_dir, output_dir):
    """
    Preprocess VQA data by loading JSON files, extracting relevant information,
    and saving processed data.

    Every ``*.json`` file directly inside ``input_dir`` is loaded, reduced to
    the fields selected by ``_extract_vqa_fields`` and written to
    ``output_dir`` as ``preprocessed_<original name>``.  Files are visited in
    sorted name order so runs are reproducible.

    Args:
        input_dir: Directory containing the raw JSON files.
        output_dir: Directory receiving the preprocessed files; created if it
            does not exist yet.

    Returns:
        None. Progress and errors are reported with ``print``; the function is
        best-effort and does not raise for bad input files.
    """
    # Directory-level problems (missing input dir, unwritable output location)
    # make the whole run impossible, so report once and stop.
    try:
        os.makedirs(output_dir, exist_ok=True)
        file_names = sorted(os.listdir(input_dir))
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return

    for file_name in file_names:
        if not file_name.endswith('.json'):
            print(f"Skipping non-JSON file: {file_name}")
            continue

        file_path = os.path.join(input_dir, file_name)
        print(f"Processing {file_name}...")

        # Errors are isolated per file: one malformed or unreadable file must
        # not prevent the remaining files from being processed.
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            preprocessed_data = _extract_vqa_fields(data)
            if preprocessed_data is None:
                print(f"Unknown structure in {file_name}, skipping...")
                continue

            output_file_path = os.path.join(output_dir, f"preprocessed_{file_name}")
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                json.dump(preprocessed_data, output_file, indent=2)
            print(f"Saved preprocessed data to {output_file_path}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Run the preprocessing pass: raw JSON in text_dir -> trimmed copies in preprocessed_dir.
preprocess_vqa_data(input_dir=text_dir, output_dir=preprocessed_dir)


Processing v2_mscoco_train2014_annotations.json...
Saved preprocessed data to vqa_data/Preprocessed/preprocessed_v2_mscoco_train2014_annotations.json
Processing v2_OpenEnded_mscoco_test-dev2015_questions.json...
Saved preprocessed data to vqa_data/Preprocessed/preprocessed_v2_OpenEnded_mscoco_test-dev2015_questions.json
Processing v2_OpenEnded_mscoco_test2015_questions.json...
Saved preprocessed data to vqa_data/Preprocessed/preprocessed_v2_OpenEnded_mscoco_test2015_questions.json
Processing v2_mscoco_val2014_annotations.json...
Saved preprocessed data to vqa_data/Preprocessed/preprocessed_v2_mscoco_val2014_annotations.json
Processing v2_OpenEnded_mscoco_val2014_questions.json...
Saved preprocessed data to vqa_data/Preprocessed/preprocessed_v2_OpenEnded_mscoco_val2014_questions.json
Processing v2_OpenEnded_mscoco_train2014_questions.json...
Saved preprocessed data to vqa_data/Preprocessed/preprocessed_v2_OpenEnded_mscoco_train2014_questions.json
