In [3]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-win_amd64.whl (11.6 MB)
Collecting pytz>=2020.1
  Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Collecting numpy>=1.22.4
  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
Collecting tzdata>=2022.7
  Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.2.2 pytz-2024.1 tzdata-2024.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\P.Ramsai Koushik\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


# Dataset Preprocessing

In [6]:
import os
import pandas as pd

### Create a dataframe to store the metadata of train images

Create a dataframe with image_name, image_id, image_path, hashed_image_id(this is used for random sampling)

In [30]:
# Path to the folder containing the image files
folder_path = "train2014"

# Get a list of all files in the folder
files = os.listdir(folder_path)

# Filter out only the image files
image_files = [file for file in files if file.endswith('.jpg')]

# Extract the image number from the file name
image_numbers = [int(file.split('_')[-1].split('.')[0]) for file in image_files]

# Create a DataFrame
df = pd.DataFrame({
    'image_name': image_files,
    'image_path': [os.path.join(folder_path, file) for file in image_files],
    'image_id': image_numbers
})

# Sort the DataFrame by numbers
df_sorted = df.sort_values(by='image_id').reset_index(drop=True)

# Create a mapping dictionary
mapping = {number: index + 1 for index, number in enumerate(df_sorted['image_id'])}

def custom_hash(x):
    return mapping[x]

df['hashed_image_id'] = df['image_id'].apply(custom_hash)

# Sort the DataFrame by hashed number
df.sort_values(by='hashed_image_id', inplace=True)

# Reset index to make sure it's sequential
df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(df.head())

                        image_name                                 image_path  \
0  COCO_train2014_000000000009.jpg  train2014\COCO_train2014_000000000009.jpg   
1  COCO_train2014_000000000025.jpg  train2014\COCO_train2014_000000000025.jpg   
2  COCO_train2014_000000000030.jpg  train2014\COCO_train2014_000000000030.jpg   
3  COCO_train2014_000000000034.jpg  train2014\COCO_train2014_000000000034.jpg   
4  COCO_train2014_000000000036.jpg  train2014\COCO_train2014_000000000036.jpg   

   image_id  hashed_image_id  
0         9                1  
1        25                2  
2        30                3  
3        34                4  
4        36                5  


In [31]:
print(max(df['hashed_image_id']))

82783


In [61]:
df.to_csv('train_images.csv',index=False)

In [None]:
#### C

#### Random Sampling of dataset (tain images)

Create 20,695(25 percent of 82,783) random indices in between 1 and 82,783

In [32]:
import random

random.seed(42) # ensures that we get the same random indices everytime we run the notebook

# Number of indices
num_indices = len(df)//4

# Range of numbers
num_range = df['hashed_image_id'].max()

# Generate a list of random indices
random_indices = random.sample(range(1, num_range + 1), num_indices)

# Create a new DataFrame containing rows where the hashed number is present in the random indices list
sampled_df = df[df['hashed_image_id'].isin(random_indices)].reset_index(drop=True)

print(sampled_df.head())

                        image_name                                 image_path  \
0  COCO_train2014_000000000025.jpg  train2014\COCO_train2014_000000000025.jpg   
1  COCO_train2014_000000000036.jpg  train2014\COCO_train2014_000000000036.jpg   
2  COCO_train2014_000000000064.jpg  train2014\COCO_train2014_000000000064.jpg   
3  COCO_train2014_000000000078.jpg  train2014\COCO_train2014_000000000078.jpg   
4  COCO_train2014_000000000081.jpg  train2014\COCO_train2014_000000000081.jpg   

   image_id  hashed_image_id  
0        25                2  
1        36                5  
2        64                8  
3        78               12  
4        81               13  


In [33]:
print(len(sampled_df), min(sampled_df['hashed_image_id']),max(sampled_df['hashed_image_id']))

20695 2 82782


In [60]:
sampled_df.to_csv('sampled_train_images.csv',index=False)

### Store the questions in csv format 

In [47]:
import json
# Read the JSON file
with open('v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json', 'r') as f:
    questions_data = json.load(f)

In [53]:
questions_data['questions']

[{'image_id': 458752,
  'question': 'What is this photo taken looking through?',
  'question_id': 458752000},
 {'image_id': 458752,
  'question': 'What position is this man playing?',
  'question_id': 458752001},
 {'image_id': 458752,
  'question': 'What color is the players shirt?',
  'question_id': 458752002},
 {'image_id': 458752,
  'question': 'Is this man a professional baseball player?',
  'question_id': 458752003},
 {'image_id': 262146,
  'question': 'What color is the snow?',
  'question_id': 262146000},
 {'image_id': 262146,
  'question': 'What is the person doing?',
  'question_id': 262146001},
 {'image_id': 262146,
  'question': 'What color is the persons headwear?',
  'question_id': 262146002},
 {'image_id': 524291,
  'question': "What is in the person's hand?",
  'question_id': 524291000},
 {'image_id': 524291,
  'question': 'Is the dog waiting?',
  'question_id': 524291001},
 {'image_id': 524291,
  'question': 'Is the dog looking at a tennis ball or frisbee?',
  'question

In [36]:
# Extract the 'questions' parameter
questions_data = questions_data['questions']

# Create a DataFrame from the 'questions' data
questions_df = pd.DataFrame(questions_data)
questions_df.sort_values(by='image_id',inplace=True)

In [66]:
questions_df.to_csv('train_questions.csv',index=False)

Filter out the rows to store only the questions corresponding to the images in the sampled dataset

In [37]:
sampled_questions_df  = questions_df[questions_df['image_id'].apply(custom_hash).isin(random_indices)].reset_index(drop=True)

In [64]:
sampled_questions_df.to_csv('sampled_train_questions.csv',index=False)

### Store the answers in CSV format

In [40]:
# Read the JSON file
with open('v2_Annotations_Train_mscoco//v2_mscoco_train2014_annotations.json', 'r') as f:
    annotations_data = json.load(f)

In [54]:
annotations_data

[{'question_type': 'what is this',
  'multiple_choice_answer': 'net',
  'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3},
   {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6},
   {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9},
   {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}],
  'image_id': 458752,
  'answer_type': 'other',
  'question_id': 458752000},
 {'question_type': 'what',
  'multiple_choice_answer': 'pitcher',
  'answers': [{'answer': 'pitcher',
    'answer_confidence': 'yes',
    'answer_id': 1},
   {'answer': 'c

In [69]:
# Create a DataFrame from the 'annotaions' data
annotations_df = pd.DataFrame(annotations_data)
annotations_df.sort_values(by='image_id',inplace=True)

In [70]:
annotations_df.head()

Unnamed: 0,question_type,multiple_choice_answer,answers,image_id,answer_type,question_id
903,what is the,broccoli,"[{'answer': 'broccoli', 'answer_confidence': '...",9,other,9002
902,what color are the,pink and yellow,"[{'answer': 'pink and yellow', 'answer_confide...",9,other,9001
901,how many,2,"[{'answer': '2', 'answer_confidence': 'yes', '...",9,number,9000
95,is the,no,"[{'answer': 'no', 'answer_confidence': 'yes', ...",25,yes/no,25013
94,are,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",25,yes/no,25012


In [71]:
annotations_df.columns

Index(['question_type', 'multiple_choice_answer', 'answers', 'image_id',
       'answer_type', 'question_id'],
      dtype='object')

In [67]:
annotations_df.to_csv('train_annotations.csv',index=False)

Filter out the rows to store only the questions corresponding to the images in the sampled dataset

In [76]:
sampled_annotations_df = annotations_df[annotations_df['image_id'].apply(custom_hash).isin(random_indices)].reset_index(drop=True)

In [77]:
sampled_annotations_df.to_csv('sampled_train_annotaions_df.csv',index=False)

In [79]:
print(len(sampled_annotations_df),len(sampled_questions_df),len(df))

109972 109972 82783


In [81]:
sampled_df.columns , sampled_annotations_df.columns, sampled_questions_df.columns

(Index(['image_name', 'image_path', 'image_id', 'hashed_image_id'], dtype='object'),
 Index(['question_type', 'multiple_choice_answer', 'answers', 'image_id',
        'answer_type', 'question_id'],
       dtype='object'),
 Index(['image_id', 'question', 'question_id'], dtype='object'))

In [85]:
sampled_questions_annotations_df = pd.merge(sampled_questions_df, sampled_annotations_df, on=['question_id','image_id'])

In [89]:
sampled_questions_annotations_df.columns, len(sampled_questions_annotations_df)

(Index(['image_id', 'question', 'question_id', 'question_type',
        'multiple_choice_answer', 'answers', 'answer_type'],
       dtype='object'),
 109972)

In [88]:
sampled_questions_annotations_df.to_csv('sampled_train_questions_annotations.csv',index=False)

In [90]:
sampled_train_dataset_df = pd.merge(sampled_questions_annotations_df,sampled_df,on='image_id')

In [92]:
sampled_train_dataset_df.columns, len(sampled_train_dataset_df)

(Index(['image_id', 'question', 'question_id', 'question_type',
        'multiple_choice_answer', 'answers', 'answer_type', 'image_name',
        'image_path', 'hashed_image_id'],
       dtype='object'),
 109972)

In [94]:
sampled_train_dataset_df.to_csv('sampled_train_dataset.csv',index=False) 

## Delete the images which are not being used for training

In [95]:
# Path to the folder containing the image files
folder_path = "train2014"

for filename in os.listdir(folder_path):
    # Extract the number from the filename
    number = int(filename.split('_')[-1].split('.')[0])
    
    # Get the hash value of the number
    hash_value = custom_hash(number)
    
    # Check if the hash value is present in the random indices list
    if hash_value not in random_indices:
        # If not present, delete the file
        file_path = os.path.join(folder_path, filename)
        os.remove(file_path)

Adding a new column to store the relative path in Kaggle 

In [98]:
sampled_train_dataset_kaggle = pd.read_csv('sampled_train_dataset.csv')

def append_string(value):
    return os.path.join('train2014', value)

# Apply the function to each value in the column
sampled_train_dataset_kaggle['kaggle_image_path'] = sampled_train_dataset_kaggle['image_path'].apply(append_string)

In [97]:
sampled_train_dataset_kaggle.to_csv('sampled_train_dataset_kaggle.csv',index=False)

## Understanding the dataset

In [2]:
import pandas as pd
df = pd.read_csv('sampled_train_dataset.csv')

In [3]:
df.head()

Unnamed: 0,image_id,question,question_id,question_type,multiple_choice_answer,answers,answer_type,image_name,image_path,hashed_image_id
0,25,Is the giraffe in the shade?,25013,is the,no,"[{'answer': 'no', 'answer_confidence': 'yes', ...",yes/no,COCO_train2014_000000000025.jpg,train2014\COCO_train2014_000000000025.jpg,2
1,25,Are any of the animals eating?,25012,are,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",yes/no,COCO_train2014_000000000025.jpg,train2014\COCO_train2014_000000000025.jpg,2
2,25,Are some of the trees dead?,25011,are,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",yes/no,COCO_train2014_000000000025.jpg,train2014\COCO_train2014_000000000025.jpg,2
3,25,What is on the ground next to the giraffe on t...,25010,what is on the,log,"[{'answer': 'log', 'answer_confidence': 'yes',...",other,COCO_train2014_000000000025.jpg,train2014\COCO_train2014_000000000025.jpg,2
4,25,Are they at a zoo?,25009,are they,yes,"[{'answer': 'yes', 'answer_confidence': 'maybe...",yes/no,COCO_train2014_000000000025.jpg,train2014\COCO_train2014_000000000025.jpg,2


In [4]:
df['answer_type'].unique()

array(['yes/no', 'other', 'number'], dtype=object)

In [10]:
answers_test = eval(df['answers'].iloc[0])

In [11]:
print(answers_test)

[{'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'no', 'answer_confidence': 'maybe', 'answer_id': 4}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 10}]
