# Extracting the product images from the given BSON file

## Make the folders

In [1]:
import io
import bson 
from skimage.io import imread, imsave 
import os
import pandas as pd
from tqdm import tqdm

# File and folder names used throughout the notebook.
bson_file = 'train.bson'
image_dir = 'train_images'

# Create the folder that will hold the extracted images.
# exist_ok=True makes this cell safe to re-run and avoids the race between a
# separate os.path.exists() check and the os.makedirs() call.
os.makedirs(image_dir, exist_ok=True)

### There are about 7 million images in total. Extracting all of them would take more than 700GB of disk space, which is not available on a single machine, so we cannot extract them all.

* Hence we extract only 1 million images, sampled uniformly across the whole dataset, so that we cover as many product categories as possible with as few images as possible.
* This means we skip about 6 million images while parsing the BSON file.

In [4]:
import random

n = 7069896  # total number of data points in the BSON file
sample_size = 1000000  # number of data points to keep
SAMPLE_SEED = 42  # fixed seed so Restart & Run All reproduces the same sample


def sample_skip_indices(total, keep, seed=None):
    """Choose which 0-based document indices to skip, uniformly at random.

    Args:
        total: Number of documents that will be enumerated (indices 0..total-1).
        keep: Number of documents that must NOT be skipped.
        seed: Optional seed for a private random generator; the same seed
            always yields the same result.

    Returns:
        A frozenset containing exactly ``total - keep`` distinct indices drawn
        from ``range(total)``.

    Raises:
        ValueError: If ``keep`` is negative or larger than ``total``.
    """
    if not 0 <= keep <= total:
        raise ValueError("keep must be between 0 and total (inclusive)")
    # A private generator leaves the global `random` state untouched.
    rng = random.Random(seed)
    # range(total) starts at 0 because enumerate() numbers documents from 0;
    # starting at 1 would mean the first document could never be skipped.
    # A frozenset gives constant-time `index in skip_values` checks, whereas a
    # list is scanned linearly on every check.
    return frozenset(rng.sample(range(total), total - keep))


skip_values = sample_skip_indices(n, sample_size, seed=SAMPLE_SEED)

### Data extraction from the BSON (Binary JavaScript Object Notation) file

In [12]:
# Multiprocessing is not used here: compatibility issues led to higher overhead
# and crashes, so the BSON file is parsed sequentially.
import warnings
warnings.filterwarnings("ignore")

import time
start_time = time.time()

# Membership tests against a list scan it linearly for every document, which
# is prohibitively slow with millions of skipped indices. A frozenset gives
# constant-time lookups (and is a no-op copy if skip_values already is one).
skip_lookup = frozenset(skip_values)

# Per-image attributes, collected in lockstep to build a dataframe later.
product_ids = []
category_ids = []
image_paths = []

# Number of images extracted so far. It starts at 0 so that the loop stops
# after exactly `sample_size` images rather than one short.
count = 0

# The context manager closes the BSON file even if decoding or saving fails.
with open(bson_file, 'rb') as bson_handle:
    data = bson.decode_file_iter(bson_handle)

    # Loop through the BSON generator; `c` is the 0-based document index.
    for c, d in tqdm(enumerate(data), total=n):

        # Skip every document that was not selected by the sampling step.
        if c in skip_lookup:
            continue

        # Store the product id and category id.
        product_id = d['_id']
        category_id = d['category_id']

        # A product can have multiple images; only the first one is extracted.
        # Products without any image are skipped and not counted.
        first_image = next(iter(d['imgs']), None)
        if first_image is None:
            continue

        # Decode the image from its binary representation.
        picture = imread(io.BytesIO(first_image['picture']))

        # Name the file after the product id and category id for easy inference.
        path = image_dir + '/' + str(product_id) + "_" + str(category_id) + ".png"
        imsave(path, picture)

        # Record the product id, category id and image path.
        product_ids.append(product_id)
        category_ids.append(category_id)
        image_paths.append(path)

        # Stop as soon as the requested number of images has been written.
        count += 1
        if count >= sample_size:
            break

# Finally print the time taken for the whole extraction.
print("time elapsed is ", time.time() - start_time)

7069893it [58:04:38, 33.81it/s] 

time elapsed is  209078.5652191639





## The whole extraction process took about 58 hours!

### We will create a dataframe and store it to the disk for later use in exploratory data analysis

In [14]:
# Assemble one row per extracted image and persist the table to disk.

column_names = ['image_path', 'product_id', 'category_id']
rows = list(zip(image_paths, product_ids, category_ids))
df = pd.DataFrame(rows, columns=column_names)
print('created the dataframe')

# index=False keeps the positional row index out of the CSV file.
df.to_csv('train_data.csv', index=False)
print('saved to csv file')


created the dataframe
saved to csv file


In [15]:
# Preview the first rows to sanity-check the columns and the image path format.
df.head()

Unnamed: 0,image_path,product_id,category_id
0,train_images/0_1000010653.png,0,1000010653
1,train_images/7_1000004079.png,7,1000004079
2,train_images/9_1000018290.png,9,1000018290
3,train_images/15_1000015309.png,15,1000015309
4,train_images/19_1000014287.png,19,1000014287
