<br>
<h1 style = "font-size:60px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">EDA and Dataset Versioning <br>using W&B</h1>
<br>

<blockquote>
<span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">Some key points to note in the original data:</span>
<ul>
    <li><span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">Not all the images have a corresponding caption</span></li>
    <li><span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">Some images have captions available in many languages</span><br></li>
</ul>
</blockquote>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">This dataset contains 300px <code>b64_bytes</code> for all the images (no downloading required) along with a <code>list of captions</code> in all the available languages</span>

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Install Required Libraries</h1></span>

In [None]:
!pip install --upgrade wandb

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Import Required Libraries 📚</h1></span>

In [None]:
import os

import json

import base64
from io import BytesIO
from PIL import Image
from IPython.display import display

import pickle

<img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67">

<blockquote>
    <span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">We will be using Weights and Biases for:</span>
<ul>
    <li><span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">Visualizing the dataset</span></li>
    <li><span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">Dataset versioning</span><br></li>
</ul>
<span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">W&B Dashboard: <a href="https://wandb.ai/dchanda/Wikipedia">wandb.ai/dchanda/Wikipedia</a></span>
</blockquote>

In [None]:
import wandb

# Log in to W&B with the Kaggle secret labelled "wandb_api" when it can be read.
# `anony` records the outcome: None after a successful login, "must" otherwise.
try:
    # Imported here so that a missing kaggle_secrets module is handled by the fallback below.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt and SystemExit still propagate.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Read Data</h1></span>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">Directory Structure</span>
<blockquote>
<ul>
    <li><span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;"><code>wikipediasmall/</code></span>
    </li>
    <ul>
        <li>part-00000-48a6f07e-bb86-4735-aac7-883349f41a28-c000.json</li>
        <li>part-00001-48a6f07e-bb86-4735-aac7-883349f41a28-c000.json</li>
        <li>part-00002-48a6f07e-bb86-4735-aac7-883349f41a28-c000.json</li>
    </ul>
</ul>
<span style="color: #000508; font-family: Segoe UI; font-size: 1.3em; font-weight: 300;">The files have been downloaded from <a href="https://analytics.wikimedia.org/published/datasets/one-off/caption_competition/training/joined/">here</a></span>
</blockquote>

In [None]:
# Directory containing the JSON shards (one JSON object per line).
DATA_DIR = "../input/wikipediasmall"
CAPTION_KEY = 'caption_title_and_reference_description'

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# fixed example index used further down should pick the same record every run.
filenames = sorted(os.listdir(DATA_DIR))

# Each kept record is a dict with a list of captions and the base64 image payload.
json_content = []

for file in filenames:
    filename = os.path.join(DATA_DIR, file)
    with open(filename, 'rb') as f:
        for line in f:
            # Lines read from a file keep their newline, so a blank line is
            # b"\n" (truthy). Strip before testing so it is skipped instead of
            # raising a JSONDecodeError.
            if not line.strip():
                continue
            obj = json.loads(line)

            # Gather every non-empty caption across the record's wit_features entries.
            captions = [element[CAPTION_KEY]
                        for element in obj['wit_features']
                        if element.get(CAPTION_KEY)]
            b64_bytes = obj['b64_bytes']

            # Keep only records with at least one caption and a non-empty image
            # payload (a None payload is rejected as well as an empty string).
            if captions and b64_bytes:
                json_content.append({CAPTION_KEY: captions,
                                     'b64_bytes': b64_bytes})

In [None]:
# Show the fields stored for each record, using the first record as a sample.
json_content[0].keys()

In [None]:
# Number of records kept, i.e. those with at least one caption and non-empty image bytes.
len(json_content)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Visualize 1 Example</h1></span>

In [None]:
# Pick one record, decode its base64 payload into an RGB image, then show
# the caption list followed by the picture.
index = 100

sample = json_content[index]
decoded = base64.b64decode(sample['b64_bytes'])
img = Image.open(BytesIO(decoded))
img = img.convert("RGB")
print(sample['caption_title_and_reference_description'])
display(img)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Visualize Entire Dataset</h1></span>

In [None]:
# Start the visualization run. `anony` is set by the login cell (None after a
# successful login, "must" otherwise); passing it here instead of a hard-coded
# 'must' lets the run use the logged-in account when an API key was supplied.
run = wandb.init(project='Wikipedia',
                 job_type='Visualization',
                 anonymous=anony)

In [None]:
# Build a two-column W&B table (image, caption list) from the first 1000
# records, log it under the 'Visualization' key, and close the run.
preview_table = wandb.Table(columns=['Image', 'Captions'])
for record in json_content[:1000]:
    image_bytes = base64.b64decode(record['b64_bytes'])
    picture = Image.open(BytesIO(image_bytes)).convert("RGB")
    captions = record['caption_title_and_reference_description']
    preview_table.add_data(wandb.Image(picture), captions)

wandb.log({'Visualization': preview_table})
run.finish()

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;"><a href="https://wandb.ai/dchanda/Wikipedia/runs/2kzujq78">View the Complete Table Here ⮕</a></span>

![](https://i.imgur.com/Uebq4gp.gif)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Dataset Versioning</h1></span>

In [None]:
# Serialize the filtered records to data.pkl in the working directory.
with open("data.pkl", mode="wb") as pickle_file:
    pickle.dump(json_content, pickle_file)

In [None]:
# Start a "data" run and upload data.pkl as a versioned dataset artifact.
# `anony` is set by the login cell (None after a successful login, "must"
# otherwise); passing it instead of a hard-coded "must" lets the artifact be
# logged to the user's own account when an API key was supplied.
run = wandb.init(project="Wikipedia",
                 job_type="data",
                 anonymous=anony)
# NOTE(review): the description mentions files 00000 and 00001, while the
# directory listing in this notebook shows three shards — confirm which is right.
processed_data = wandb.Artifact("Wiki-data", type="dataset",
                                description="Processed data from file 00000 and 00001",
                                metadata={"source": "analytics.wikimedia.org/published/datasets/one-off/caption_competition/training/joined/",
                                          "sizes": len(json_content)})
processed_data.add_file("data.pkl")
run.log_artifact(processed_data)
run.finish()

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Download Data</h1></span>

![](https://i.imgur.com/3neB8pj.jpg)

In [None]:
# Start a run, declare the published dataset artifact as an input, and download
# it locally; `artifact_dir` is the path returned by the download call.
# `anony` is set by the login cell (None after a successful login, "must"
# otherwise), so a supplied API key is honoured instead of forcing anonymous mode.
run = wandb.init(project="Wikipedia",
                 anonymous=anony)
artifact = run.use_artifact('dchanda/Wikipedia/Wiki-data:v0', type='dataset')
artifact_dir = artifact.download()
run.finish()

In [None]:
# The artifact is built from a single file, "data.pkl", so load exactly that
# file instead of unpickling every entry in the download directory (which would
# keep only the last one and leave `contents` undefined for an empty directory).
# NOTE: pickle.load can execute arbitrary code; only load artifacts you trust.
filepath = os.path.join(artifact_dir, "data.pkl")
with open(filepath, "rb") as fp:
    contents = pickle.load(fp)

# Decode and show the same example index as before to check the round trip.
index = 100

out = base64.b64decode(contents[index]['b64_bytes'])
img = Image.open(BytesIO(out)).convert("RGB")
print(contents[index]['caption_title_and_reference_description'])
display(img)

![Upvote!](https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle)