# Competition

In this competition, you’ll build a model that automatically retrieves the text closest to an image. Specifically, you'll train your model to associate given images with article titles or complex captions, in multiple languages. The best models will account for the semantic granularity of Wikipedia images.

The objective is to predict the target *'caption_title_and_reference_description'* given information about an image.
# Evaluation

Submissions will be evaluated using NDCG@5 (Normalized Discounted Cumulative Gain).

This notebook uses an external dataset in [feather format](https://www.kaggle.com/msafi04/train-tsv-file-to-feather-files) created from the train TSV files.

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import dask
import dask.dataframe as dd

import cv2
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import iplot

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

# Notebook-wide plot defaults: large figures and readable titles.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names. Try the current name first and fall
# back to the legacy one so the cell runs on both old and new installations.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_palette('Set2')

import glob as glob
import gc

import requests
import urllib

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from time import time, strftime, gmtime
start = time()
import datetime
print(str(datetime.datetime.now()))

import warnings
warnings.simplefilter('ignore')

Use dask to read the dataframes; it's fast!

In [None]:
def tsv_to_feather(tsv_files):
    """Convert each TSV file into a feather file in the working directory.

    Every file is read with dask (all columns as strings, malformed lines
    skipped), rows containing missing values are dropped, and the result is
    written with ``DataFrame.to_feather``. The output file is named after the
    input's base name up to its first dot (no extension is appended).

    Args:
        tsv_files: Iterable of '/'-separated paths to tab-separated files.

    Returns:
        None. The feather files are the only output.
    """
    for file_number, tsv in enumerate(tsv_files, start = 1):
        print(f"Processing file: {file_number} ...")
        # Computed once and reused for both the output path and the log line.
        feather_name = tsv.split('/')[-1].split('.')[0]
        # quoting = 3 is csv.QUOTE_NONE: quote characters are kept as data.
        df = dd.read_csv(tsv, sep = '\t', quoting = 3, escapechar = '\n',
                         on_bad_lines = 'skip', dtype = 'string')
        df = df.dropna()
        df = df.compute() #Convert to pandas df
        df = df.reset_index(drop = True)
        #Save as feather to save disk space and read faster
        df.to_feather(feather_name)
        print(f"Tsv file {feather_name} stored as feather file")

        # Release the frame before loading the next file to limit peak memory.
        del df
        gc.collect()

In [None]:
%%time
train_tsvs = glob.glob('/kaggle/input/wikipedia-image-caption/train*.tsv')
#tsv_to_feather(train_tsvs)

In [None]:
train_external = '/kaggle/input/train-tsv-file-to-feather-files/'

In [None]:
test_df = pd.read_csv('/kaggle/input/wikipedia-image-caption/test.tsv', sep = '\t')
print(test_df.shape)
test_df.head()

In [None]:
sub = pd.read_csv('/kaggle/input/wikipedia-image-caption/sample_submission.csv')
print(sub.shape)
sub.head()

In [None]:
train_feathers = glob.glob(train_external + 'train*')
print(train_feathers)

- Read all the feather files and combine them into one dataframe

In [None]:
# Load every feather file, then concatenate once. Calling pd.concat inside the
# loop would re-copy the accumulated frame on every iteration.
frames = [pd.read_feather(path) for path in train_feathers]
# pd.concat raises on an empty list, so keep the empty-frame fallback.
train_df = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()
del frames
print(f"Before removing duplicate rows: {train_df.shape}")
train_df = train_df.drop_duplicates() #Drop duplicate rows if any
print(f"After removing duplicate rows: {train_df.shape}")
# Shuffle the rows and give them a clean 0..n-1 index.
train_df = train_df.sample(frac = 1).reset_index(drop = True)
train_df.head()

__Missing Values__

In [None]:
train_df.isna().any()

In [None]:
train_df['language'].unique()

In [None]:
train_df['language'].value_counts()

## Train Image and Caption Visualization

In [None]:
def url_to_images(df: pd.DataFrame, num_images: int, flag: str) -> tuple:
    """Download a random sample of images (and their captions) from ``df``.

    ``num_images`` rows are sampled from ``df``; each ``image_url`` is fetched,
    written to ``./temp.jpg``, decoded with OpenCV and converted to RGB channel
    order. Downloading is best effort: any URL that cannot be fetched or
    decoded is skipped.

    Args:
        df: Frame with an ``image_url`` column and, when ``flag == 'train'``,
            a ``caption_title_and_reference_description`` column.
        num_images: Number of rows to sample.
        flag: ``'train'`` to also return captions; any other value returns
            images only.

    Returns:
        For ``flag == 'train'``: a tuple ``(images, captions)`` where
        ``captions[i]`` belongs to ``images[i]``; captions of skipped images
        are dropped so both lists stay aligned. Otherwise: the list of images.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    is_train = flag == 'train'
    #Get random 'num_images' url from df
    columns = ['image_url']
    if is_train:
        columns.append('caption_title_and_reference_description')
    sample_df = df[columns].sample(num_images)

    images_to_display = []
    kept_positions = []  # positions (within the sample) that loaded successfully
    for position, img_url in enumerate(sample_df['image_url'].tolist()):
        try:
            with urllib.request.urlopen(img_url) as response:
                payload = response.read()
            with open('./temp.jpg', 'wb') as f:
                f.write(payload)
            # imread returns None for undecodable data; cvtColor then raises
            # and the image is skipped like any other failure.
            img = cv2.imread('./temp.jpg')
            # OpenCV decodes to BGR; matplotlib expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        except Exception:
            # Best effort: skip this image, but let KeyboardInterrupt and
            # SystemExit propagate (a bare `except:` would swallow them).
            continue
        images_to_display.append(img)
        kept_positions.append(position)

    if is_train:
        all_captions = sample_df['caption_title_and_reference_description'].tolist()
        captions = [all_captions[position] for position in kept_positions]
        return images_to_display, captions
    return images_to_display

In [None]:
from textwrap import wrap

def display_images(df: pd.DataFrame, rows: int, cols: int, flag: str = 'train') -> None:
    """Show a ``rows`` x ``cols`` grid of randomly sampled images from ``df``.

    Images are fetched with ``url_to_images``. Fewer than ``rows * cols``
    images may come back (failed downloads are skipped there); the remaining
    panels are left blank.

    Args:
        df: Frame passed through to ``url_to_images``.
        rows: Number of grid rows.
        cols: Number of grid columns.
        flag: ``'train'`` to title each image with its wrapped caption; any
            other value shows images only.
    """
    if flag == 'train':
        images, captions = url_to_images(df, num_images = rows * cols, flag = flag)
    else:
        images = url_to_images(df, num_images = rows * cols, flag = flag)
        captions = []

    # squeeze = False always yields a 2-D array of Axes, so flatten() also
    # works for a 1 x 1 grid (where subplots would otherwise return one Axes).
    fig, ax = plt.subplots(rows, cols, figsize = (20, 12), squeeze = False)
    ax = ax.flatten()

    # Hide grid and frame on every panel up front so panels without an image
    # stay blank instead of showing empty axes.
    for panel in ax:
        panel.grid(False)
        panel.axis('off')

    for p, (panel, image) in enumerate(zip(ax, images)):
        panel.imshow(image)
        if flag == 'train' and p < len(captions):
            panel.set_title('\n'.join(wrap(captions[p], 30)))
    fig.tight_layout()
    plt.show()
    return None

In [None]:
display_images(train_df, 3, 3, 'train')

In [None]:
display_images(train_df, 3, 3, 'train')

## Display Test Images

- Looks like there are GIF, SVG, TIF and PNG formats in addition to JPG

In [None]:
display_images(test_df, 3, 3, 'test')

In [None]:
def clean_format(x):
    """Map a file extension to its canonical lowercase image format.

    Matching is case-insensitive. 'jpeg' collapses to 'jpg' and 'tiff' to
    'tif'; 'png', 'svg' and 'gif' map to themselves. Any other extension
    yields None.
    """
    canonical_by_extension = {
        'jpg': 'jpg',
        'jpeg': 'jpg',
        'png': 'png',
        'tif': 'tif',
        'tiff': 'tif',
        'svg': 'svg',
        'gif': 'gif',
    }
    return canonical_by_extension.get(x.lower())

In [None]:
# Take the text after the last '.' of each URL as the raw extension, then
# normalise it (unrecognised extensions become None and are not counted).
test_df['format'] = test_df['image_url'].apply(lambda x: x.split('.')[-1])
test_df['format'] = test_df['format'].apply(clean_format)
# Pass the column by keyword: seaborn deprecated positional data arguments
# and newer releases no longer interpret a positional Series as `x`.
ax = sns.countplot(x = test_df['format'])
for p in ax.patches:
    # Centre the count above its bar. A multiplicative x offset drifts with
    # the bar position and moves the wrong way for bars left of x = 0.
    ax.annotate(f"{int(p.get_height())}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')
plt.title('Test Image Formats Counts');

## Target Text

In [None]:
# Number of whitespace-separated words in each target caption.
train_df['target_len'] = (
    train_df['caption_title_and_reference_description']
    .map(lambda caption: len(caption.split()))
)
# Distribution of caption lengths.
train_df['target_len'].hist();

In [None]:
print(f"Max no. of words in target: {train_df['target_len'].max()}")
print(f"Min no. of words in target: {train_df['target_len'].min()}")
print(f"Avg. no. of words in target: {train_df['target_len'].mean()}")

There is a sample with no target caption; remove it from the train dataset.

In [None]:
# Keep only rows whose target caption contains at least one word.
has_words = train_df['target_len'] != 0
train_df = train_df[has_words]

# Re-report the word-count statistics on the filtered data.
word_counts = train_df['target_len']
print(f"Max no. of words in target: {word_counts.max()}")
print(f"Min no. of words in target: {word_counts.min()}")
print(f"Avg. no. of words in target: {word_counts.mean()}")
train_df.shape

# WIP
More to come...