# Global Wheat Detection - EDA

![GWD](https://albumizr.com/ia/68bce2e2687097c61faee360dc1dce79.jpg)

> In this competition, you’ll detect wheat heads from outdoor images of wheat plants, including wheat datasets from around the globe. Using worldwide data, you will focus on a generalized solution to estimate the number and size of wheat heads. To better gauge the performance for unseen genotypes, environments, and observational conditions, the training dataset covers multiple regions. You will use more than 3,000 images from Europe (France, UK, Switzerland) and North America (Canada). The test data includes about 1,000 images from Australia, Japan, and China.


In [None]:
import numpy as np
import pandas as pd
import cv2
import re
from tqdm.notebook import tqdm
from PIL import Image
import hashlib
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# Root folder of the competition data, and the sub-folder the training
# images (`<image_id>.jpg`) are read from later in the notebook.
DIR_INPUT = '/kaggle/input/global-wheat-detection'
DIR_TRAIN_IMAGES = f'{DIR_INPUT}/train'

In [None]:
# Load the training annotations and show (n_rows, n_columns).
train_df = pd.read_csv(f'{DIR_INPUT}/train.csv')
train_df.shape

In [None]:
# Peek at the first rows of the annotation table.
train_df.head()

## Image meta
- We have 3373 unique images in the train dataset
- It seems all of the images have the same size (1024x1024)
- Minimum number of bbox per image: 0
- Maximum number of bbox per image: 116
- Average number of bbox per image: 43.82
- No duplicated images (in train)

In [None]:
# Number of distinct image ids that appear in the annotation table.
train_df['image_id'].nunique()

In [None]:
# Frequency of each height and width value recorded in the annotation rows
# (counted per row, not per image).
train_df['height'].value_counts(), train_df['width'].value_counts()

In [None]:
def calculate_hash(im):
    """Return the MD5 hex digest of an image's raw pixel buffer.

    Args:
        im: Anything `np.array` can convert (e.g. a PIL image or an ndarray).

    Returns:
        The 32-character hexadecimal MD5 digest of the array's bytes. Two
        inputs get the same digest only if their pixel buffers are identical.
    """
    md5 = hashlib.md5()
    # `tobytes()` is the supported spelling; `tostring()` was a deprecated
    # alias for it and is no longer available in recent NumPy releases.
    md5.update(np.array(im).tobytes())

    return md5.hexdigest()
    
def get_image_meta(image_id, image_src, dataset='train'):
    """Collect basic metadata for a single image file.

    Args:
        image_id: Identifier stored under the 'image_id' key.
        image_src: Path (or file object) passed to `Image.open`.
        dataset: Label stored under the 'dataset' key.

    Returns:
        A dict with the image id, dataset label, pixel-content hash,
        per-channel min/max values, height, width, file format and mode.

    Note:
        The per-channel extrema are indexed as three (min, max) pairs, so
        this assumes a multi-band image with at least three bands (such as
        RGB) -- TODO confirm no single-band images are present.
    """
    # Use a context manager so the underlying file handle is released.
    with Image.open(image_src) as im:
        extrema = im.getextrema()
        # PIL reports size as (width, height), not (height, width).
        width, height = im.size

        meta = {
            'image_id': image_id,
            'dataset': dataset,
            'hash': calculate_hash(im),
            'r_min': extrema[0][0],
            'r_max': extrema[0][1],
            'g_min': extrema[1][0],
            'g_max': extrema[1][1],
            'b_min': extrema[2][0],
            'b_max': extrema[2][1],
            'height': height,
            'width': width,
            'format': im.format,
            'mode': im.mode
        }
    return meta

In [None]:
# Gather per-image metadata for every distinct training image.
data = []

# Compute the unique ids once instead of once for the iterable and once for
# the progress-bar total.
unique_image_ids = train_df['image_id'].unique()

for image_id in tqdm(unique_image_ids, total=unique_image_ids.shape[0]):
    data.append(get_image_meta(image_id, DIR_TRAIN_IMAGES + '/{}.jpg'.format(image_id)))

In [None]:
# One row per image, built from the metadata dicts collected above.
meta_df = pd.DataFrame(data)
meta_df.head()

### Duplications

In [None]:
# Count how many images share each pixel-content hash.
hash_counts = meta_df.groupby(by='hash')[['image_id']].count().reset_index()

# Keep only hashes that occur more than once, i.e. byte-identical images.
repeated_hashes = hash_counts.loc[hash_counts['image_id'] > 1].reset_index(drop=True)

# Attach the ids of the images behind each repeated hash; the count column
# and the id column both come from 'image_id', so pandas suffixes them.
duplicates = repeated_hashes.merge(meta_df[['image_id', 'hash']], on='hash')

duplicates.head(20)

Great! No duplicates!

## Extract bounding box data

In [None]:
# Create the four bounding-box columns up front with a -1 placeholder; they
# are overwritten once the 'bbox' strings are parsed.
for bbox_column in ('x', 'y', 'w', 'h'):
    train_df[bbox_column] = -1

def expand_bbox(x):
    """Parse the numbers out of a bounding-box string.

    Args:
        x: Text containing the box values, e.g. "[834.0, 222.0, 56.0, 36.0]".

    Returns:
        A float ndarray with every non-negative decimal number found in `x`,
        in order of appearance. If no number is found, an array of four
        -1.0 placeholders is returned instead.
    """
    numbers = re.findall(r"([0-9]+[.]?[0-9]*)", x)
    if not numbers:
        # Same placeholder as before, but as a float array so both branches
        # return the same type and can be stacked together.
        return np.full(4, -1.0)
    return np.array([float(value) for value in numbers])

# Split every 'bbox' string into its four values, one column per value.
train_df[['x', 'y', 'w', 'h']] = np.stack(train_df['bbox'].apply(expand_bbox))
train_df.drop(columns=['bbox'], inplace=True)

# Make the coordinates numeric. The builtin `float` replaces the `np.float`
# alias, which was removed from NumPy.
for bbox_column in ('x', 'y', 'w', 'h'):
    train_df[bbox_column] = train_df[bbox_column].astype(float)

In [None]:
# Display the annotation table with the parsed x / y / w / h columns.
train_df

In [None]:
# Min / max / mean number of annotation rows per image. Only images that have
# at least one row in train_df are counted here.
train_df.groupby(by='image_id')['source'].count().agg(['min', 'max', 'mean'])

In [None]:
# Number of annotation rows per 'source' value (counted per row, not per image).
source = train_df['source'].value_counts()
source

In [None]:
# Pie chart of the share of annotation rows contributed by each source.
source_pie = go.Pie(labels=source.index, values=source.values)
fig = go.Figure(data=[source_pie])

fig.update_layout(title='Source distribution')
fig.show()

In [None]:
def show_images(image_ids):
    """Plot training images in a grid of 5 columns and at most 5 rows.

    Args:
        image_ids: Sequence of image ids; each is loaded from
            `DIR_TRAIN_IMAGES/<image_id>.jpg`.

    At most 25 images are drawn; any ids beyond that are ignored. Grid cells
    without an image are left blank. Nothing is plotted for an empty input.
    """
    col = 5
    max_rows = 5

    n_images = len(image_ids)
    if n_images == 0:
        return

    # Ceiling division so a partially filled last row still gets axes
    # (floor division gave 0 rows for fewer than `col` images).
    row = min(-(-n_images // col), max_rows)

    # squeeze=False always yields a 2-D axes array, even for a single row.
    fig, ax = plt.subplots(row, col, figsize=(16, 8), squeeze=False)
    ax = ax.flatten()

    # Hide the frame on every cell, including ones that stay empty.
    for axis in ax:
        axis.set_axis_off()

    # zip stops at the grid capacity, so surplus ids cannot index past `ax`.
    for axis, image_id in zip(ax, image_ids):
        image = cv2.imread(DIR_TRAIN_IMAGES + '/{}.jpg'.format(image_id))
        # OpenCV loads BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        axis.imshow(image)
        axis.set_title(image_id)
        
def show_image_bb(image_data):
    """Plot one training image with its bounding boxes drawn on top.

    Args:
        image_data: DataFrame rows for a single image, with columns
            'image_id', 'x', 'y', 'w' and 'h'. The image shown is the one
            named by the first row's 'image_id'.

    Raises:
        IndexError: If `image_data` has no rows.
    """
    # Take the id from the data itself. The title previously read a name that
    # is not defined in this function, so it showed whatever value a
    # module-level `image_id` happened to hold.
    image_id = image_data.iloc[0]['image_id']

    fig, ax = plt.subplots(1, 1, figsize=(16, 8))

    image = cv2.imread(DIR_TRAIN_IMAGES + '/{}.jpg'.format(image_id))
    # OpenCV loads BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    for _, row in image_data.iterrows():
        x, y = int(row['x']), int(row['y'])
        w, h = int(row['w']), int(row['h'])

        # Boxes are stored as top-left corner plus width/height.
        cv2.rectangle(image, (x, y), (x + w, y + h), (220, 0, 0), 3)

    ax.set_axis_off()
    ax.imshow(image)
    ax.set_title(image_id)

In [None]:
# Show the images behind 15 randomly sampled annotation rows. Rows are
# sampled, not images, so the same image can appear more than once.
show_images(train_df.sample(n=15)['image_id'].values)

In [None]:
# Draw every annotated box for image 5e0747034.
show_image_bb(train_df[train_df['image_id'] == '5e0747034'])

In [None]:
# Draw every annotated box for image 5b13b8160.
show_image_bb(train_df[train_df['image_id'] == '5b13b8160'])

In [None]:
# Draw every annotated box for image 1f2b1a759.
show_image_bb(train_df[train_df['image_id'] == '1f2b1a759'])