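**About**: This notebook flattens the raw training annotations into CSV tables (charts, texts & ticks, targets, visual elements) that are used to train models.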

In [None]:
# %load_ext nb_black
# Enable autoreload in mode 2 so that edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

# Widen the pandas text display so wide frames and long cell contents stay readable.
pd.set_option('display.width', 500)
pd.set_option('max_colwidth', 100)

## Processing

In [None]:
def get_annotations(annotations_path: str = "../input/train/annotations") -> pd.DataFrame:
    """
    Loads every JSON annotation file found in a directory into a single DataFrame.

    Only files ending in ".json" are read, so stray entries in the folder are
    ignored. Files are processed in sorted filename order, which makes the row
    order reproducible across machines. The filename without its extension is
    stored in the "id" column.

    Args:
        annotations_path (str): Path to the directory containing the annotations.

    Returns:
        pd.DataFrame: One row per annotation file, restricted to the columns
            id, source, chart-type, plot-bb, text, axes, data-series, visual-elements.
            An empty frame with these columns is returned if no JSON file is found.

    Raises:
        KeyError: If one of the expected columns is absent from all loaded annotations.
    """
    columns = ['id', 'source', 'chart-type', 'plot-bb', 'text', 'axes', 'data-series', 'visual-elements']

    # os.listdir returns entries in arbitrary order and includes non-annotation files.
    filenames = sorted(name for name in os.listdir(annotations_path) if name.endswith(".json"))

    annotations = []
    for filename in tqdm(filenames):
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(os.path.join(annotations_path, filename), encoding="utf-8") as annotation_f:
            annots = json.load(annotation_f)
        annots['id'] = os.path.splitext(filename)[0]
        annotations.append(annots)

    if not annotations:
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(annotations)[columns]

In [None]:
# Load all training annotations from the default path: one row per annotation file.
df = get_annotations()

In [None]:
# df = df[df['id'] == "e4cf7f5f6be6"]

In [None]:
# for i, e in tqdm(enumerate(df['visual-elements'].values)):
#     if sum([len(e[k]) > 0 for k in e]) != 1:
#         print( sum([len(e[k]) for k in e]))
#         print(i, e)
#         break

In [None]:
# Number of charts per chart type, split by source, on a log scale.
fig, ax = plt.subplots()
sns.countplot(data=df, x="chart-type", hue="source", ax=ax)
ax.set_yscale('log')
plt.show()

### Process columns

In [None]:
# Flatten the plot bounding-box dict into one scalar column per field.
for column, key in (('plot_x0', 'x0'), ('plot_y0', 'y0'), ('plot_h', 'height'), ('plot_w', 'width')):
    df[column] = df['plot-bb'].apply(lambda bb, key=key: bb[key])

In [None]:
# Split the axes dict into one column per axis.
for axis in ('x', 'y'):
    df[f'{axis}_axis'] = df['axes'].apply(lambda axes, name=f'{axis}-axis': axes[name])

# Tick and value types of each axis.
for axis in ('x', 'y'):
    df[f'{axis}_ticks_type'] = df[f'{axis}_axis'].apply(lambda info: info['tick-type'])
    df[f'{axis}_values_type'] = df[f'{axis}_axis'].apply(lambda info: info['values-type'])

# Raw tick lists of each axis.
for axis in ('x', 'y'):
    df[f'{axis}_ticks'] = df[f'{axis}_axis'].apply(lambda info: info['ticks'])

In [None]:
df.head(2)

### Visual elements

In [None]:
# Name of the first key of the visual-elements dict whose value is non-empty.
# NOTE(review): raises IndexError for a chart where every value is empty.
df['elements_type'] = df['visual-elements'].apply(lambda x: [k for k in x if len(x[k])][0])

In [None]:
# Replace the per-chart dict with the value stored under its selected elements_type key.
# NOTE: 'visual-elements' is overwritten, so this cell cannot be re-run without
# reloading df first.
df['visual-elements'] = df.apply(lambda x: x['visual-elements'][x.elements_type], axis=1)

In [None]:
# df['visual-elements'] = df['visual-elements'].apply(lambda x: [k for k in x if len(k)])
# Unwrap one nesting level when the value is a single-item list wrapping another list.
df['visual-elements'] = df['visual-elements'].apply(lambda x: x[0] if ((len(x) == 1) and isinstance(x[0], list)) else x)

# Chart ids flagged as anomalous.
# NOTE(review): this list is not referenced anywhere else in the notebook; confirm
# whether these charts were meant to be filtered out.
ANOMALIES = [
    # DUPLICATED LINES
    'ae686738e744', 'c76f6d0d5239', '760c3fa4e3d9', 'c0c1f4046222', '3e568d136b85', '913447978a74', '2ff071a45cce', 'a9a07d74ee31',
    # MISSING ANNOTS
]

# If the first item is still a list, keep only that first sub-list; any further
# sub-lists are discarded. Indexing x[0] assumes the value is non-empty.
df['visual-elements'] = df['visual-elements'].apply(lambda x: x[0] if isinstance(x[0], list) else x)

In [None]:
# df['nve'] = df['visual-elements'].apply(len)

In [None]:
# One row per visual element, keyed by the chart it belongs to.
df_elt = (
    df[['id', 'visual-elements']]
    .explode('visual-elements')
    .reset_index(drop=True)
    .rename(columns={"id": "chart_id"})
)

In [None]:
# Extract the point (x, y) and box (x0, y0, height, width) fields of each element;
# fields that an element does not define become NaN.
for column, key in (('x', 'x'), ('y', 'y'), ('x0', 'x0'), ('y0', 'y0'), ('h', 'height'), ('w', 'width')):
    df_elt[column] = df_elt['visual-elements'].apply(lambda element, key=key: element.get(key, np.nan))

In [None]:
# The raw element dicts are no longer needed once their fields are extracted.
df_elt = df_elt.drop(columns='visual-elements')

In [None]:
df_elt.head(2)

### Texts & ticks

In [None]:
# One row per text annotation, keyed by the chart it belongs to.
df_text = (
    df[['id', 'text']]
    .explode('text')
    .reset_index(drop=True)
    .rename(columns={"id": "chart_id"})
)

# Pull the fields out of each annotation dict. The polygons are read before the
# 'text' column is overwritten with the plain text string.
df_text['id'] = df_text['text'].apply(lambda x: x['id'])
polygons = df_text['text'].apply(lambda x: x['polygon'])
df_text['text'] = df_text['text'].apply(lambda x: x['text'])

# Read each polygon coordinate by key rather than by position in the dict, so the
# columns stay correctly labelled whatever the key order in the JSON file.
polygon_keys = ['x0', 'x1', 'x2', 'x3', 'y0', 'y1', 'y2', 'y3']
df_text[polygon_keys] = pd.DataFrame(
    [[polygon[key] for key in polygon_keys] for polygon in polygons],
    index=df_text.index,
)

In [None]:
# Axis-aligned extent of each text polygon: min and max over its four corners.
for axis in ('x', 'y'):
    corners = df_text[[f'{axis}{i}' for i in range(4)]].values
    df_text[f'{axis}_min'] = corners.min(1)
    df_text[f'{axis}_max'] = corners.max(1)

In [None]:
# One row per tick for each axis, tagged with the axis it comes from.
df_x_ticks = (
    df[['id', 'x_ticks']]
    .explode('x_ticks')
    .rename(columns={"x_ticks": "ticks"})
    .assign(axis="x")
)
df_y_ticks = (
    df[['id', 'y_ticks']]
    .explode('y_ticks')
    .rename(columns={"y_ticks": "ticks"})
    .assign(axis="y")
)

# Stack both axes into a single table keyed by chart.
df_ticks = (
    pd.concat([df_x_ticks, df_y_ticks], ignore_index=True)
    .rename(columns={"id": "chart_id"})
)

# Extract the tick id and its point coordinates; entries that are not dicts become NaN.
df_ticks['id'] = df_ticks['ticks'].apply(lambda tick: tick['id'] if isinstance(tick, dict) else np.nan)
df_ticks['x'] = df_ticks['ticks'].apply(lambda tick: tick['tick_pt']['x'] if isinstance(tick, dict) else np.nan)
df_ticks['y'] = df_ticks['ticks'].apply(lambda tick: tick['tick_pt']['y'] if isinstance(tick, dict) else np.nan)

# Drop rows with any missing value, restore integer ids, and discard the raw tick dicts.
df_ticks = (
    df_ticks
    .dropna(axis=0)
    .astype({'id': int})
    .drop("ticks", axis=1)
)

In [None]:
# Attach tick information (axis, x, y) to the texts that are tick labels;
# texts without a matching tick keep NaN in those columns.
df_text = pd.merge(df_text, df_ticks, how="left", on=['chart_id', "id"])

In [None]:
df_text.head(2)

### Target

In [None]:
# One row per data-series point, keeping the chart id alongside.
df_target = (
    df[['id', 'data-series']]
    .explode('data-series')
    .reset_index(drop=True)
)

In [None]:
# Split each data-series point into its x and y values.
for coord in ('x', 'y'):
    df_target[coord] = df_target['data-series'].apply(lambda point, coord=coord: point[coord])

In [None]:
# The raw point dicts are no longer needed once x and y are extracted.
df_target = df_target.drop(columns='data-series')

In [None]:
# df_target[df_target['y'].isna()]

In [None]:
# histograms = ["6447c2a5e487", "a613be731d61",  "60923b97d2b5",  "4163f70a77b3",  "9affc9b7cb76", "62a5fe77db68", "353fab6e4d7a",  "8e71f5f4f2d2",  "c25da96d5aaf",  "6771e4a4fab5"]
# df[df['id'].isin(histograms)]['data-series'][2851]

In [None]:
df_target.head()

### Finalize

In [None]:
# Keep only the flat, scalar columns in the chart-level table; the nested
# columns have been expanded into df_text, df_elt and df_target.
nested_columns = [
    "plot-bb", 'text', "axes",
    "x_axis", "y_axis", "x_ticks", "y_ticks",
    "data-series", "visual-elements",
]
df = df.drop(columns=nested_columns)

In [None]:
# Write each table to its CSV file, without the index.
for frame, path in (
    (df, '../input/df_train.csv'),
    (df_text, '../input/texts.csv'),
    (df_target, '../input/y_train.csv'),
    (df_elt, '../input/elements.csv'),
):
    frame.to_csv(path, index=False)

In [None]:
df.head(1)

In [None]:
df_elt.head(1)

In [None]:
df_text.head(1)

In [None]:
df_target.head(1)

### Image shape

In [None]:
# Collect the pixel height and width of every training image, in df row order.
heights, widths = [], []

for file in tqdm(df['id'].values):
    path = f'../input/train/images/{file}.jpg'
    img = cv2.imread(path)

    # cv2.imread does not raise on a missing or unreadable file: it returns None.
    # Fail here with the offending path instead of an AttributeError on `.shape`.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {path}")

    h, w = img.shape[:2]

    heights.append(h)
    widths.append(w)

In [None]:
# Add the image sizes to the chart-level table and save it again with the new columns.
df = df.assign(img_h=heights, img_w=widths)
df.to_csv('../input/df_train.csv', index=False)

In [None]:
df.head()

Done!