**About** : This notebook prepares the datasets (in YOLO and COCO formats) used to train detection models.

In [None]:
# %load_ext nb_black
# Enable IPython's autoreload extension. Mode 2 is meant to re-import edited
# modules automatically before each cell runs (see the IPython autoreload docs).
%load_ext autoreload
%autoreload 2

In [None]:
# Work from the source folder: the relative '../input/' and '../yolov7/' paths
# used below are resolved from here (presumably `params` / `util` live here too).
cd ../src/

## Initialization

### Imports

In [None]:
import os
# Set in its own cell, ahead of the other imports.
# NOTE(review): "-1" presumably hides every GPU so the notebook runs on CPU — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "-1"

In [None]:
import os
import cv2
import sys
import ast
import glob
import json
import yaml
import shutil
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

# DataFrame display settings: total console width and maximum column text width.
for _option, _value in (('display.width', 500), ('max_colwidth', 100)):
    pd.set_option(_option, _value)

In [None]:
from params import *
from util.plots import plot_annotated_image, plot_sample
from util.torch import seed_everything
from util.yolo import *

### Load data

In [None]:
# Read the four CSV inputs. Later cells use `df` for per-chart rows
# ('id', 'source', 'chart-type'), and group `df_text` / `df_elt` by 'chart_id'.
df, df_text, df_target, df_elt = map(
    pd.read_csv,
    (
        '../input/df_train.csv',
        '../input/texts.csv',
        '../input/y_train.csv',
        '../input/elements.csv',
    ),
)

### Split

In [None]:
# SEED = 42
# seed_everything(SEED)

# split = {}
# for i in range(len(df)):
#     split[df['id'][i]] = "train"

#     if df['source'][i] == "extracted":
#         split[df['id'][i]] = "val"
        
#         if df['chart-type'][i] == "horizontal_bar":
#             if np.random.random() > 0.3:
#                 split[df['id'][i]] = "train"
#     else:
#         if df['chart-type'][i] == "dot":
#             if np.random.random() < 0.2:
#                 split[df['id'][i]] = "val"
                
# df_split = pd.DataFrame.from_dict(split, orient="index").reset_index()
# df_split.columns = ['id', 'split']
# df_split.to_csv('../input/df_split.csv', index=False)

## EDA

In [None]:
# Split follows the image source: generated charts train, extracted charts validate.
split_by_source = {"generated": "train", "extracted": "val"}
df['split'] = df['source'].map(split_by_source)

In [None]:
# df_split = pd.read_csv('../input/df_split.csv')
# df = df.merge(df_split)

# sns.countplot(x="chart-type", hue="split", data=df)
# # plt.yscale('log')
# plt.show()

In [None]:
# Number of charts per chart type, split by source, on a log-scaled count axis.
ax = sns.countplot(data=df, x="chart-type", hue="source")
ax.set_yscale('log')
plt.show()

In [None]:
# df[df['source'] != "generated"].head()
# df.head()

In [None]:
# Chart ids excluded from the dataset (filtered out of `df` with `isin`).
# Each id is listed once; '3ef41bbc82c3' used to appear twice in the last group.
ANOMALIES = [
    # DUPLICATED STUFF
    'ae686738e744', 'c76f6d0d5239', '760c3fa4e3d9', 'c0c1f4046222',
    '3e568d136b85', '913447978a74', '2ff071a45cce', 'a9a07d74ee31',
    # MISSING or MISLABELED TICKS ANNOTS
    "36079df3b5b2", "3968efe9cbfc", "6ce4bc728dd5",
    "733b9b19e09a", "aa9df520a5f2", "d0cf883b1e13",
    # WEIRD
    "9f6b7c57e6cd", "e1034ff92655", "e796b10718bd", "f8bdbaf0b97d",
    "3ef41bbc82c3", "73cfbba65962", "872d1be39bae",
]

In [None]:
# Remove the charts flagged in ANOMALIES and renumber the remaining rows.
is_anomaly = df['id'].isin(ANOMALIES)
df = df.loc[~is_anomaly].reset_index(drop=True)

In [None]:
# file = np.random.choice(df[df['chart-type'] == "dot"].id.values)

In [None]:
# file = '6f36d53ecec8'

In [None]:
# NOTE(review): `file` is not assigned by any active cell above — both candidate
# assignments (random "dot" chart / fixed id) are commented out — so unless one of
# the star imports provides it, this line raises NameError. Uncomment one first.
fig = plot_annotated_image(file)

In [None]:
# Chart types kept in `df` (see the filter at the end of this cell).
# Comment / uncomment entries to change the selection.
CHART_TYPES = [
#     "vertical_bar",
#     "horizontal_bar",
    "dot",
#     "line",
    "scatter",
]

# Object classes exported to the label files. The same list provides the
# class ids (`label_dict`) and the `nc` / `names` entries of the data yaml.
CLASSES = [
#     "chart",
#     "text",
#     "tick",
    "point",
]

# When True, the "Extra data" cell also copies the additional scatter images/labels
# into the train folders.
USE_GENERATED = True

# Dataset version: used in the output folder name (v{VERSION}) and the yaml file name.
VERSION = 13

# Restrict the charts to the selected chart types.
df = df[df['chart-type'].isin(CHART_TYPES)].reset_index(drop=True)

In [None]:
# df = df[df['source'] == "extracted"]

In [None]:
# df = df.merge(df_target.groupby('id').agg(list), on="id", how="left")

In [None]:
# df['y'] = df['y'].apply(lambda x: np.min(np.array(x).astype(float)))

In [None]:
# df['x'] = df['x'].apply(lambda x: np.min(np.array(x).astype(float)))

In [None]:
# df[df['y'] < 0].id.values

In [None]:
# sns.countplot(x=np.clip(df.y.values // 10, -10, 50))

## Yolo preparation

### Folders

In [None]:
# Root folders for the yaml config (YOLO_PATH) and the generated dataset (DATA_PATH).
# Both end with '/', and the f-strings in the next cell add another one, so the
# resulting paths contain '//' (e.g. '../input//v13/images/train/').
YOLO_PATH = '../yolov7/'
DATA_PATH = '../input/'

In [None]:
# Class name -> integer id, following the order of CLASSES.
label_dict = dict(zip(CLASSES, range(len(CLASSES))))
label_dict

# Image folders (train / valid)
yolo_train_img_dir = f'{DATA_PATH}/v{VERSION}/images/train/'
yolo_valid_img_dir = f'{DATA_PATH}/v{VERSION}/images/valid/'

# Label folders (train / valid)
yolo_train_label_dir = f'{DATA_PATH}/v{VERSION}/labels/train/'
yolo_valid_label_dir = f'{DATA_PATH}/v{VERSION}/labels/valid/'

# Dataset config file
yaml_file = f'{YOLO_PATH}/data_{VERSION}.yaml'

yolo_dirs = (
    yolo_train_img_dir,
    yolo_valid_img_dir,
    yolo_train_label_dir,
    yolo_valid_label_dir,
)

# Start from empty folders: create them so that rmtree always has a target,
# wipe whatever a previous run left behind, then create them again.
for yolo_dir in yolo_dirs:
    os.makedirs(yolo_dir, exist_ok=True)

for yolo_dir in yolo_dirs:
    shutil.rmtree(yolo_dir)

for yolo_dir in yolo_dirs:
    os.makedirs(yolo_dir, exist_ok=True)

yolo_dirs

### Loop

In [None]:
# Keep only the text rows that have an axis value (ignore titles).
has_axis = df_text['axis'].notna()
df_text = df_text[has_axis].reset_index(drop=True)

In [None]:
# Per-chart text annotations: chart_id -> its rows of df_text, re-indexed from 0.
dfts = {
    chart_id: chart_texts.reset_index(drop=True)
    for chart_id, chart_texts in tqdm(df_text.groupby('chart_id'))
}

In [None]:
# Per-chart element annotations: chart_id -> its rows of df_elt, re-indexed from 0.
dfes = {
    chart_id: chart_elements.reset_index(drop=True)
    for chart_id, chart_elements in tqdm(df_elt.groupby('chart_id'))
}

In [None]:
# PLOT: show every processed chart (when False, only every 10000th one is shown).
# SAVE: copy the images and write the label files into the dataset folders.
PLOT = False
SAVE = True

In [None]:
# (Re)compute the split on the filtered frame: generated -> train, extracted -> val.
split_by_source = {"generated": "train", "extracted": "val"}
df['split'] = df['source'].map(split_by_source)

In [None]:
# Names of the per-class box lists returned by `extract_bboxes_2`, in order
# (the k-th returned list is paired with labels[k] below).
labels = ["chart", "text", "tick", "point"]

n_skipped = 0  # charts without text or element annotations

# One iteration per chart id, so the progress total is the number of groups.
for i, (id_, dfg) in tqdm(enumerate(df.groupby('id')), total=df['id'].nunique()):
    img_file = f'../input/train/images/{id_}.jpg'
    src = dfg['source'].values[0]
    split = dfg['split'].values[0]

    # Anything that is not explicitly "train" goes to the validation folders.
    if split == 'train':
        yolo_img_dir, yolo_label_dir = yolo_train_img_dir, yolo_train_label_dir
    else:
        yolo_img_dir, yolo_label_dir = yolo_valid_img_dir, yolo_valid_label_dir

    # Charts missing from either annotation table cannot be labelled: skip them,
    # but keep count so that the omission is visible at the end.
    if id_ not in dfts or id_ not in dfes:
        n_skipped += 1
        continue
    dft = dfts[id_]
    dfe = dfes[id_]

    # Extract boxes, then keep the selected classes *in CLASSES order* so that the
    # class index written below matches `label_dict` and the `names` of the data yaml.
    all_boxes = extract_bboxes_2(dfg, dft, dfe, dfg['img_h'].values[0], dfg['img_w'].values[0])
    boxes_by_label = dict(zip(labels, all_boxes))
    boxes = [boxes_by_label[name] for name in CLASSES if name in boxes_by_label]

    if SAVE:
        # Validate before touching the disk, so a failure never leaves an image
        # without its label file.
        if len(boxes) != len(CLASSES):
            raise ValueError(
                f"{id_}: got boxes for {len(boxes)} classes, expected {len(CLASSES)}"
            )

        # Copy image
        shutil.copyfile(img_file, f'{yolo_img_dir}/{id_}.jpg')

        # Save boxes: one line per box (class index followed by the box values,
        # 4 significant digits). dict.fromkeys drops duplicates and keeps order.
        unique_lines = dict.fromkeys(
            ' '.join([str(c)] + [f"{b:.4g}" for b in box])
            for c, boxes_c in enumerate(boxes)
            for box in boxes_c
        )
        with open(f'{yolo_label_dir}/{id_}.txt', 'w') as f:
            f.writelines(f"{line}\n" for line in unique_lines)

    # Visual check: every chart when PLOT is set, otherwise every 10000th one.
    if PLOT or i % 10000 == 0:
        img = cv2.imread(img_file)
        plot_sample(img, boxes)
        plt.title(f"{id_} - {src} {dfg['chart-type'].values[0]}")
        plt.show()

if n_skipped:
    print(f"Skipped {n_skipped} charts without text / element annotations")

#### Extra data

In [None]:
from util.boxes import Boxes
from util.plots import plot_results

In [None]:
CLASSES

In [None]:
# Index in `labels` -> index in CLASSES, or -1 for classes that are not exported.
mapping = {
    idx: (CLASSES.index(name) if name in CLASSES else -1)
    for idx, name in enumerate(labels)
}
mapping

In [None]:
if USE_GENERATED:
    # (image folder, label folder) pairs of additional scatter data that is added
    # to the train split.
    EXTRA_DATA_PATHS = [
        ("../input/scatter/imgs_r/", "../input/scatter/preds_final_2/"),
        ("../input/scatter/imgs_r_v2/", "../input/scatter/preds_v2_final/"),
        ("../input/scatter/imgs_r_v3/", "../input/scatter/preds_v3_final/"),
    ]

    for img_path, gt_path in EXTRA_DATA_PATHS:
        # Images are copied unchanged.
        for file in tqdm(sorted(os.listdir(img_path))):
            shutil.copyfile(os.path.join(img_path, file), os.path.join(yolo_train_img_dir, file))

        # Label files are rewritten with the class ids remapped through `mapping`;
        # boxes of classes that are not exported (mapping == -1) are dropped.
        for file in tqdm(sorted(os.listdir(gt_path))):
            remapped = []
            with open(os.path.join(gt_path, file), 'r') as f:
                for line in f:
                    # Strip only the line terminator: slicing off the last character
                    # would eat a digit when the final line has no trailing newline.
                    fields = line.rstrip('\n').split(None, 1)
                    if not fields:
                        continue  # blank line

                    # The class id is the whole first field, not just its first digit.
                    new_c = mapping[int(fields[0])]
                    if new_c == -1:
                        continue

                    remapped.append(' '.join([str(new_c)] + fields[1:]) + '\n')

            with open(os.path.join(yolo_train_label_dir, file), 'w') as f:
                f.writelines(remapped)

#### Data yaml

In [None]:
# Dump config file
data_yaml = dict(
    train=yolo_train_img_dir,
    val=yolo_valid_img_dir,
    nc=len(CLASSES),
    names=CLASSES
)

print(data_yaml)

with open(yaml_file, 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)

yaml_file

### To coco

In [None]:
from globox import AnnotationSet

#### Val

In [None]:
# Load the validation split (label files + matching images) as a globox AnnotationSet.
valid_label_dir = f"{DATA_PATH}/v{VERSION}/labels/valid/"
valid_image_dir = f"{DATA_PATH}/v{VERSION}/images/valid/"
yolo = AnnotationSet.from_yolo_v5(folder=valid_label_dir, image_folder=valid_image_dir)

In [None]:
yolo.show_stats()

In [None]:
# Write the validation annotations as COCO json, creating the folder if needed.
annotations_dir = f'{DATA_PATH}/v{VERSION}/annotations/'
os.makedirs(annotations_dir, exist_ok=True)
yolo.save_coco(f"{DATA_PATH}/v{VERSION}/annotations/val2017.json", auto_ids=True)

In [None]:
# Rename images/valid/ to val2017. If val2017 already exists (notebook re-run with
# the same VERSION), shutil.move would place the folder *inside* it, so the stale
# copy is removed first.
coco_val_dir = f"{DATA_PATH}/v{VERSION}/val2017"
if os.path.isdir(coco_val_dir):
    shutil.rmtree(coco_val_dir)
shutil.move(f"{DATA_PATH}/v{VERSION}/images/valid/", coco_val_dir)

#### Train

In [None]:
# Load the train split (label files + matching images) as a globox AnnotationSet.
train_label_dir = f"{DATA_PATH}/v{VERSION}/labels/train/"
train_image_dir = f"{DATA_PATH}/v{VERSION}/images/train/"
yolo = AnnotationSet.from_yolo_v5(folder=train_label_dir, image_folder=train_image_dir)

In [None]:
yolo.show_stats()

In [None]:
# Write the train annotations as COCO json next to the validation ones.
train_coco_file = f"{DATA_PATH}/v{VERSION}/annotations/train2017.json"
yolo.save_coco(train_coco_file, auto_ids=True)

In [None]:
# Rename images/train/ to train2017. If train2017 already exists (notebook re-run
# with the same VERSION), shutil.move would place the folder *inside* it, so the
# stale copy is removed first.
coco_train_dir = f"{DATA_PATH}/v{VERSION}/train2017"
if os.path.isdir(coco_train_dir):
    shutil.rmtree(coco_train_dir)
shutil.move(f"{DATA_PATH}/v{VERSION}/images/train/", coco_train_dir)

In [None]:
len(os.listdir(f"{DATA_PATH}/v{VERSION}/train2017"))

In [None]:
len(os.listdir(f"{DATA_PATH}/v{VERSION}/val2017"))

In [None]:
# Sanity check of the COCO train folder: list every file that is not a .jpg/.png
# or that OpenCV cannot decode. Offending files are only printed, not deleted.
for file in tqdm(glob.glob(f"{DATA_PATH}/v{VERSION}/train2017/*")):
    try:
        # The extension is tested first, so non-image files are never decoded.
        is_valid = file.endswith(('.jpg', '.png')) and cv2.imread(file) is not None
    except cv2.error:
        # A decoding error counts as an invalid image; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        is_valid = False

    if not is_valid:
        print(file)

Done ! 