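**About** : This notebook post-processes chart detections for the scatter dataset (axis tick / label matching, OCR and regression of point positions), then exports the refined labels and resized images used to train detection models.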

In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
import os
import cv2
import sys
import ast
import glob
import json
import yaml
import shutil
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

warnings.filterwarnings("ignore", category=UserWarning)
pd.set_option('display.width', 500)
pd.set_option('max_colwidth', 100)

In [None]:
from params import *
from util.plots import *
from inference.yolo import *
from util.metrics import *

from post_process.retrieve import retrieve_missing_boxes
from post_process.reg import rounding, linear_regression
from post_process.ticks import restrict_on_line, assign
from post_process.in_graph import post_process_preds

In [None]:
VERSION = "v2"

### Load data

In [None]:
CLASSES = [
    "dot",
    "line",
    "scatter",
]

In [None]:
images = glob.glob('../input/scatter/imgs/*.jpg')

In [None]:
df = pd.DataFrame({"path": images})

In [None]:
df['id'] = df['path'].apply(lambda x: x[:-4].split('/')[-1])

In [None]:
df['gt_path'] = "../input/scatter/labels/" + df['id'] + ".txt"

In [None]:
# df['missing'] = df['gt_path'].apply(lambda x: not(os.path.exists(x)))
# df['missing'].sum()

# for img in tqdm(df[df['missing']]['path']):
#     shutil.copyfile(img, "../input/scatter/imgs_m/" + img.split('/')[-1])

In [None]:
df['coords_path'] = df['path'].apply(lambda x: x[:-4] + ".csv")

# df['missing'] = df['coords_path'].apply(lambda x: not(os.path.exists(x)))
# df['missing'].sum()

In [None]:
df_val = df.copy()  # .head(10)

In [None]:
df_val = df_val.sort_values('id').reset_index(drop=True)

### Model

In [None]:
class Config:
    """
    Detector settings. The whole class is handed to `retrieve_model`,
    and `size` is also reused to build the inference transforms.
    """
    # Runtime
    device = "cuda"
    val_bs = 16

    # Model family and box conventions
    selected_model = "yolo"
    bbox_format = "yolo"
    pred_format = "pascal_voc"

    # Checkpoint to load
    # (previously tried: "/workspace/kaggle_benetech/logs/yolov7x-e6-v2./weights/best.pt")
    weights = "/workspace/kaggle_benetech/logs/yolov7x-w6-v2.5/weights/best.pt"

    # Input resolution (previously tried: (512, 512))
    size = (640, 640)

    # NMS / filtering.
    # The threshold lists have 4 entries - presumably one per detection class, TODO confirm.
    conf_thresh = [0.5, 0.2, 0.2, 0.5]
    iou_thresh = [0.5, 0.25, 0.25, 0.75]
    max_per_img = 500
    min_per_img = 0

In [None]:
model = retrieve_model(Config)

### Evaluate

In [None]:
classes = ['chart', 'text', 'tick', 'point']

### Predict

In [None]:
transforms = get_transfos(size=Config.size)
dataset = InferenceDataset(df_val, transforms)

In [None]:
# %%time
# meter = predict(model, dataset, Config)

# for i, p in enumerate(meter.preds):
#     p.update_shape(tuple(dataset[i][-1][:2]))

In [None]:
# PLOT = False

In [None]:
# shutil.rmtree('../input/scatter/preds/')
# os.makedirs('../input/scatter/preds/')

# with open("../input/scatter/preds/labels.txt", 'w') as f:
#     for c in classes:
#         f.write(c)
#         f.write('\n')

# classes

In [None]:
dataset = InferenceDataset(df_val, None)

In [None]:
# scores = []
# for idx in tqdm(range(len(dataset)), disable=PLOT):
# #     idx = 94
# #     PLOT = False

#     img, gt, _ = dataset[idx]

#     id_ = df_val.id[idx]

# #     print(idx, id_, end="\t")
#     title = f"{id_}"
    
#     preds = [meter.preds[idx]['pascal_voc'][meter.labels[idx] == i] for i in range(len(classes))]
    
#     if PLOT:
#         plot_results(img, preds, figsize=(12, 7), title=title)

#     preds = [meter.preds[idx]['yolo'][meter.labels[idx] == i] for i in range(len(classes))]
        
#     file_name = re.sub("/imgs/", "/preds/", df_val['path'][idx][:-4]) + ".txt"
#     with open(file_name, 'w') as f:
#         for c, boxes_c in enumerate(preds):
#             for box in boxes_c:
#                 if c in [0, 3]:
#                     continue
#                 str_bbox = ' '.join([str(c)] + [f"{b:.4g}" for b in box])
#                 f.write(str_bbox)
#                 f.write('\n')
                
#                 if c == 0:
#                     break

## Annots

### OCR

In [None]:
import transformers
transformers.utils.logging.set_verbosity_error()

from transformers import TrOCRProcessor
from transformers import VisionEncoderDecoderModel

from util.boxes import expand_boxes
from util.ocr import *

In [None]:
name = "microsoft/trocr-base-stage1"

processor = TrOCRProcessor.from_pretrained(name)
ocr_model = VisionEncoderDecoderModel.from_pretrained(name).cuda()

In [None]:
dataset = InferenceDataset(df_val, None)

In [None]:
def _largest_aligned_group(boxes, dists, margin, max_margin=100):
    """
    Selects the boxes forming the most populated group of near-equal offsets.

    Starting from `margin`, the tolerance is widened by 1 until at least two boxes
    are selected or the tolerance reaches `max_margin`.

    Args:
        boxes (np array [n x 4]): Candidate boxes.
        dists (np array [n]): Offset of each box along the direction being aligned.
        margin (float): Initial tolerance on the offset.
        max_margin (float, optional): Tolerance at which widening stops. Defaults to 100.

    Returns:
        np array or list: Selected boxes ([] if the initial margin is already >= max_margin).
    """
    selected = []
    while len(selected) < 2 and margin < max_margin:
        # Offset shared (within the current margin) by the largest number of boxes
        counts = [(np.abs(dists - d) < margin).sum() for d in dists]
        best = dists[np.argmax(counts)]
        selected = boxes[np.abs(dists - best) < margin]
        margin += 1
    return selected


def restrict_on_line(preds, margin_x=5, margin_y=5, cat=False):
    """
    Keeps only the ticks and labels that are aligned along each axis.

    For ticks (preds[2]) the box centers are used; for labels (preds[1]) the right
    edge (x) and the top edge (y) are used. Boxes sharing the same x offset are the
    y-axis elements, boxes sharing the same y offset are the x-axis elements.
    Boxes are assumed to be (x0, y0, x1, y1) - the calling cell converts to pascal_voc.

    Fixes over the previous version: the x loop now stops on the growing margin
    (it used to test the constant `margin_x`, looping forever when fewer than two
    boxes could be aligned), and the y selection now uses the growing margin
    (it used to filter with the constant `margin_y`, so widening had no effect).

    Args:
        preds (list of np arrays): [chart, labels, ticks, points] boxes.
        margin_x (float, optional): Initial tolerance on x offsets. Defaults to 5.
        margin_y (float, optional): Initial tolerance on y offsets. Defaults to 5.
        cat (bool, optional): Unused, kept for interface compatibility. Defaults to False.

    Returns:
        list: [chart, x_labels, y_labels, x_ticks, y_ticks, points].
    """
    try:
        graph = preds[0][0]
        x_axis, y_axis = graph[0], graph[3]
    except Exception:
        # No usable chart box: measure offsets from 0 instead
        x_axis, y_axis = 0, 0

    # Ticks (2) first, then labels (1)
    for i in [2, 1]:
        boxes = preds[i]
        if i == 2:
            pos_x = (boxes[:, 0] + boxes[:, 2]) / 2
            pos_y = (boxes[:, 1] + boxes[:, 3]) / 2
        else:
            pos_x, pos_y = boxes[:, 2], boxes[:, 1]

        dists_x = pos_x - x_axis
        dists_y = pos_y - y_axis

        ys = _largest_aligned_group(boxes, dists_x, margin_x)  # same x -> y-axis elements
        xs = _largest_aligned_group(boxes, dists_y, margin_y)  # same y -> x-axis elements

        if i == 1:
            y_labels = ys.copy()
            x_labels = xs.copy()
        else:
            y_ticks = ys.copy()
            x_ticks = xs.copy()

    return [preds[0], x_labels, y_labels, x_ticks, y_ticks, preds[3]]

In [None]:
from sklearn.linear_model import LinearRegression


def my_assignment(mat):
    """
    Greedy one-to-one assignment on a cost matrix.

    Repeatedly picks the smallest remaining cost and discards its row and column,
    until the smaller dimension is exhausted. The computation runs on a private
    float copy, so the caller's matrix is left untouched and integer matrices are
    supported (np.inf cannot be stored in an integer array).

    Args:
        mat (np array [n x m]): Cost matrix.

    Returns:
        list: Selected row indices, in picking order.
        list: Selected column indices, in picking order.
    """
    costs = np.array(mat, dtype=float)

    row_ind, col_ind = [], []
    for _ in range(np.min(costs.shape)):
        row, col = np.unravel_index(np.argmin(costs), costs.shape)
        # Discard the picked row and column
        costs[row] = np.inf
        costs[:, col] = np.inf
        row_ind.append(row)
        col_ind.append(col)

    return row_ind, col_ind


def assign(ticks, labels, tol=10, mode="x", retrieve_missing=False, verbose=0):
    """
    Pairs each tick with a label using a greedy assignment on anchor distances.

    Tick anchors are box centers. Label anchors are (horizontal center, y0) in "x"
    mode and (x1, vertical center) otherwise. Pairs whose cost is not below
    max(tol * 5, tol * min cost) are dropped.

    If retrieve_missing is True and some labels are left without a tick, a linear
    regression is fitted on the assigned pairs (label center -> tick center along
    the axis) and a tick box is synthesised for each unassigned label, using the
    mean size and the mean other coordinate of the assigned ticks.

    Args:
        ticks (np array [n x 4]): Tick boxes, (x0, y0, x1, y1).
        labels (np array [m x 4]): Label boxes, (x0, y0, x1, y1).
        tol (int, optional): Tolerance factor for the penalty and the cost threshold. Defaults to 10.
        mode (str, optional): "x" for the x-axis, anything else for the y-axis. Defaults to "x".
        retrieve_missing (bool, optional): Whether to synthesise ticks for unassigned labels. Defaults to False.
        verbose (int, optional): Prints the number of retrieved ticks if truthy. Defaults to 0.

    Returns:
        np array: Assigned (then retrieved) ticks.
        np array: Labels, in the same order as the ticks.
    """
    if mode == "x":
        labels_x, labels_y = (labels[:, 0] + labels[:, 2]) / 2, labels[:, 1]
    else:
        labels_x, labels_y = labels[:, 2], (labels[:, 1] + labels[:, 3]) / 2
    labels_xy = np.stack([labels_x, labels_y], -1)

    ticks_x, ticks_y = (ticks[:, 0] + ticks[:, 2]) / 2, (ticks[:, 1] + ticks[:, 3]) / 2
    ticks_xy = np.stack([ticks_x, ticks_y], -1)

    # Euclidean distance between every tick anchor and every label anchor
    cost_matrix = np.sqrt(((ticks_xy[:, None] - labels_xy[None]) ** 2).sum(-1))

    if mode == "x":  # penalize y_label < y_tick
        cost_matrix += (
            ((ticks_y[:, None] - labels_y[None]) > 0) * np.min(cost_matrix) * tol
        )
    else:  # penalize x_tick < x_label
        cost_matrix += (
            ((ticks_x[:, None] - labels_x[None]) < 0) * np.min(cost_matrix) * tol
        )

    row_ind, col_ind = my_assignment(cost_matrix.copy())

    # The cost matrix is no longer modified, so the threshold is loop-invariant
    max_cost = max(tol * 5, tol * np.min(cost_matrix))

    ticks_assigned, labels_assigned = [], []
    assigned_label_ids = []

    for tick_idx, label_idx in zip(row_ind, col_ind):
        if cost_matrix[tick_idx, label_idx] < max_cost:
            ticks_assigned.append(ticks[tick_idx])
            assigned_label_ids.append(label_idx)
            labels_assigned.append(labels[label_idx])

    all_labels_assigned = len(labels) == len(assigned_label_ids)
    # With no assigned pair there is nothing to fit the regression on, so return as is
    if not retrieve_missing or all_labels_assigned or not ticks_assigned:
        return np.array(ticks_assigned), np.array(labels_assigned)

    ticks_assigned = np.array(ticks_assigned)
    labels_assigned = np.array(labels_assigned)

    unassigned = np.array(labels[[i for i in range(len(labels)) if i not in assigned_label_ids]])
    if verbose:
        print("Retrieve ", len(unassigned), mode)

    if mode == "x":
        x_test = (unassigned[:, 0] + unassigned[:, 2]) / 2
        x_train = (labels_assigned[:, 0] + labels_assigned[:, 2]) / 2
        y_train = (ticks_assigned[:, 0] + ticks_assigned[:, 2]) / 2
    else:
        x_test = (unassigned[:, 1] + unassigned[:, 3]) / 2
        x_train = (labels_assigned[:, 1] + labels_assigned[:, 3]) / 2
        y_train = (ticks_assigned[:, 1] + ticks_assigned[:, 3]) / 2

    # Label position -> tick position along the axis
    reg = LinearRegression()
    reg.fit(x_train[:, None], y_train)
    pred = reg.predict(x_test[:, None])[:, None]

    # Average ticks, each as a (len(pred), 1) column
    xc = ((ticks_assigned[:, 0] + ticks_assigned[:, 2]) / 2).mean(0, keepdims=True)[None].repeat(len(pred), 0)
    yc = ((ticks_assigned[:, 1] + ticks_assigned[:, 3]) / 2).mean(0, keepdims=True)[None].repeat(len(pred), 0)
    w = (ticks_assigned[:, 2] - ticks_assigned[:, 0]).mean(0, keepdims=True)[None].repeat(len(pred), 0)
    h = (ticks_assigned[:, 3] - ticks_assigned[:, 1]).mean(0, keepdims=True)[None].repeat(len(pred), 0)

    # Replace with preds
    if mode == "x":
        xc = pred
    else:
        yc = pred

    retrieved = np.concatenate([xc - w // 2, yc - h // 2, xc + w // 2, yc + h // 2], 1).astype(int)
    ticks_assigned = np.concatenate([ticks_assigned, retrieved])
    labels_assigned = np.concatenate([labels_assigned, unassigned])

    return np.array(ticks_assigned), np.array(labels_assigned)

In [None]:
def update_and_reorder(preds, x_ticks, x_labels, y_ticks, y_labels, cat=False):
    """
    Sorts the assigned ticks / labels along their axis and rebuilds the prediction list.

    x elements are sorted by increasing tick x0, y elements by decreasing tick y0.
    Labels follow the order of their ticks.

    The point boxes are always taken from the last element of `preds`, so both the
    4-element [chart, labels, ticks, points] and the 6-element
    [chart, x_labels, y_labels, x_ticks, y_ticks, points] layouts are supported.
    The `cat` branch previously read preds[3], which is x_ticks in the 6-element layout.

    Args:
        preds (list of np arrays): Predictions; first element is the chart, last is the points.
        x_ticks (np array [n x 4]): x-axis ticks.
        x_labels (np array [n x 4]): x-axis labels, paired with x_ticks.
        y_ticks (np array [m x 4]): y-axis ticks.
        y_labels (np array [m x 4]): y-axis labels, paired with y_ticks.
        cat (bool, optional): Whether to merge x and y elements. Defaults to False.

    Returns:
        list: [chart, x_labels, y_labels, x_ticks, y_ticks, points] if not cat,
            else [chart, labels, ticks, points] with duplicated rows removed.
    """
    # Reorder
    order_x = np.argsort(x_ticks[:, 0])
    x_ticks = x_ticks[order_x]
    x_labels = x_labels[order_x]

    order_y = np.argsort(y_ticks[:, 1])[::-1]
    y_ticks = y_ticks[order_y]
    y_labels = y_labels[order_y]

    points = preds[-1]

    if not cat:
        return [preds[0], x_labels, y_labels, x_ticks, y_ticks, points]

    labels = np.unique(np.concatenate([x_labels, y_labels]), axis=0)
    ticks = np.unique(np.concatenate([x_ticks, y_ticks]), axis=0)

    return [preds[0], labels, ticks, points]

In [None]:
def approx_chart(x_ticks, y_ticks):
    """
    Approximates the chart box from the axis ticks.

    The horizontal extent comes from the x-axis ticks (smallest x0, largest x1),
    the vertical extent from the y-axis ticks (smallest y0, largest y1).

    Args:
        x_ticks (np array [n x 4]): x-axis tick boxes.
        y_ticks (np array [m x 4]): y-axis tick boxes.

    Returns:
        np array [1 x 4]: Box as [[x_min, y_min, x_max, y_max]].
    """
    x_lo, x_hi = x_ticks[:, 0].min(), x_ticks[:, 2].max()
    y_lo, y_hi = y_ticks[:, 1].min(), y_ticks[:, 3].max()
    return np.array([[x_lo, y_lo, x_hi, y_hi]])

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr, spearmanr

from post_process.outliers import find_outliers, find_outliers_order


def linear_regression(ticks, values, errors, points, mode="x", verbose=0):
    """
    Fits a linear mapping from axis values to tick positions and applies it to points.

    Ticks whose index is in `errors` are dropped; the remaining ticks are expected
    to match `values` one to one. Tick positions are the horizontal centers in "x"
    mode and the vertical centers otherwise. If the Pearson correlation is below
    0.9999, outliers are removed with `find_outliers` then `find_outliers_order`
    before fitting.

    Args:
        ticks (np array [n x 4]): Tick boxes, (x0, y0, x1, y1).
        values (list or np array): Value read for each kept tick.
        errors (list of ints): Indices of the ticks to ignore.
        points (np array [k x 1]): Values to map to positions.
        mode (str, optional): "x" for the x-axis, anything else for the y-axis. Defaults to "x".
        verbose (int, optional): Prints diagnostics if truthy. Defaults to 0.

    Returns:
        np array [k] or list: Predicted positions. A constant list is returned on the early exits.
        bool: Whether the result should be considered unreliable.
    """
    warning = False
    if len(np.unique(values)) == 1:
        print('Warning, 1 ticks on axis', mode)
        return [values[0] for _ in range(len(points))], True
    elif len(values) == 0:
        print('Warning, 0 ticks on axis', mode)
        return [0 for _ in range(len(points))], True

    # values is indexed with [:, None] below, which a plain list does not support
    values = np.asarray(values)

    ticks = np.array([t for i, t in enumerate(ticks) if i not in errors])

    if mode == "x":
        y_train = (ticks[:, 0] + ticks[:, 2]) / 2
    else:
        y_train = (ticks[:, 1] + ticks[:, 3]) / 2

    corr = np.abs(pearsonr(y_train, values).statistic)
    corr_rank = np.abs(spearmanr(y_train, values).statistic)

    if corr < 0.9999:

        if verbose:
            print("Correlations before pp", corr, corr_rank)

        outliers = find_outliers(y_train, values, verbose=verbose, corr="pearson", th=0.9999)
        y_train = np.array([x for j, x in enumerate(y_train) if j not in outliers])
        values = np.array([v for j, v in enumerate(values) if j not in outliers])

        outliers = find_outliers_order(values, verbose=verbose)
        y_train = np.array([x for j, x in enumerate(y_train) if j not in outliers])
        values = np.array([v for j, v in enumerate(values) if j not in outliers])

        corr = np.abs(pearsonr(y_train, values).statistic)
        corr_rank = np.abs(spearmanr(y_train, values).statistic)

        if verbose:
            print("Correlations after pp", corr, corr_rank)

    if corr < 0.9999:
        print(f'Warning, corr={corr:.4f}')
        warning = True

    if len(np.unique(values)) <= 1 or len(np.unique(y_train)) <= 1:
        print(f'Warning, not enough unique values')
        warning = True

    # Diagnostic dump of the training pairs, only shown on request
    if verbose:
        print("train", values, y_train, corr)

    model = LinearRegression()
    model.fit(values[:, None], y_train)
    pred = model.predict(points)

    return pred, warning


In [None]:
dataset = InferenceDataset(df_val, None)

In [None]:
# shutil.rmtree('../input/scatter/preds_final/')
# os.makedirs('../input/scatter/preds_final/')

# with open("../input/scatter/preds_final/labels.txt", 'w') as f:
#     for c in classes:
#         f.write(c)
#         f.write('\n')

# classes

In [None]:
PLOT = False
DEBUG = True
SAVE = False

In [None]:
# Post-processing loop: starts from the ground-truth boxes of a sample, keeps the
# ticks / labels aligned on the axes, pairs them, reads the labels with OCR, then
# regresses the point positions from the coordinates stored in the sample's csv.
# NOTE(review): `re` is used below but is not imported in the visible cells -
# presumably it comes from one of the star imports; confirm.
scores = []
for idx in tqdm(range(len(dataset)), disable=True):
    # NOTE(review): debug overrides - the loop index is replaced, so every
    # iteration processes sample 4, and DEBUG is forced on regardless of the flags cell.
    idx = 4
    DEBUG = True

    img, gt, shape = dataset[idx]

    id_ = df_val.id[idx]
    title = f"{id_}"
    
    print("\n", idx, id_, end="\t")
    
    # One array of boxes per class, converted to pascal_voc
    preds = [gt[dataset.classes[idx] == i] for i in range(len(classes))]
    preds = [Boxes(p, shape)['pascal_voc'] for p in preds]
    
    if DEBUG:
        plot_results(img, preds, figsize=(12, 7), title=title)
        
    # Alignment tolerance: 1/50th of the image width / height
    margin_x = img.shape[1] / 50
    margin_y = img.shape[0] / 50
    # preds becomes [chart, x_labels, y_labels, x_ticks, y_ticks, points]
    preds = restrict_on_line(preds, margin_x=margin_x, margin_y=margin_y)
    
    if DEBUG:
        plot_results(img, preds, figsize=(12, 7), title=title)
    
    # Pair ticks with labels on each axis, synthesising ticks for unmatched labels
    x_ticks, x_labels = assign(preds[3].copy(), preds[1].copy(), retrieve_missing=True, verbose=DEBUG,)
    y_ticks, y_labels = assign(preds[4].copy(), preds[2].copy(), retrieve_missing=True, verbose=DEBUG, mode="y")
    
    preds = update_and_reorder(preds, x_ticks, x_labels, y_ticks, y_labels)
    
#     if PLOT:
#         plot_results(img, preds, figsize=(12, 7), title=title)

    # First column is passed to the x regression, the remaining ones to the y regression
    coords = pd.read_csv(df_val['coords_path'][idx], header=None).values

    x_texts = ocr(ocr_model, processor, img, preds[1], margin=1, plot=False)
    x_values, x_errors = post_process_texts(x_texts)
    if DEBUG:
        print("x_labels", x_values)
    reg_x, warn_x = linear_regression(preds[3], x_values, x_errors, coords[:, :1], mode="x", verbose=0)
    
    # Skip the sample when the x regression is flagged as unreliable
    if warn_x:
        continue
    
    y_texts = ocr(ocr_model, processor, img, preds[2], margin=1, plot=False)
    y_values, y_errors = post_process_texts(y_texts)
    if DEBUG:
        print("y_labels", y_values)
    reg_y, warn_y = linear_regression(preds[4], y_values, y_errors, coords[:, 1:], mode="y", verbose=0)
    
    # Skip the sample when the y regression is flagged as unreliable
    if warn_y:
        continue

    # Replace the point boxes with squares of half-width hw around the regressed positions
    hw = 8
    preds[-1] = np.concatenate(
        [reg_x[:, None] - hw, reg_y[:, None] - hw, reg_x[:, None] + hw, reg_y[:, None] + hw],
        axis=1
    ).astype(int)
    
    # Replace the chart box with the extent of the ticks
    preds[0] = approx_chart(preds[3], preds[4])
    
    if PLOT or DEBUG:
        plot_results(img, preds, figsize=(12, 7), title=title)

    # NOTE(review): redundant - both warnings already triggered a `continue` above
    if warn_y or warn_x:
        continue
        
    if SAVE:
        file_name = re.sub("/imgs/", "/preds_final/", df_val['path'][idx][:-4]) + ".txt"
        with open(file_name, 'w') as f:
            for c, boxes_c in enumerate(preds):
                boxes_c = Boxes(boxes_c, shape, bbox_format="pascal_voc")["yolo"]
                for box in boxes_c:
                    str_bbox = ' '.join([str(c)] + [f"{b:.4g}" for b in box])
                    f.write(str_bbox)
                    f.write('\n')
                    # Only the first box of class 0 is written
                    if c == 0:
                        break
        
    # TODO: convert to yolo and save
    # In debug mode, stop after a single successfully processed sample
    if DEBUG:
        break

### Save lower dim images

In [None]:
import albumentations as albu

In [None]:
transforms = albu.Compose([
    albu.LongestMaxSize(512, always_apply=True),
    albu.ImageCompression(quality_lower=50, quality_upper=90, always_apply=True),
#     albu.Resize(640, 640, always_apply=True),
])

In [None]:
df_val['gt_path'] = df_val['gt_path'].apply(lambda x: re.sub("/labels/", "/preds_final/", x))

In [None]:
# dataset = InferenceDataset(df_val, None)
dataset = InferenceDataset(df_val, transforms)

In [None]:
SAVE = True

In [None]:
if SAVE:
    # Start from an empty output folder. On a fresh run the folder does not exist
    # yet, which made an unconditional rmtree raise FileNotFoundError.
    if os.path.isdir('../input/scatter/imgs_r/'):
        shutil.rmtree('../input/scatter/imgs_r/')
    os.makedirs('../input/scatter/imgs_r/', exist_ok=True)

In [None]:
anomalies = pd.read_csv('../input/scatter/anomalies.csv', header=None)[0].values

In [None]:
# Export the transformed image of every sample that has a final label file and is
# not listed as an anomaly. Label files of anomalies are deleted.
scores = []
for idx in tqdm(range(len(dataset)), disable=False):
    img_name = df_val['path'].values[idx]
    label_name = re.sub("/imgs/", "/preds_final/", img_name[:-4]) + ".txt"
    has_label = os.path.exists(label_name)

    is_anomaly = img_name.split('/')[-1] in anomalies
    if is_anomaly:
        # Anomalies are dropped together with their label file
        if has_label:
            os.remove(label_name)
        continue

    # Samples without final labels are not exported
    if not has_label:
        continue

    img, gt, shape = dataset[idx]

    if SAVE:
        out_name = re.sub("/imgs/", "/imgs_r/", img_name)
        cv2.imwrite(out_name, img)

In [None]:
len(os.listdir("../input/scatter/imgs_r"))

In [None]:
# x = np.exp(np.arange(-3, 3))
# plt.plot(np.arange(-3, 3), x)
# plt.yscale('log')
# plt.grid()

### Visualize final results

In [None]:
df_final = df_val.copy()

In [None]:
df_final['path'] = df_final['path'].apply(lambda x: re.sub("/imgs/", "/imgs_r/", x))
df_final = df_final[df_final['path'].apply(os.path.exists)].reset_index(drop=True)

In [None]:
df_final['gt_path'] = df_final['gt_path'].apply(lambda x: re.sub("/preds_final/", "/preds_final_2/", x))

In [None]:
print(f"Generated {len(df_final)} images")

In [None]:
dataset = InferenceDataset(df_final, None)

In [None]:
# Sanity check: plot the boxes of the first exported sample.
for idx in tqdm(range(len(dataset)), disable=False):
    # `dataset` is built from df_final (filtered and re-indexed), so the name and
    # title must be read from df_final as well; df_val rows no longer line up, and
    # `title` used to be a leftover from a previous cell.
    img_name = df_final['path'].values[idx]
    title = f"{df_final['id'].values[idx]}"

    img, gt, shape = dataset[idx]

    # One array of boxes per class, converted to pascal_voc
    preds = [gt[dataset.classes[idx] == i] for i in range(4)]
    preds = [Boxes(p, shape)['pascal_voc'] for p in preds]
    plot_results(
        img,
        preds,
        figsize=(12, 7),
        title=title,
        save_file="",  # re.sub("/imgs/", "/imgs_r/",df_val['path'].values[idx]),
        show=True
    )

    break

### Fix labels

In [None]:
SAVE = True

In [None]:
# Merge the 6 intermediate classes into 4 (x and y labels together, x and y ticks
# together) and write one label file per sample, skipping duplicated lines.
# NOTE(review): df_final['gt_path'] was pointed at /preds_final_2/ before `dataset`
# was built, so this loop may be reading the very files it rewrites - confirm the
# intended cell order before re-running.
scores = []
for idx in tqdm(range(len(dataset)), disable=False):
    img, gt, shape = dataset[idx]

    preds = [gt[dataset.classes[idx] == i] for i in range(6)]

    preds = [
        preds[0],
        np.concatenate([preds[1], preds[2]]),
        np.concatenate([preds[3], preds[4]]),
        preds[5],
    ]

    if SAVE:
        file_name = re.sub("/imgs_r/", "/preds_final_2/", df_final['path'][idx][:-4]) + ".txt"
        with open(file_name, 'w') as f:
            # Set membership instead of a list scan; the written order is unchanged
            written = set()
            for c, boxes_c in enumerate(preds):
                for box in boxes_c:
                    str_bbox = ' '.join([str(c)] + [f"{b:.4g}" for b in box])
                    if str_bbox not in written:
                        f.write(str_bbox)
                        f.write('\n')
                        written.add(str_bbox)

    # TODO: convert to yolo and save

Done!