**About** : This notebook evaluates trained detection models and runs the chart-to-values inference pipeline (detection, OCR and regression).

In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [None]:
import os
import cv2
import sys
import ast
import glob
import json
import yaml
import shutil
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

warnings.filterwarnings("ignore", category=UserWarning)
pd.set_option('display.width', 500)
pd.set_option('max_colwidth', 100)

In [None]:
from params import *
from util.plots import *
from inference.yolo import *
from util.metrics import *

In [None]:
# Dataset / label version. It selects the chart types kept, the class list and the
# `../input/{VERSION}/...` sub-folder used in the cells below.
VERSION = "v2"

### Utils

In [None]:
def post_process_preds(preds):
    """
    Filter raw detections using the first detected chart box.

    Points are kept only if they lie inside the chart box (with a 10 px margin).
    Texts are kept only if their top edge is below the chart's bottom edge or their
    left edge is left of the chart's left edge (with a 30 px margin).
    Ticks are returned unfiltered.

    Args:
        preds (list): Per-class box arrays ordered as [chart, text, tick, point].
            Boxes are indexed as (x0, y0, x1, y1), i.e. the "pascal_voc" format.

    Returns:
        list: [chart, texts, ticks, points] after filtering, or `preds` unchanged
            when no chart box is available.
    """
    assert VERSION != "v1"
    try:
        graph = preds[0][0]
    except (IndexError, TypeError):  # No chart detected, nothing to filter against
        return preds

    # Points are inside the graph
    points = preds[3]
    margin = 10
    points = points[points[:, 0] > graph[0] - margin]
    points = points[points[:, 1] > graph[1] - margin]
    points = points[points[:, 2] < graph[2] + margin]
    points = points[points[:, 3] < graph[3] + margin]

    # Texts are below or left of the graph
    texts = preds[1]
    margin = 30
    texts = texts[
        (texts[:, 1] > graph[3] - margin) |  # bottom : text top edge below the chart bottom edge
        (texts[:, 0] < graph[0] + margin)    # left : text left edge left of the chart left edge
    ]

    # Ticks are not filtered here
    ticks = preds[2]

    return [preds[0], texts, ticks, points]

In [None]:
def my_assignment(mat):
    """
    Greedy one-to-one assignment on a cost matrix.

    Repeatedly picks the smallest remaining cost, records its (row, column) pair and
    masks that row and column with +inf so that neither can be selected again.
    This is a greedy heuristic, it does not guarantee a minimal total cost.

    Args:
        mat (array-like [n x m]): Cost matrix. It is left untouched.

    Returns:
        list: Selected row indices, in order of selection.
        list: Selected column indices, aligned with the rows.
    """
    # Work on a private float copy: the input is not mutated and integer matrices
    # can hold the +inf mask.
    costs = np.array(mat, dtype=float)

    row_ind, col_ind = [], []
    for _ in range(np.min(costs.shape)):
        row, col = np.unravel_index(np.argmin(costs), costs.shape)
        costs[row] = np.inf
        costs[:, col] = np.inf
        row_ind.append(row)
        col_ind.append(col)

    return row_ind, col_ind

In [None]:
import numpy as np

def assign(ticks, labels, tol=2, mode="x"):
    """
    Pair each tick with its closest label using a greedy assignment.

    The cost is the euclidean distance between the tick center and a label anchor :
    the top-center of the label for mode "x", its right-center otherwise.
    Pairs on the unexpected side of the tick are penalized, and pairs whose cost is
    above max(5 * tol, tol * min_cost) are discarded.

    Args:
        ticks (np.ndarray [n x 4]): Tick boxes, indexed as (x0, y0, x1, y1).
        labels (np.ndarray [m x 4]): Label boxes, indexed as (x0, y0, x1, y1).
        tol (int, optional): Tolerance scaling the penalty and the threshold. Defaults to 2.
        mode (str, optional): "x" for x-axis labels, anything else for y-axis. Defaults to "x".

    Returns:
        np.ndarray [k x 4]: Assigned ticks.
        np.ndarray [k x 4]: Assigned labels, aligned with the ticks.
            When nothing is assigned, both arrays are empty but keep their trailing
            dimension so that callers can still index columns.
    """
    ticks, labels = np.asarray(ticks), np.asarray(labels)
    if len(ticks) == 0 or len(labels) == 0:  # np.min fails on an empty cost matrix
        return ticks[:0], labels[:0]

    if mode == "x":
        labels_x, labels_y = (labels[:, 0] + labels[:, 2]) / 2, labels[:, 1]
    else:
        labels_x, labels_y = labels[:, 2], (labels[:, 1] + labels[:, 3]) / 2
    labels_xy = np.stack([labels_x, labels_y], -1)

    ticks_x, ticks_y = (ticks[:, 0] + ticks[:, 2]) / 2, (ticks[:, 1] + ticks[:, 3]) / 2
    ticks_xy = np.stack([ticks_x, ticks_y], -1)

    cost_matrix = np.sqrt(((ticks_xy[:, None] - labels_xy[None]) ** 2).sum(-1))

    if mode == "x":  # penalize y_label < y_tick
        cost_matrix += ((ticks_y[:, None] - labels_y[None]) > 0) * np.min(cost_matrix) * tol
    else:  # penalize x_tick < x_label
        cost_matrix += ((ticks_x[:, None] - labels_x[None]) < 0) * np.min(cost_matrix) * tol

    # Pass a copy, the assignment may overwrite its input
    row_ind, col_ind = my_assignment(cost_matrix.copy())

    max_cost = max(tol * 5, tol * np.min(cost_matrix))
    kept = [
        (tick_idx, label_idx)
        for tick_idx, label_idx in zip(row_ind, col_ind)
        if cost_matrix[tick_idx, label_idx] < max_cost
    ]
    kept_ticks = np.array([tick_idx for tick_idx, _ in kept], dtype=int)
    kept_labels = np.array([label_idx for _, label_idx in kept], dtype=int)

    return ticks[kept_ticks], labels[kept_labels]

In [None]:
def restrict_on_line(preds, margin=5, cat=False):
    """
    Keep the ticks that are aligned with each other and pair them with labels.

    Tick centers are expressed relative to the chart's left edge (x) and bottom edge (y).
    For each direction, the offset shared by the most ticks (within `margin`) is selected :
    ticks with a similar x offset are the y-axis ticks, ticks with a similar y offset are
    the x-axis ticks. Ticks are then paired with labels using `assign`, and sorted from
    left to right (x-axis) and from bottom to top (y-axis).

    Args:
        preds (list): Per-class box arrays ordered as [chart, text, tick, point].
        margin (float, optional): Alignment tolerance in pixels. Defaults to 5.
        cat (bool, optional): Whether to merge x and y results back together. Defaults to False.

    Returns:
        list: `preds` unchanged (4 elements) when no chart box is available. Otherwise
            [chart, x_labels, y_labels, x_ticks, y_ticks, points] if cat is False, and
            [chart, labels, ticks, points] with duplicates removed if cat is True.
    """
    try:
        graph = preds[0][0]
    except (IndexError, TypeError):  # No chart detected
        return preds
    left_x, bottom_y = graph[0], graph[3]

    ticks = preds[2]
    ticks_x, ticks_y = (ticks[:, 0] + ticks[:, 2]) / 2, (ticks[:, 1] + ticks[:, 3]) / 2

    dists_x = ticks_x - left_x
    dists_y = ticks_y - bottom_y

    # Offset with the most neighbours within the margin
    best_x = dists_x[np.argmax([(np.abs(dists_x - d) < margin).sum() for d in dists_x])]
    best_y = dists_y[np.argmax([(np.abs(dists_y - d) < margin).sum() for d in dists_y])]

    y_ticks = ticks[np.abs(dists_x - best_x) < margin]  # similar x
    x_ticks = ticks[np.abs(dists_y - best_y) < margin]  # similar y

    # Pair with labels
    labels = preds[1]

    x_ticks, x_labels = assign(x_ticks.copy(), labels.copy())
    y_ticks, y_labels = assign(y_ticks.copy(), labels.copy(), mode="y")

    # Reorder
    order_x = np.argsort(x_ticks[:, 0])
    x_ticks = x_ticks[order_x]
    x_labels = x_labels[order_x]

    order_y = np.argsort(y_ticks[:, 1])[::-1]
    y_ticks = y_ticks[order_y]
    y_labels = y_labels[order_y]

    if not cat:
        return [preds[0], x_labels, y_labels, x_ticks, y_ticks, preds[3]]

    labels = np.unique(np.concatenate([x_labels, y_labels]), axis=0)
    ticks = np.unique(np.concatenate([x_ticks, y_ticks]), axis=0)

    return [preds[0], labels, ticks, preds[3]]
    

### Load data

In [None]:
# Load the csv tables from ../input.
# df is the main metadata table and df_target holds the per-id x / y values used as
# ground truth in the Viz section. df_text and df_elt are not used in the cells shown here.
df = pd.read_csv('../input/df_train.csv')
df_text = pd.read_csv('../input/texts.csv')
df_target = pd.read_csv('../input/y_train.csv')
df_elt = pd.read_csv('../input/elements.csv')

In [None]:
# Drop the ids listed in ANOMALIES (not defined in this notebook, presumably from params)
df = df[~df['id'].isin(ANOMALIES)].reset_index(drop=True)

In [None]:
# Attach the split information. merge() is called without `on`, so it joins on the
# columns shared by both frames. The `split` column is used below to select validation data.
df_split = pd.read_csv('../input/df_split.csv')
df = df.merge(df_split)

In [None]:
# For v2, only keep the chart types listed in CLASSES
if VERSION == "v2":
    CLASSES = [
        "dot",
        "line",
        "scatter",
    ]

    df = df[df['chart-type'].isin(CLASSES)].reset_index(drop=True)

### Model

In [None]:
class Config:
    """
    Inference configuration of the detection model.
    It is passed to `retrieve_model` and `predict` in the cells below.
    """
    selected_model = "yolo"
    bbox_format = "yolo"
    pred_format = "pascal_voc"

    # Checkpoint to load. NOTE(review): absolute, machine-specific path.
    weights = "/workspace/kaggle_benetech/logs/yolov7x-w6-v2.5/weights/best.pt"
#     weights = "/workspace/kaggle_benetech/logs/yolov7x-e6-v2./weights/best.pt"

#     size = (512, 512)
    size = (640, 640)  # Also given to get_transfos(size=...) when building the datasets

    # NMS
    conf_thresh = [0.1, 0.4, 0.2, 0.5]  # 4 values, presumably one per class (chart, text, tick, point) - TODO confirm
    max_per_img = 500
    min_per_img = 0
    iou_thresh = [0.5, 0.25, 0.25, 0.75]  # 4 values, presumably per class as well - TODO confirm

    val_bs = 16
    device = "cuda"

In [None]:
# Build the detector from Config (retrieve_model is not defined in this notebook, presumably from inference.yolo)
model = retrieve_model(Config)

### Evaluate

In [None]:
# Chart types to evaluate, one evaluation pass is run per entry
chart_types = [
#     "dot",
#     "line",
#     "vertical_bar",
#     "horizontal_bar",
    "scatter",
]

# Detection class names; their position in the list is matched against the label ids in the cells below
if VERSION == "v1":
    classes = ["x_text", "y_text", "x_tick", "y_tick", "point", "bar"]
else:
    classes = ['chart', 'text', 'tick', 'point']

In [None]:
# Validation images and labels of the current dataset version
df_val = df[df['split'] == "val"].reset_index(drop=True)
df_val['path'] = f'../input/{VERSION}/images/valid/' + df_val['id'] + '.jpg'
df_val['gt_path'] = f'../input/{VERSION}/labels/valid/' + df_val['id'] + '.txt'
df_val_ = df_val.copy()

for chart_type in chart_types:
    print(f'\n-> Chart type : {chart_type}\n')
    df_val = df_val_[df_val_['chart-type'] == chart_type].reset_index(drop=True)

    transforms = get_transfos(size=Config.size)
    dataset = InferenceDataset(df_val, transforms)

    # Detect, then give each prediction the shape of its original image
    meter = predict(model, dataset, Config)
    for i, p in enumerate(meter.preds):
        p.update_shape((df_val['img_h'][i], df_val['img_w'][i]))

    # Per-class F1 score of every image
    scores = {c: [] for c in classes}
    for idx in tqdm(range(len(dataset))):
        img, gt, shape = dataset[idx]

        # Split ground truths and predictions per class, as pascal_voc boxes
        gt = Boxes(gt, (shape[0], shape[1]), bbox_format="yolo")['pascal_voc']
        gt = [gt[dataset.classes[idx] == i] for i in range(len(classes))]
        preds = [meter.preds[idx]['pascal_voc'][meter.labels[idx] == i] for i in range(len(classes))]

        preds = post_process_preds(preds)

        # Dedicated names : the inner loop no longer shadows the chart type variable
        for class_name, gt_boxes, pred_boxes in zip(classes, gt, preds):
            metrics = compute_metrics(pred_boxes, gt_boxes)
            scores[class_name].append(metrics['f1_score'])

    for k, v in scores.items():
        print(f'{k} \t Avg F1: {np.mean(v):.3f}  \t Avg F1==1: {np.mean(np.array(v) == 1):.3f}')

### Predict
- IoU per class
- merge xticks and yticks (/labels)
- train without bars

In [None]:
# Validation images and labels of the current dataset version.
# Paths follow VERSION, like in the Evaluate section, instead of a hard-coded 'v1' folder.
df_val = df[df['split'] == "val"].reset_index(drop=True)
df_val['path'] = f'../input/{VERSION}/images/valid/' + df_val['id'] + '.jpg'
df_val['gt_path'] = f'../input/{VERSION}/labels/valid/' + df_val['id'] + '.txt'

In [None]:
# Chart types to run the prediction pipeline on
TYPES = [
#     "dot",
#     "line",
#     "vertical_bar",
#     "horizontal_bar",
    "scatter",
]

df_val = df_val[df_val['chart-type'].isin(TYPES)].reset_index(drop=True)
# df_val = df_val[df_val['source'] == "extracted"].reset_index(drop=True)

In [None]:
# Dataset fed to the detector, with the transforms built for Config.size
transforms = get_transfos(size=Config.size)
dataset = InferenceDataset(df_val, transforms)

In [None]:
%%time
# Run the detector on the whole dataset
meter = predict(model, dataset, Config)

# Give each prediction the (height, width) of its original image.
# Presumably this rescales the boxes to the original resolution - TODO confirm in Boxes.update_shape
for i, p in enumerate(meter.preds):
    p.update_shape((df_val['img_h'][i], df_val['img_w'][i]))

In [None]:
# Rebuild the dataset without transforms; the Viz loop reads the images it plots and crops for OCR from it
dataset = InferenceDataset(df_val, None)

### OCR

In [None]:
import transformers
transformers.utils.logging.set_verbosity_error()

from transformers import TrOCRProcessor
from transformers import VisionEncoderDecoderModel

from util.boxes import expand_boxes

In [None]:
# TrOCR checkpoint used for both the processor and the model
name = "microsoft/trocr-base-stage1"

processor = TrOCRProcessor.from_pretrained(name)
ocr_model = VisionEncoderDecoderModel.from_pretrained(name).cuda()  # Requires a GPU

In [None]:
def ocr(model, processor, img, boxes, plot=False, margin=0):
    """
    Reads the text inside boxes of an image.
    Each box is cropped (with an optional margin, clipped to the image borders),
    processed individually, and all the crops are decoded in a single batch.
    Tensors are moved to the GPU with .cuda().

    Args:
        model: Model exposing a generate() method, called on the batched pixel values.
        processor: Processor called on each crop, also used to decode the generated ids.
        img (np.ndarray [h x w (x c)]): Image to crop from.
        boxes (sequence of boxes): Boxes indexed as (x0, y0, x1, y1). Coordinates are used as slice bounds.
        plot (bool, optional): Whether to plot the crops with the decoded texts. Defaults to False.
        margin (int, optional): Number of pixels added around each box. Defaults to 0.

    Returns:
        list of str: Decoded texts, one per box. Empty if there are no boxes.
    """
    if len(boxes) == 0:  # Nothing to read, and torch.cat fails on an empty list
        return []

    inputs, crops = [], []
    for box in boxes:
        y0, y1 = max(box[1] - margin, 0), min(img.shape[0], box[3] + margin)
        x0, x1 = max(box[0] - margin, 0), min(img.shape[1], box[2] + margin)

        crop = img[y0: y1, x0: x1]
        crops.append(crop)
        img_p = processor(crop, return_tensors="pt").pixel_values.cuda()
        inputs.append(img_p)

    generated_ids = model.generate(torch.cat(inputs, 0))
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    if plot:
        plt.figure(figsize=(15, 5))
        for i, crop in enumerate(crops):
            plt.subplot(1, len(crops), i + 1)
            plt.imshow(crop)
            plt.title(generated_texts[i])
            plt.axis(False)
        plt.show()

    return generated_texts

In [None]:
import re

def post_process_texts(texts):
    """
    Converts OCR outputs to floats.
    Letters O / o are read as zeros, non-numeric prefixes and suffixes are stripped,
    and "," or "." are removed when every group that follows them has exactly 3 characters,
    i.e. when they look like thousands separators.
    Note that this also applies to a single "." followed by 3 digits : "0.125" is read as 125.

    TODO : fractions, powers
    B, M, K suffixes

    Args:
        texts (list of str): Texts to convert.

    Returns:
        np.ndarray: Values that were successfully converted, in input order.
        list of int: Indices of the texts that could not be converted.
    """
    values, errors = [], []
    for i, t in enumerate(texts):
        # Oo -> 0
        t = t.replace("O", "0").replace("o", "0")

        # No numeric ?
        if not any(c.isnumeric() for c in t):
            errors.append(i)
            continue

        # Prefixes or suffixes. t contains a numeric character, so both loops stop before emptying it
        while not (t[0].isnumeric() or t[0] in "-."):
            t = t[1:]
        while not t[-1].isnumeric():
            t = t[:-1]

        # Handle , then . used as thousands separators
        for sep in (",", "."):
            if all(len(group) == 3 for group in t.split(sep)[1:]):
                t = t.replace(sep, "")

        try:
            values.append(float(t))
        except ValueError:  # e.g. "1.5.2" or a remaining ","
            errors.append(i)

    assert len(values) + len(errors) == len(texts)
    return np.array(values), errors

In [None]:
def find_outliers(x_train, values, verbose=0):
    """
    Looks for at most two samples whose removal makes positions and values monotonic.
    Monotonicity is measured with the absolute Spearman correlation, which has to exceed 0.99.
    Single removals are tried first, then pairs.

    Args:
        x_train (sequence): Tick positions.
        values (sequence): Tick values, aligned with x_train.
        verbose (int, optional): Whether to print the removed indices. Defaults to 0.

    Returns:
        list of int: Indices to remove. Empty if nothing needs to be removed,
            or if removing up to two samples is not enough.
    """
    def rank_corr_without(*skipped):
        # Absolute rank correlation once the skipped indices are left out
        xs = [x for j, x in enumerate(x_train) if j not in skipped]
        vs = [v for j, v in enumerate(values) if j not in skipped]
        return np.abs(spearmanr(xs, vs).statistic)

    # Already monotonic
    if rank_corr_without() > 0.99:
        return []

    n_samples = len(x_train)

    # One outlier
    for i in range(n_samples):
        if rank_corr_without(i) > 0.99:
            if verbose:
                print(f'Remove {i}')
            return [i]

    # Two outliers
    for i in range(n_samples):
        for i2 in range(i):
            if rank_corr_without(i, i2) > 0.99:
                if verbose:
                    print(f'Remove {i}, {i2}')
                return [i, i2]

    return []

In [None]:
def longest_increasing_subset(lst):
    """
    Returns a longest strictly increasing subsequence of a sequence.
    When several subsequences have the maximal length, the one ending first is returned.

    Args:
        lst (sequence): Sequence of comparable elements.

    Returns:
        list: Elements of the subsequence, in their original order.
    """
    size = len(lst)
    if not size:
        return []

    # best_len[k] : length of the best subsequence ending at k, parent[k] : its previous element
    best_len = [1] * size
    parent = [-1] * size

    for end in range(1, size):
        for start in range(end):
            if lst[start] < lst[end] and best_len[start] + 1 > best_len[end]:
                best_len[end] = best_len[start] + 1
                parent[end] = start

    # Walk back from the first position reaching the maximal length
    cursor = best_len.index(max(best_len))
    chain = []
    while cursor != -1:
        chain.append(lst[cursor])
        cursor = parent[cursor]

    chain.reverse()
    return chain

In [None]:
def find_outliers_order(values, verbose=0):
    """
    Finds the values that break the ordering of a sequence expected to be monotonic.
    The longest increasing and decreasing subsequences of the sorting order are computed,
    and the indices outside of the longest one are returned.

    Args:
        values (sequence): Values, in positional order.
        verbose (int, optional): Unused. Defaults to 0.

    Returns:
        list: Indices of the outliers. Empty if the values are already sorted.
    """
    order = np.argsort(values)
    identity = np.arange(len(values))

    # Correct order, either increasing or decreasing
    if any((candidate == order).all() for candidate in (identity, identity[::-1])):
        return []

    increasing = longest_increasing_subset(order)
    decreasing = longest_increasing_subset(order[::-1])

    # Keep the longest chain, favoring the increasing one on ties
    kept = increasing if len(increasing) >= len(decreasing) else decreasing
    return [i for i in order if i not in kept]

In [None]:
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr, spearmanr


def linear_regression(ticks, values, errors, points, mode="x", verbose=0):
    """
    Infers the values of the points from the tick positions and the tick values.
    A linear model is fitted to map tick centers to values, after removing outliers.
    When the relation is monotonic but not linear, and linear in log-space, the model
    is fitted on the log of the values instead.

    Args:
        ticks (np.ndarray [n x 4]): Tick boxes, indexed as (x0, y0, x1, y1).
        values (np.ndarray): Values of the ticks that were successfully read.
        errors (list of int): Indices of the ticks without a value, they are discarded.
        points (np.ndarray [m x 4]): Point boxes, indexed as (x0, y0, x1, y1).
        mode (str, optional): "x" to use horizontal centers, anything else for vertical. Defaults to "x".
        verbose (int, optional): Whether to print the correlations. Defaults to 0.

    Returns:
        np.ndarray or list: One predicted value per point. A list filled with 0 if there
            are no values, or with the single value if all the values are the same.
    """
    n_points = len(points)
    if len(values) == 0:
        return [0] * n_points
    if len(np.unique(values)) == 1:
        return [values[0]] * n_points

    def without(seq, dropped):
        # Copy of seq without the elements at the dropped positions
        return np.array([item for pos, item in enumerate(seq) if pos not in dropped])

    def abs_correlations(xs, ys):
        # Absolute Pearson and Spearman correlations
        return np.abs(pearsonr(xs, ys).statistic), np.abs(spearmanr(xs, ys).statistic)

    ticks = without(ticks, errors)

    # Box centers along the axis of interest
    lo, hi = (0, 2) if mode == "x" else (1, 3)
    x_test = (points[:, lo] + points[:, hi]) / 2
    x_train = (ticks[:, lo] + ticks[:, hi]) / 2

    corr, corr_rank = abs_correlations(x_train, values)
    if verbose:
        print("Correlations before pp", corr, corr_rank)

    # Remove the samples breaking monotonicity, then the ones breaking the ordering
    outliers = find_outliers(x_train, values, verbose=verbose)
    x_train, values = without(x_train, outliers), without(values, outliers)

    outliers = find_outliers_order(values, verbose=verbose)
    x_train, values = without(x_train, outliers), without(values, outliers)

    corr, corr_rank = abs_correlations(x_train, values)
    if verbose:
        print("Correlations after pp", corr, corr_rank)

    # Monotonic but not linear : try a log scale
    log = False
    if not corr > 0.99 and corr_rank > 0.99 and np.min(values) > 0:
        corr_log = np.abs(pearsonr(x_train, np.log(values)).statistic)
        if corr_log > 0.99:
            log = True
            values = np.log(values)

    regressor = LinearRegression()
    regressor.fit(x_train[:, None], values)
    pred = regressor.predict(x_test[:, None])

    return np.exp(pred) if log else pred

In [None]:
def rounding(x):
    """
    Picks a number of decimals to round to, depending on the magnitude of a value.
    The larger the value, the fewer decimals : 0 above 40, 1 above 10, 2 above 1, and
    one more decimal per power of ten below that, down to 7 decimals above 0.00001.

    Args:
        x (float): Reference value.

    Returns:
        int: Number of decimals. 100 if x is not above the smallest threshold,
            which includes zero and negative values.
    """
    # The last threshold used to duplicate 0.0001 and could never be reached
    thresholds = [40, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    for decimals, threshold in enumerate(thresholds):
        if x > threshold:
            return decimals
    return 100

### Viz

In [None]:
# x = np.exp(np.arange(-3, 3))
# plt.plot(np.arange(-3, 3), x)
# plt.yscale('log')
# plt.grid()

In [None]:
# Whether the loop below plots detections / OCR crops and prints intermediate results
PLOT = False

In [None]:
# Full pipeline on every validation image : detection post-processing, tick / label pairing,
# OCR of the labels, regression from tick positions to values, and scoring against df_target.
# Two scores (x and y) are appended per image.
scores = []
for idx in range(len(dataset)):
#     idx = 53
    
    img, gt, _ = dataset[idx]

    id_ = df_val.id[idx]
    
    print(idx, id_, end="\t")
    title = f"{id_} - {df_val.source[idx]} {df_val['chart-type'][idx]}"
    
    # Per-class boxes : [chart, text, tick, point]
    preds = [meter.preds[idx]['pascal_voc'][meter.labels[idx] == i] for i in range(len(classes))]
    preds = post_process_preds(preds)
    
    if PLOT:
        plot_results(img, preds, figsize=(12, 7), title=title)

    # Alignment margin : 1/20 of the mean of the two first image dimensions
    margin = (img.shape[0] + img.shape[1]) / (2 * 20)
    # With a chart box, preds becomes [chart, x_labels, y_labels, x_ticks, y_ticks, points].
    # NOTE(review): without a chart box restrict_on_line returns its 4-element input,
    # so preds[4] below would raise an IndexError - confirm whether this can happen.
    preds = restrict_on_line(preds, margin=margin)
        
    if PLOT:
        plot_results(img, preds, figsize=(12, 7), title=title)
    
#     break

#     print('Target')
#     display(df_target[df_target['id'] == df_val.id[idx]][["x", "y"]])

    # OCR
    x_texts = ocr(ocr_model, processor, img, preds[1], margin=1, plot=PLOT)
    x_values, x_errors = post_process_texts(x_texts)

    if PLOT:
        print("x labels :", x_values, " - errors:", x_errors)
#     print(x_values)
#     print(preds[3])
    
    # x value of each point, from the x ticks and their values
    reg_x = linear_regression(preds[3], x_values, x_errors, preds[-1], mode="x", verbose=PLOT)

    y_texts = ocr(ocr_model, processor, img, preds[2], margin=3, plot=PLOT)
    y_values, y_errors = post_process_texts(y_texts)

    if PLOT:
         print("y labels :", y_values, " - errors:", y_errors)
    
    # y value of each point, from the y ticks and their values
    reg_y = linear_regression(preds[4], y_values, y_errors, preds[-1], mode="y", verbose=PLOT)
    
    # Ground truth values, sorted like the predictions
    gt = df_target[df_target['id'] == id_].reset_index(drop=True)
    gt[["x", "y"]] = gt[["x", "y"]].astype(float)
    gt = gt.sort_values(['x', 'y'], ignore_index=True)
    
    # NOTE(review): only x is rounded, y is left as is - confirm this is intended
    reg_x = np.round(reg_x, rounding(np.max(reg_x)))
    pred = pd.DataFrame({"x": reg_x, "y": reg_y})
    pred = pred.sort_values(['x', 'y'], ignore_index=True)
    
    # score_series is not defined in this notebook, presumably from util.metrics
    score_x = score_series(gt['x'].values, pred['x'].values)
    score_y = score_series(gt['y'].values, pred['y'].values)

    print(f"Scores  -  x: {score_x:.3f}  - y: {score_y:.3f}")
    
    scores += [score_x, score_y]

    if PLOT:
        print('GT')
        display(gt)
        print('PRED')
        display(pred)

#     if idx >= 10:
#     break

In [None]:
# Mean of all the x and y scores collected in the loop above
print(f'Scatter CV : {np.mean(scores) :.3f}')

Done ! 