This kernel is partly based on the [excellent work](https://www.kaggle.com/code/antonoof/large-eda-and-statistical-model) by [expert Antonoof](https://www.kaggle.com/antonoof), in which he conducts an analysis and supports it with a detailed EDA.
With this kernel we want to take a first look at the system's behavior; it is essentially a first glance.

#### Original.LB = 0.16

In [None]:
import os,numpy,pandas; from scipy.signal import butter,filtfilt

# Root folder of the competition data; every input below is resolved against it.
path = '/kaggle/input/physionet-ecg-image-digitization/'

# Folder with one sub-directory per training record.
TRAIN_DIR = path + 'train/'

# Metadata tables and the sample submission frame.
train = pandas.read_csv(path + 'train.csv')
test = pandas.read_csv(path + 'test.csv')
subm_1 = pandas.read_parquet(path + "sample_submission.parquet")

In [None]:
# Build one average waveform ("template") per ECG lead from the training CSVs.
leads = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

lead_templates, template_len = {}, 500

# Normalised, resampled training signals collected per lead.
PURE_signals = {lead: [] for lead in leads}

# Common sampling grid of every template (identical for all signals).
template_grid = numpy.linspace(0, 1, template_len)

# Each record file is read once and feeds every lead it contains; signals are
# still appended in `train` row order, so the per-lead means are unchanged.
for _, row in train.iterrows():
    csv_path = os.path.join(TRAIN_DIR, str(row['id']), f"{row['id']}.csv")
    if not os.path.exists(csv_path):
        continue
    try:
        df = pandas.read_csv(csv_path)
    except (OSError, ValueError):
        # Unreadable or malformed file: best effort, skip this record.
        # (pandas parser / empty-data errors derive from ValueError.)
        continue

    for lead in leads:
        if lead not in df.columns:
            continue
        try:
            s = df[lead].dropna().values.astype(numpy.float32)
        except (TypeError, ValueError):
            # Column cannot be converted to floats: skip it.
            continue
        if len(s) < 50:
            continue
        # z-score the signal, then resample it onto the template grid.
        s_norm = (s - s.mean()) / (s.std() + 1e-8)
        s_resamp = numpy.interp(
            template_grid,
            numpy.linspace(0, 1, len(s_norm)),
            s_norm,
        )
        PURE_signals[lead].append(s_resamp)

for lead in leads:
    if PURE_signals[lead]:
        lead_templates[lead] = numpy.mean(PURE_signals[lead], axis=0)
    else:
        # No usable training signal for this lead: fall back to one sine period.
        lead_templates[lead] = numpy.sin(2 * numpy.pi * template_grid)

In [None]:
# Turn the per-lead templates into one predicted series per test row,
# rescaled into [min_val, max_val].
predictions, min_val, max_val = {}, 0.0, 0.07

# Low-pass corner frequency in Hz.
cutoff_hz = 30

for _, row in test.iterrows():

    base_id, lead, n_rows = row['id'], row['lead'], row['number_of_rows']

    # Sampling rate of the record; 500 is used when the row has no `fs` field.
    fs = row.get('fs', 500)

    # Leads without a template fall back to the lead-II template.
    template = lead_templates.get(lead, lead_templates['II']).copy()

    # Resample the template to the number of samples requested for this row.
    if len(template) != n_rows:
        signal = numpy.interp(
            numpy.linspace(0, 1, n_rows),
            numpy.linspace(0, 1, len(template)),
            template
        )
    else:
        signal = template

    if len(signal) > 10:
        # butter() expects the cutoff as a fraction of the Nyquist frequency
        # (fs / 2).  The former `30*fs` was always clipped to 0.99, so the
        # filter passed almost the whole band.
        normal_cutoff = min(cutoff_hz / (0.5 * fs), 0.99)
        b, a = butter(2, normal_cutoff, btype='low')
        signal = filtfilt(b, a, signal)

    s_min, s_max = signal.min(), signal.max()

    if s_max - s_min < 1e-8:
        # Flat signal: emit the middle of the output range.
        signal = numpy.full(n_rows, (min_val + max_val) / 2)
    else:
        # Min-max scale into [min_val, max_val].
        signal = (signal - s_min) / (s_max - s_min)
        signal = min_val + signal * (max_val - min_val)

    predictions[(base_id, lead)] = signal.astype(numpy.float32)

# Flatten the predictions into long format: one row per (record, sample, lead).
data = []
for _, row in test.iterrows():
    _id, lead, n_rows = row['id'], row['lead'], row['number_of_rows']
    signal = predictions[(_id, lead)]
    data.extend(
        {'id': f"{_id}_{i}_{lead}", 'value': float(signal[i])}
        for i in range(n_rows)
    )

subm_original1 = pandas.DataFrame(data)
subm_original1

#### Original.LB = 0.17

In [None]:
from scipy.signal import butter,filtfilt

import numpy as np, pandas as pd

subm_2 = pandas.read_parquet (path + "sample_submission.parquet")

# Build one average waveform ("template") per ECG lead from the training CSVs.
leads = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

template_len, lead_templates = 500, {}

# Normalised, resampled training signals collected per lead.
list_signalov = {lead: [] for lead in leads}

# Common sampling grid of every template (identical for all signals).
template_grid = np.linspace(0, 1, template_len)

# Each record file is read once and feeds every lead it contains; signals are
# still appended in `train` row order, so the per-lead means are unchanged.
for _, row in train.iterrows():

    csv_path = os.path.join(TRAIN_DIR, str(row['id']), f"{row['id']}.csv")

    if not os.path.exists(csv_path):
        continue

    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError):
        # Unreadable or malformed file: best effort, skip this record.
        # (pandas parser / empty-data errors derive from ValueError.)
        continue

    for lead in leads:
        if lead not in df.columns:
            continue

        try:
            s = df[lead].dropna().values.astype(np.float32)
        except (TypeError, ValueError):
            # Column cannot be converted to floats: skip it.
            continue
        if len(s) < 50:
            continue

        # z-score the signal, then resample it onto the template grid.
        norma = (s - s.mean()) / (s.std() + 1e-8)
        s_resamp = np.interp(template_grid, np.linspace(0, 1, len(norma)), norma)

        list_signalov[lead].append(s_resamp)

for lead in leads:
    if list_signalov[lead]:
        lead_templates[lead] = np.mean(list_signalov[lead], axis=0)
    else:
        # No usable training signal for this lead: fall back to one sine period.
        lead_templates[lead] = np.sin(2 * np.pi * template_grid)
        

# Turn the per-lead templates into one predicted series per test row,
# rescaled into [min_val, max_val], and write the result as submission.csv.
predictions, min_val, max_val = {}, 0.0, 0.09

# Low-pass corner frequency in Hz.
cutoff_hz = 30


for _, row in test.iterrows():

    base_id, lead, n_rows = row['id'], row['lead'], row['number_of_rows']

    # Sampling rate of the record; 500 is used when the row has no `fs` field.
    fs = row.get('fs', 500)

    # Leads without a template fall back to the lead-II template.
    template = lead_templates.get(lead, lead_templates['II']).copy()

    # Resample the template to the number of samples requested for this row.
    if len(template) != n_rows:
        _1_signal =\
            np.interp(np.linspace(0, 1, n_rows), np.linspace(0, 1, len(template)), template)
    else:
        _1_signal = template

    if len(_1_signal) > 10:
        # butter() expects the cutoff as a fraction of the Nyquist frequency
        # (fs / 2).  The former `30 * fs` was always clipped to 0.99, so the
        # filter passed almost the whole band.
        normal_cutoff = min      (cutoff_hz / (0.5 * fs), 0.99)
        b,a           = butter   (2, normal_cutoff, btype='low')
        _1_signal     = filtfilt (b,a, _1_signal)

    _1s_min, _1s_Max = _1_signal.min(), _1_signal.max()

    # The flat-signal guard must look at THIS signal's range; it previously
    # tested `s_max - s_min`, stale globals left behind by an earlier cell.
    if _1s_Max - _1s_min < 1e-8:
        _1_signal = np.full(n_rows, (min_val + max_val) / 2)
    else:
        # Min-max scale into [min_val, max_val].
        _1_signal = (_1_signal - _1s_min) / (_1s_Max - _1s_min)
        _1_signal = min_val + _1_signal * (max_val - min_val)

    predictions[(base_id, lead)] = _1_signal.astype(np.float32)


# Flatten the predictions into long format: one row per (record, sample, lead).
data = []

for _, row in test.iterrows():

    base_id, lead, n_rows = row['id'], row['lead'], row['number_of_rows']

    _1_signal = predictions[(base_id, lead)]

    data.extend(
        {'id': f"{base_id}_{i}_{lead}", 'value': float(_1_signal[i])}
        for i in range(n_rows)
    )

pd.set_option('display.float_format', '{:.15f}'.format)

subm_original2 = pd.DataFrame(data)
subm_original2 . to_csv('submission.csv', index=False)
subm_original2 . head(11)

In [None]:
# Join both baselines on `id`; the two `value` columns receive pandas' default
# suffixes (`value_x` from subm_original2, `value_y` from subm_original1).
subm_comparison = pd.merge(subm_original2, subm_original1, on='id')

# Take the difference from the merged columns so each row compares values of
# the same `id`, instead of relying on the source frames sharing row order.
subm_comparison['subm_2 - subm_1'] = subm_comparison['value_x'] - subm_comparison['value_y']

subm_comparison

In [None]:
import os,ast
import numpy as np
import pandas as pd

from bokeh.plotting import figure, gridplot 
from bokeh.io import output_file, show, output_notebook
output_notebook()

In [None]:
def bokeh_show(
        params,
        df_cross,
        colors,
        show_figures1,
        show_figures2, wps_fig2,
        color_cross):
    """Plot how often each submission occupies each rank position, plus blend masses.

    Reads ``tida_desc.csv`` from the working directory and parses every cell of
    its ``alls`` column with ``ast.literal_eval`` into a list of submission names.

    Args:
        params: blend configuration dict. Keys read here: ``'subwts'``,
            ``'subm'`` (dicts with ``'name'`` and ``'weight'``) and, for the
            optional line plot, ``'path'``, ``'target'`` and ``'id'``
            (``'id'`` is used when the key is absent).
        df_cross: blended frame drawn as the extra "cross" line.
        colors: list with one colour per submission.
        show_figures1: when truthy, the per-position bar charts are shown.
        show_figures2: when truthy, a line plot of a fixed row window is shown.
        wps_fig2: accepted for interface compatibility; not used.
        color_cross: colour of the "cross" line.
    """

    def dossier(js, subms, cols):
        # For submission `js`, count in how many rows it sits at each position.
        def quant(i, js, subms, cols):
            return {"c": i, "q": sum(1 for subm in cols[i] if subm == subms[js])}
        return {
            'name': subms[js], 'color': colors[js],
            'q_in': [quant(i, js, subms, cols) for i in range(len(subms))]
        }

    alls = pd.read_csv('tida_desc.csv')
    matrix = [ast.literal_eval(str(row.alls)) for row in alls.itertuples()]
    subms = sorted(matrix[0])
    # cols[i] is the list of names found at position i across all rows.
    cols = [[data[i] for data in matrix] for i in range(len(subms))]
    dossiers = [dossier(js, subms, cols) for js in range(len(subms))]
    subm_names = [one_dossier['name'] for one_dossier in dossiers]
    # NOTE(review): `subm_names` is alphabetically sorted while weights and the
    # frames of the line plot follow params['subm'] order — confirm they agree.

    # Figure sizes depend only on the number of colours.
    n_colors = len(colors)
    height = {2: 101, 3: 134, 4: 154}.get(n_colors, 174)
    width = {5: 157, 8: 121, 9: 131, 10: 141, 11: 171}.get(n_colors, 111)
    x_names = [name.replace("Group", "").replace("subm_", "") for name in subm_names]

    figures1, qss = [], []
    for i, one_dossier in enumerate(dossiers):
        i_col = 'alls. ' + str(one_dossier['q_in'][i]['c'])
        qs = [one['q'] for one in one_dossier['q_in']]
        f = figure(x_range=x_names, width=width, height=height, title=i_col)
        f.vbar(x=x_names, width=0.585, top=qs, color=colors)
        figures1.append(f)
        qss.append(qs)
    grid = gridplot([figures1])
    output_file('tida_alls.html')
    if show_figures1:
        show(grid)

    # "Mass" of a submission: counts per position weighted by
    # (main weight of the submission + sub-weight of the position).
    sub_wts = params['subwts']
    main_wts = [subm['weight'] for subm in params['subm']]
    mms, acc_mass = [], []
    for j, one_dossier in enumerate(dossiers):
        qs = [one['q'] for one in one_dossier['q_in']]
        mm = [qs[h] * (main_wts[j] + sub_wts[h]) for h in range(len(sub_wts))]
        mms.append(mm)
        acc_mass.append(round(sum(mm)))
    y_names = [name + " - " + str(mass) for name, mass in zip(subm_names, acc_mass)]
    f1 = figure(y_range=y_names, width=313, height=height, title='relations of general masses')
    f1.hbar(y=y_names, height=0.585, right=acc_mass, left=0, color=colors)
    output_file('tida_alls2.html')

    alls = [f'alls.{i}' for i in range(len(dossiers))]
    subm = [f'sub{i}' for i in range(len(dossiers))]
    mmsT = np.asarray(mms).T
    data = {'cols': alls}
    for i in range(len(dossiers)):
        data[f'sub{i}'] = mmsT[i, :]
    f2 = figure(y_range=alls, height=height, width=274, title=" ( relations of columns masses )")
    f2.hbar_stack(subm, y='cols', height=0.585, color=colors, source=data)

    qssT = np.asarray(qss).T
    data = {'cols': alls}
    for i in range(len(dossiers)):
        data[f'sub{i}'] = qssT[i, :]
    f3 = figure(y_range=alls, height=height, width=215, title="ratios in columns")
    f3.hbar_stack(subm, y='cols', height=0.585, color=colors, source=data)
    grid = gridplot([[f3, f2, f1]])
    show(grid)

    if show_figures2:
        # Use the configured column names; they were hard-coded as 'id' and
        # 'value', which only worked when the target column was called 'value'.
        id_col, value_col = params.get('id', 'id'), params["target"]

        def read(params, i):
            FiN = params["path"] + params["subm"][i]["name"] + ".csv"
            target_name_back = {'target': params["target"], 'pred': params["target"]}
            return pd.read_csv(FiN).rename(columns=target_name_back)

        dfs = [read(params, i) for i in range(len(params["subm"]))] + [df_cross]
        f = figure(width=800, height=274)
        f.title.text = 'Click on legend entries to mute the corresponding lines'
        # Fixed window of rows that is drawn for every frame.
        b, e = 21000, 21021
        line_x = [df[b:e][id_col] for df in dfs]
        line_y = [df[b:e][value_col] for df in dfs]
        color = colors + [color_cross]
        alpha = [0.8 for i in range(len(dfs) - 1)] + [0.95]
        lws = [1.0 for i in range(len(dfs) - 1)] + [1.00]
        legend = subm_names + ['cross']
        for i in range(len(legend)):
            f.line(line_x[i], line_y[i], line_width=lws[i], color=color[i], alpha=alpha[i],
                   muted_color='white', legend_label=legend[i])
        f.legend.location = "top_left"
        f.legend.click_policy = "mute"
        show(f)


def color_scheme(dk,color):
    """Return one colour name per submission listed in ``dk['subm']``.

    Args:
        dk: configuration dict; only ``len(dk['subm'])`` is used.
        color: palette name — ``'alls'``, ``'red'``, ``'green'``, ``'blue'``,
            ``'brown'`` or ``'silver'``. Any other value selects the default
            palette.

    Returns:
        The first ``len(dk['subm'])`` colours of the selected palette. The
        default palette is truncated like the named ones (it used to be
        returned whole, whatever the number of submissions).
    """
    default_palette = ['red', 'green', 'blue', 'silver', 'gold']
    palettes = {
        'alls':   ['crimson', "darkgreen", 'mediumblue', "darkmagenta"],
        'red':    ["crimson", "orangered", "red", 'tomato', "firebrick"],
        'green':  ["green", "limegreen", "darkgreen", "forestgreen", 'lime'],
        'blue':   ["mediumblue", "steelblue", "blue", "royalblue", 'midnightblue'],
        'brown':  ["maroon", "sienna", "sandybrown", "chocolate", 'brown'],
        'silver': ['gray', 'darkgray', 'gainsboro', 'silver', 'whitesmoke'],
    }
    l = len(dk['subm'])
    return palettes.get(color, default_palette)[0:l]


def h_blend(params,color,cross='silver',
            figures1=False,figures2=False,wf2=555,
            details=False):
    """Blend several submission files with rank-dependent weights.

    Every submission CSV (``params['path'] + name + '.csv'``) is read and the
    frames are merged on the id column. For each row the submission names are
    ordered by their value (or shuffled), and the blended value is the sum of
    ``value * (main weight + sub-weight of the position the submission took)``.
    This is done once in descending and once in ascending order; the final
    value is ``desc_weight * desc_blend + asc_weight * asc_blend``.

    Side effects: writes ``tida_desc.csv`` and ``tida_asc.csv``, changes pandas
    display options and calls ``bokeh_show``.

    Args:
        params: dict with keys ``'path'``, ``'id_target'`` ([id column,
            target column]), ``'type_sort'`` ([mode, asc weight, desc weight];
            mode ``'asc/desc'`` sorts by value, any other mode shuffles),
            ``'subwts'`` (one sub-weight per position) and ``'subm'`` (dicts
            with ``'name'`` and ``'weight'``). It is deep-copied, not mutated.
        color: palette name forwarded to ``color_scheme``.
        cross: colour of the blended line in the optional line plot.
        figures1, figures2: forwarded to ``bokeh_show`` as its show flags.
        wf2: forwarded to ``bokeh_show`` as ``wps_fig2``.
        details: when true, the first rows of the ascending table are displayed.

    Returns:
        DataFrame with the id column and the blended target column.
    """

    import copy
    import random

    color_cross = cross

    dk = copy.deepcopy(params)

    show_details,show_figures1,show_figures2 = details,figures1,figures2

    type_sort    = params['type_sort'][0]
    dk['asc']    = params['type_sort'][1]
    dk['desc']   = params['type_sort'][2]
    dk['id']     = params['id_target'][0]
    dk['target'] = params['id_target'][1]
# ------------------------------------------------------------------------
    def read(dk,i):
        # Load submission i; its value column is renamed to the submission name.
        tnm = dk["subm"][i]["name"]
        FiN = dk["path"] + tnm + ".csv"
        return pd.read_csv(FiN).rename(columns={
            'target':tnm, 'pred':tnm, dk["target"]:tnm})

    def merge(dfs_subm):
        # Inner-join all submission frames on the id column.
        df_subms = pd.merge(dfs_subm[0],  dfs_subm[1], on=[dk['id']])
        for i in range(2, len(dk["subm"])):
            df_subms = pd.merge(df_subms, dfs_subm[i], on=[dk['id']])
        return df_subms

    def da(dk,sorting_direction,show_details):
        # Build the blend for one sorting direction and dump it to
        # tida_<direction>.csv.

        df_subms = merge([read(dk,i) for i in range(len(dk["subm"]))])
        cols = [col for col in df_subms.columns if col != dk['id']]
        short_name_cols = [c for c in cols]

        def alls1(x, sd=sorting_direction,cs=cols):
            # Submission names ordered by their value in this row.
            reverse = True if sd=='desc' else False
            tes = {c: x[c] for c in cs}.items()
            subms_sorted = [t[0] for t in sorted(tes,key=lambda k:k[1],reverse=reverse)]
            return subms_sorted

        def alls2(x, sd=sorting_direction,cs=cols):
            # Submission names in random order (direction is ignored).
            subms_random = [c for c in cs]
            random.shuffle(subms_random)
            return subms_random

        alls = alls1 if type_sort == 'asc/desc' else alls2

        def summa(x,cs,wts,ic_alls):
            return sum([x[cs[j]] * (wts[0][j] + wts[1][ic_alls[j]]) for j in range(len(cs))])

        # A single weight set is built here, so len(wts) is always 1 and the
        # `len(wts) > 1` branches below are currently never taken.
        wts = [[[e['weight'] for e in dk["subm"]], [w for w in dk["subwts" ]]]]

        def correct(x, cs=cols, wts=wts):
            i = [x['alls'].index(c) for c in short_name_cols]
            return summa(x,cs,wts[0],i)

        if len(wts) == 1:
            correct_sub_weights = [wt for wt in dk["subwts"]]
            weights = [subm['weight'] for subm in dk["subm"]]
            def correct(x, cs=cols, w=weights, cw=correct_sub_weights):
                # Position of every submission in this row's ordering.
                ic = [x['alls'].index(c) for c in short_name_cols]
                cS = [x[cols[j]] * (w[j] + cw[ic[j]]) for j in range(len(cols))]
                return sum(cS)

        def amxm(x, cs=cols):
            # Spread between the largest and smallest submission value.
            list_values = x[cs].to_list()
            mxm = abs(max(list_values)-min(list_values))
            return mxm

        if len(wts) > 1:
            df_subms['mx-m']   = df_subms.apply(lambda x: amxm   (x), axis=1)
        df_subms['alls']       = df_subms.apply(lambda x: alls   (x), axis=1)
        df_subms[dk["target"]] = df_subms.apply(lambda x: correct(x), axis=1)
        schema_rename = { old_nc:new_shnc for old_nc, new_shnc in zip(cols, short_name_cols) }
        df_subms = df_subms.rename(columns=schema_rename)
        df_subms = df_subms.rename(columns={dk["target"]:"ensemble"})
        # ' _ ' is a blank spacer column used only to separate groups visually.
        df_subms.insert(loc=1, column=' _ ', value=['   '] * len(df_subms))
        df_subms[' _ '] = df_subms[' _ '].astype(str)
        pd.set_option('display.max_rows',100)
        pd.set_option('display.float_format', '{:.15f}'.format)
        vcols = [dk['id']]+[' _ '] + short_name_cols + [' _ ']+['alls']+[' _ ']+['ensemble']
        # Extend with the two column names; `append` used to insert a nested
        # list, which is not a valid column selector.
        if len(wts) > 1: vcols += [' _ ', 'mx-m']
        df_subms = df_subms[vcols]
        if show_details and sorting_direction=='asc': display(df_subms.head(3))
        df_subms = df_subms.rename(columns={"ensemble":dk["target"]})
        for snc in short_name_cols: df_subms[snc] = df_subms[snc].round(15)
        df_subms.to_csv(f'tida_{sorting_direction}.csv', index=False)
        return df_subms[[dk['id'],dk['target']]]

    def ensemble_da(dk,        show_details):
        # Mix the descending and ascending blends with their configured weights.
        dfD    = da(dk,'desc', show_details)
        dfA    = da(dk,'asc',  show_details)
        dfA[dk['target']] = dk['desc']*dfD[dk['target']] + dfA[dk['target']]*dk['asc']
        return dfA

    # Bound to a new name so the local function `da` is not overwritten.
    df_blend = ensemble_da(dk,show_details)
    colors = color_scheme(dk, color)
    bokeh_show(dk, df_blend, colors, show_figures1, show_figures2, wf2, color_cross)
    return df_blend


def matrix_vs(path,fs_names,target="accident_risk"):
    """Build a table of summed absolute differences between submission files.

    Each ``path + name + '.csv'`` is read, its ``target`` column is renamed to
    the file's name, and all frames are inner-joined on ``'id'``.

    Args:
        path: directory prefix, concatenated directly with the file name.
        fs_names: list of submission file names without the ``.csv`` suffix.
        target: name of the value column in the files (default keeps the
            previously hard-coded ``'accident_risk'``).

    Returns:
        DataFrame with a ``'subm'`` column listing ``fs_names`` and one column
        per name. The cell in column ``a``, row ``b`` holds the rounded sum of
        ``|a - b|`` when ``a`` precedes ``b`` in ``fs_names``, otherwise 0.
    """
    from functools import reduce

    frames = [
        pd.read_csv(path + name + '.csv').rename(columns={target: f'{name}'})
        for name in fs_names
    ]
    # reduce() also copes with a single file, where no merge is needed.
    merged = reduce(lambda left, right: pd.merge(left, right, on='id'), frames)

    # Keyed by (earlier name, later name): no string splitting is needed, so
    # names containing '_vs_' are handled correctly.
    totals = {}
    for i, first in enumerate(fs_names):
        for second in fs_names[i + 1:]:
            totals[(first, second)] = round((merged[first] - merged[second]).abs().sum())

    m1 = pd.DataFrame({'subm': fs_names})
    m2 = pd.DataFrame({
        name: [totals.get((name, other), 0) for other in fs_names]
        for name in fs_names
    })
    matrix = pd.concat([m1, m2], axis=1)
    return matrix

In [None]:

# subm_1["value"] = subm_original2['value'] + subm_original1['value'] / 100_000
# subm_2["value"] = subm_original2['value']

# subm_1.to_csv('subm_1.csv',index=False)
# subm_2.to_csv('subm_2.csv',index=False)

# params = {
#       'path'     : '/kaggle/working/',                           # v6.lb=0.11
#       'id_target': ['id',"value"],          
#       'type_sort': ['asc/desc',0.30,0.70],
#       'subwts'   : [ +0.002,-0.002 ],       
#       'subm'     : [
#          { 'name': f'subm_1','weight': 0.003 },
#          { 'name': f'subm_2','weight': 0.997 },]
# }
# df_cross = h_blend(params, color='alls', figures1=True, details=True)

#------------------------------------------------------------------------------

# subm_1["value"] = subm_original2['value'] + subm_original1['value'] / 10_000_000
# subm_2["value"] = subm_original2['value']

# subm_1.to_csv('subm_1.csv',index=False)
# subm_2.to_csv('subm_2.csv',index=False)

# params = {
#       'path'     : '/kaggle/working/',                            # v7.lb=0.11
#       'id_target': ['id',"value"],          
#       'type_sort': ['asc/desc',0.30,0.70],
#       'subwts'   : [ +0.00002,-0.00002 ],       
#       'subm'     : [
#          { 'name': f'subm_1','weight': 0.00003 },
#          { 'name': f'subm_2','weight': 0.99997 },]
# }
# df_cross = h_blend(params, color='alls', figures1=True, details=True)


#------------------------------------------------------------------------------

# subm_1["value"] = subm_original2['value'] + subm_original1['value'] / 10_000_000_000
# subm_2["value"] = subm_original2['value']

# subm_1.to_csv('subm_1.csv',index=False)
# subm_2.to_csv('subm_2.csv',index=False)

# params = {
#       'path'     : '/kaggle/working/',                            # v8.lb=0.11
#       'id_target': ['id',"value"],          
#       'type_sort': ['asc/desc',0.30,0.70],
#       'subwts'   : [ +0.00000001,-0.00000001 ],       
#       'subm'     : [
#          { 'name': f'subm_1','weight': 0.00000001 },
#          { 'name': f'subm_2','weight': 0.99999999 },]
# }
# df_cross = h_blend(params, color='alls', figures1=True, details=True)

#------------------------------------------------------------------------------

# Two inputs for the blender: subm_2 is the second baseline as is, subm_1 is
# the same series plus a 1/100_000-scaled copy of the first baseline.
subm_2["value"] = subm_original2['value']
subm_1["value"] = subm_original2['value'] + subm_original1['value'] / 100_000

# The blender reads its inputs from CSV files, so both frames are written out.
for frame, file_name in ((subm_1, 'subm_1.csv'), (subm_2, 'subm_2.csv')):
    frame.to_csv(file_name, index=False)

params = {
    'path': '/kaggle/working/',                           # v12.lb=0.11
    'id_target': ['id', "value"],
    # [mode, weight of the ascending blend, weight of the descending blend]
    'type_sort': ['asc/desc', 0.500001, 0.499999],
    'subwts': [+0.001, -0.001],
    'subm': [
        {'name': 'subm_1', 'weight': 0.001},
        {'name': 'subm_2', 'weight': 0.999},
    ],
}
df_cross = h_blend(params, color='alls', figures1=True, details=True)

In [None]:
# Remove the two intermediate blend inputs from the working directory.
for name in ('subm_1', 'subm_2'):
    os.remove(f'/kaggle/working/{name}.csv')

# Store the blended result as the submission file and preview its first rows.
df_cross.to_csv('submission.csv', index=False)
df_cross.head(11)

In [None]:
import random

# Average of the series scaled up by one randomly drawn factor and scaled down
# by another; the factors are drawn in the same order as the operands appear.
scale_choices = [1.00007, 1.00008, 1.00010]
up_factor = random.choice(scale_choices)
down_factor = random.choice(scale_choices)

subm_original2['value'] = (
    subm_original2['value'] * up_factor * 0.50
    + subm_original2['value'] / down_factor * 0.50
)

In [None]:
# Write the randomly rescaled second baseline to 'submission.csv' (the same
# file name the earlier cells wrote to) and preview its first rows.
subm_original2.to_csv('submission.csv', index=False)
subm_original2.head(11)