# Searching for similar u_in patterns using cosine similarity
Many breaths have u_in curves with very similar shapes.  
Here, I use **cosine similarity** to find breath_ids with similar u_in patterns  
and compare their u_in and pressure curves.  

[References]  
https://www.kaggle.com/marutama/eda-about-u-in   
@marutama's great EDA notebook

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the training data and tag every row with a combined label made of
# the string form of R, an underscore, and the string form of C.
train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
train['RC'] = train['R'].astype(str).str.cat(train['C'].astype(str), sep='_')

In [None]:
# Wide table: one row per (breath_id, RC), one column per position within
# the breath, each cell holding the u_in value at that position.
# 'num' is the 0-based position of a row inside its (breath_id, RC) group.
train['num'] = train.groupby(['breath_id', 'RC']).cumcount()
u_in_table = (
    train
    .pivot_table(index=['breath_id', 'RC'], columns='num', values='u_in')
    .reset_index()  # turn breath_id / RC back into ordinary columns
)

In [None]:
def search_similar_id(target_id, th):
    """
    Find breath_ids whose u_in pattern is similar to that of a target breath.

    Candidates are restricted to rows of the module-level ``u_in_table``
    that share the target's RC value; the target itself is never returned.

    Parameters
    ----------
    target_id : int
        Target breath_id
    th : float
        Cosine similarity threshold; only candidates whose similarity is
        strictly greater than this value are returned

    Returns
    -------
    similar_id : list
        breath_ids with similar patterns, in the row order of
        ``u_in_table``. Empty if ``target_id`` is not in ``u_in_table`` or
        if no other breath shares its RC.
    """
    # One boolean mask serves both the existence check and the row lookup.
    is_target = u_in_table['breath_id'] == target_id
    if not is_target.any():
        return []

    target_row = u_in_table[is_target]
    target_rc = target_row['RC'].values[0]

    # Candidates: every other breath recorded with the same RC label.
    refer_table = u_in_table[(u_in_table['RC'] == target_rc) & ~is_target]
    if refer_table.empty:
        # cosine_similarity rejects an input with zero rows, so return early.
        return []

    # Skip the first two columns (breath_id and RC are expected there);
    # the remaining columns are the u_in values used as the feature vector.
    target_vec = target_row.iloc[:, 2:].to_numpy(dtype=float)
    refer_vec = refer_table.iloc[:, 2:].to_numpy(dtype=float)

    # Row 0 of the result holds the target's similarity to every candidate.
    cs = cosine_similarity(target_vec, refer_vec)[0]

    # Select ids with the same mask that was computed on refer_table's rows,
    # so ids and similarities cannot drift out of alignment.
    return refer_table['breath_id'].to_numpy()[cs > th].tolist()

In [None]:
def viz_similar_id(target_id, th=0.999):
    """
    Visualize and compare u_in and pressure of breath_ids with similar patterns.

    Draws a two-row figure sharing the x axis (time_step): the upper panel
    shows u_in and the lower panel shows pressure. The target breath is
    plotted first, followed by every breath returned by
    ``search_similar_id(target_id, th)``.

    Parameters
    ----------
    target_id : int
        Target breath_id
    th : float
        Cosine similarity threshold

    Raises
    ------
    IndexError
        If ``target_id`` is not present in ``u_in_table``.
    """
    rc = u_in_table[u_in_table['breath_id'] == target_id]['RC'].values[0]
    viz_id = [target_id] + search_similar_id(target_id, th)

    fig, axes = plt.subplots(figsize=(16, 9), nrows=2, sharex=True)
    for id_ in viz_id:
        # Read-only slice of this breath's rows; no copy is needed.
        tmp = train[train['breath_id'] == id_]
        label = 'breath_id : ' + str(id_)
        axes[0].plot(tmp['time_step'], tmp['u_in'], label=label)
        axes[1].plot(tmp['time_step'], tmp['pressure'], label=label)

    # Decorate each panel once, after all curves exist, instead of
    # rebuilding the legend and grid on every loop iteration.
    for ax, title in zip(axes, ('u_in', 'pressure')):
        ax.legend(loc='upper right')
        ax.grid(color='g', linestyle=':', linewidth=0.3)
        ax.set_title(title)
    fig.suptitle(f'target_id : {target_id}  (RC={rc})')

In [None]:
# Plot breath 1 next to its look-alikes, using the default threshold.
viz_similar_id(1)

In [None]:
# Plot breath 5 next to its look-alikes, using the default threshold.
viz_similar_id(5)

In [None]:
# Plot breath 2 next to its look-alikes, using the default threshold.
viz_similar_id(2)

In [None]:
# Breath 2 again, with a lower threshold so that more candidates qualify.
viz_similar_id(2, th=0.998)