### Basic

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm, trange

In [None]:
# Mount Google Drive at /content/drive (only works inside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Functions

In [None]:
def knee_point(df, plot=True):
  """Locate the knee (elbow) of the sorted ``distance`` curve.

  The rows of *df* are sorted by the ``distance`` column in ascending order
  and re-indexed 0..n-1. The curve is then the set of points
  (row position, distance), and the knee is the point lying farthest from
  the straight line (chord) that joins the first and the last point.

  Args:
    df: DataFrame with a numeric ``distance`` column. Its own index is
      ignored; row positions in the sorted frame are the x coordinates.
    plot: When true, show the raw curve next to the curve with the knee
      marked.

  Returns:
    ``(df_sorted, (knee_x, knee_y))`` where ``df_sorted`` is the sorted,
    re-indexed copy of *df*, ``knee_x`` is the knee's row position in
    ``df_sorted`` and ``knee_y`` is the ``distance`` value at that row.

  Raises:
    ValueError: If *df* has no rows.
  """
  df_sorted = df.sort_values(by='distance').reset_index(drop=True)
  if df_sorted.empty:
    raise ValueError("knee_point() needs at least one row")

  y = df_sorted['distance'].to_numpy()
  # x must be the position in the *sorted* frame. Using the input frame's
  # index would pair every distance with an unrelated label whenever the
  # input does not carry a default 0..n-1 index.
  x = np.arange(len(y))

  # Express every point relative to the first one so the chord starts at
  # the origin.
  points = np.column_stack((x, y)).astype(float)
  offsets = points - points[0]
  chord = offsets[-1]
  chord_len = np.sqrt(np.sum(chord ** 2))

  if chord_len == 0:
    # Only reachable with a single row: there is no chord to measure
    # against, so every point is at distance zero.
    distances = np.zeros(len(y))
  else:
    chord_unit = chord / chord_len
    # Perpendicular distance = length of what is left after removing the
    # component of each offset that lies along the chord.
    along_chord = np.outer(offsets @ chord_unit, chord_unit)
    distances = np.sqrt(np.sum((offsets - along_chord) ** 2, axis=1))

  knee_index = int(np.argmax(distances))
  knee_x = knee_index
  knee_y = y[knee_index]

  if plot:
    _, (ax_curve, ax_knee) = plt.subplots(1, 2, figsize=(9, 4))

    for ax, title in ((ax_curve, "Data plot"), (ax_knee, "Knee Point of Data")):
      ax.plot(x, y)
      ax.set_xlabel("index")
      ax.set_ylabel("Distances")
      ax.set_title(title)
      for label in ax.get_xticklabels():
        label.set_rotation(45)

    # Mark the knee on the second panel only.
    ax_knee.plot(knee_x, knee_y, 'rx')
    ax_knee.axvline(x=knee_x, color='r', linestyle='--')

    plt.show()

  print(f"Knee point for distance at x = {knee_x}, y = {knee_y}")
  print("-"*100, end="\n\n")

  return df_sorted, (knee_x, knee_y)

In [None]:
def plot_maker(df, knee = (0, 0), before=False, after=False, top=False):
  """Draw grids of sample curves taken from around the knee point.

  A random sample of up to 100 rows located after the knee is always
  drawn. The flags add further grids of up to 100 rows each.

  Args:
    df: DataFrame sorted by ``distance``; assumed to carry the 0..n-1
      index produced by ``knee_point`` so that labels equal positions.
    knee: ``(knee_x, knee_y)`` pair; only ``knee_x`` (row position of the
      knee) is used.
    before: Also draw the rows immediately preceding the knee.
    after: Also draw the rows immediately following the knee.
    top: Also draw the last rows of the frame (the tail of the rows that
      follow the knee).
  """
  knee_x, _ = knee
  df_filtered = df[df.index > knee_x]

  if before:
    print("100 samples just before the knee point")
    # Clamp the start at 0: a negative start would be read as "count from
    # the end" and select the wrong rows whenever knee_x < 100.
    start = max(knee_x - 100, 0)
    sample_data = df.iloc[start:knee_x]
    draw_plots(sample_data)
  if after:
    print("100 samples just after the knee point")
    sample_data = df_filtered.head(100)
    draw_plots(sample_data)
  if top:
    print("Top 100 samples")
    sample_data = df_filtered.tail(100)
    draw_plots(sample_data)

  # Random sample of the rows after the knee. sample() raises if asked for
  # more rows than exist, so cap the request at the number available.
  print("Random 100 samples after the knee point")
  n_random = min(100, len(df_filtered))
  sample_data = df_filtered.sample(n=n_random, random_state=69)
  draw_plots(sample_data)

In [None]:
def draw_plots(sample_data):
  """Plot every row of *sample_data* in its own subplot, five per grid row.

  For each row, ``row['x']`` is plotted against ``row['y']`` on a
  logarithmic y axis with the x axis fixed to [1, 5]; the row's
  ``distance`` value is used as the subplot title. The grid has as many
  rows as needed for the data (100 rows give a 20 x 5 grid), and unused
  cells in the last grid row are hidden.

  Args:
    sample_data: DataFrame with ``x``, ``y`` and ``distance`` columns;
      ``x`` and ``y`` are presumably array-like per row -- confirm against
      the parquet schema.
  """
  n_cols = 5
  n_samples = len(sample_data)

  if n_samples == 0:
    # subplots() cannot build a grid with zero rows; report and move on.
    print("No samples to plot")
    print("-"*100, end="\n\n")
    return

  n_rows = (n_samples + n_cols - 1) // n_cols  # ceiling division
  # squeeze=False keeps a 2-D array of axes even when there is one row.
  _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
  axes = axes.flatten()

  for ax, (_, row) in zip(axes, sample_data.iterrows()):
    ax.plot(row['x'], row['y'])

    ax.set_yscale('log')
    ax.set_xlim([1, 5])

    ax.set_title(f"{row['distance']}")
    ax.set_xlabel("x")
    ax.set_ylabel("y")

  # Hide the cells of the last grid row that received no data.
  for ax in axes[n_samples:]:
    ax.set_visible(False)

  plt.tight_layout()
  plt.show()

  print("-"*100, end="\n\n")

### Read

In [None]:
# Load a single parquet file from the mounted Google Drive folder.
file_path = "/content/drive/MyDrive/Data/param_parquets/ch2_cla_l1_2024_01.parquet"
df = pd.read_parquet(file_path)

# Find the knee of the sorted distance curve, then draw sample grids around
# it: rows just before the knee, just after it, the tail of the sorted data,
# and the random sample that plot_maker always draws.
df_sorted, knee = knee_point(df)
plot_maker(df_sorted, knee=knee, before=True, after=True, top=True)