In [11]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import copy
from operator import itemgetter
import numpy as np

def initially_preprocessed_data_frame(csv_path='all_data.csv'):
    """Load the isotherm table from CSV and normalise it.

    Processing order:

    1. Read ``csv_path`` and decode the JSON-encoded list columns
       (``isotherm_X``, ``isotherm_Y``, ``isotherm_X_Y``,
       ``axis_minx_maxx_miny_maxy``).
    2. Replace every value of ``isotherm_X`` and ``isotherm_Y`` by its
       absolute value (``isotherm_X_Y`` is left as read).
    3. Rescale:
       * rows with ``Y_axis_type == 'mmol/g'``: y values of ``isotherm_Y``
         and ``isotherm_X_Y`` are multiplied by 22.414;
       * rows with ``Y_axis_type == 'x1000'``: y values of ``isotherm_X_Y``
         are multiplied by 1000;
       * rows of article ``'Data78'``: x values of ``isotherm_X_Y`` are
         divided by 100.
    4. Drop the excluded articles / samples listed below.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            ``'all_data.csv'`` (the previously hard-coded location).

    Returns:
        The filtered ``pandas.DataFrame``; the original row index is kept.
    """
    df = pd.read_csv(csv_path)

    # The list-valued columns are stored in the CSV as JSON text.
    for column in ('isotherm_X', 'isotherm_Y', 'isotherm_X_Y',
                   'axis_minx_maxx_miny_maxy'):
        df[column] = df[column].apply(json.loads)

    # Remove the minus sign from the separate x / y lists.
    for column in ('isotherm_X', 'isotherm_Y'):
        df[column] = df[column].apply(lambda values: [abs(num) for num in values])

    # NOTE(review): 22.414 is presumably the molar volume of an ideal gas at
    # STP (cm3 per mmol), i.e. a mmol/g -> cm3/g conversion -- confirm.
    constant = 22.414
    y_factors = {'mmol/g': constant, 'x1000': 1000}

    def rescale_pairs(pairs, unit, article):
        # Pairs are rebuilt only when a rule applies; otherwise the decoded
        # list is passed through untouched.
        y_factor = y_factors.get(unit)
        if y_factor is not None:
            pairs = [[pair[0], pair[1] * y_factor] for pair in pairs]
        if article == 'Data78':
            pairs = [[pair[0] / 100, pair[1]] for pair in pairs]
        return pairs

    # Explicit object Series (instead of row-wise DataFrame.apply) so that the
    # list cells are stored as-is and an empty frame is handled as well.
    df['isotherm_Y'] = pd.Series(
        [
            [value * constant for value in ys] if unit == 'mmol/g' else ys
            for ys, unit in zip(df['isotherm_Y'], df['Y_axis_type'])
        ],
        index=df.index,
        dtype=object,
    )
    df['isotherm_X_Y'] = pd.Series(
        [
            rescale_pairs(pairs, unit, article)
            for pairs, unit, article in zip(
                df['isotherm_X_Y'], df['Y_axis_type'], df['Article_name']
            )
        ],
        index=df.index,
        dtype=object,
    )

    # Samples removed from the data set: whole articles first, then single
    # (article, sample, figure) curves.
    excluded_articles = ('Data73', 'Data15')
    excluded_curves = (
        ('Data44', 'UAK2-800d-without_washing', '5'),
        ('Data94', 'UFFA-2-500', '2_1'),
        ('Data23', '0%BO', '2_2'),
        ('Data33', '600', '2'),
        ('Data91', 'AC-MM5', '3_3'),
    )
    keep = ~df['Article_name'].isin(excluded_articles)
    for article, sample, figure in excluded_curves:
        keep &= ~(
            (df['Article_name'] == article)
            & (df['Sample_name'] == sample)
            & (df['Figure_number'] == figure)
        )

    return df[keep]

def data_from_csv():
    """Return the isotherms, surface areas and identifying info of usable rows.

    Only rows whose ``Total_surface_area[m2/g]`` is not missing are used.

    Returns:
        A tuple ``(list_X_Y, list_BET, list_info)`` of equally long lists:
        the ``isotherm_X_Y`` values, the ``Total_surface_area[m2/g]`` values
        and, per row, ``[Article_name, Sample_name, Figure_number, Curve_type]``.
    """
    frame = initially_preprocessed_data_frame()
    # Select the rows that have a surface-area value once, then read columns.
    with_area = frame[frame['Total_surface_area[m2/g]'].notna()]

    list_X_Y = with_area['isotherm_X_Y'].to_list()
    list_BET = with_area['Total_surface_area[m2/g]'].to_list()

    info_columns = ('Article_name', 'Sample_name', 'Figure_number', 'Curve_type')
    list_info = [
        list(record)
        for record in zip(*(with_area[name].to_list() for name in info_columns))
    ]

    return list_X_Y, list_BET, list_info
    
# Load the data set once when the cell/module runs; get_data() below reads
# these three module-level names (isotherms, surface areas, per-row info).
oryginal_isotherms, bet, info = data_from_csv()
# Report how many entries each list holds.
print(f"oryginal_isotherms: {len(oryginal_isotherms)}")
print(f"oryginal_bet: {len(bet)}")
print(f"oryginal_info: {len(info)} \n")

def fit_shape_of_izotherms(oryginal_izotherms, new_size, x_range_to_cut): #sorting, add or remove points between
    """Resample every isotherm to exactly ``new_size`` points.

    For each isotherm (a list of ``[x, y]`` pairs) the function:

    1. sorts the points by ascending x (on a deep copy; the input is not
       modified);
    2. if ``x_range_to_cut`` is truthy, drops the points with
       ``x < x_range_to_cut``;
    3. prepends ``[0, 0]`` unless the first point already is ``[0, 0]``;
    4. brings the point count to ``new_size``:
       * fewer points: midpoints are inserted between neighbours. Whole
         passes (n points -> 2n - 1 points) are applied while the result
         stays below ``new_size``; the remaining points are added as
         midpoints after the first points of the curve (low-x end);
       * more points: ``new_size`` points are picked at evenly spaced
         indices (first and last point are always kept);
       * equal: the isotherm is returned as is.

    Args:
        oryginal_izotherms: List of isotherms, each a list of ``[x, y]`` pairs.
        new_size: Number of points every returned isotherm must have.
        x_range_to_cut: Lower x bound; falsy (``None``/``0``) disables the cut.

    Returns:
        A new list with one resampled isotherm per input isotherm.

    Raises:
        ValueError: If an isotherm has fewer than two points after step 3
            but needs more points (nothing to interpolate between).
            Previously this case raised ``IndexError`` or never terminated.

    Side effects:
        Prints the applied ``x_range_to_cut`` value.
    """

    def insert_midpoints(points, count):
        # Copy ``points`` and add, after each of the first ``count`` points,
        # the midpoint between that point and its successor.
        densified = []
        for index, point in enumerate(points):
            densified.append(point)
            if index < count:
                following = points[index + 1]
                densified.append([(point[0] + following[0]) / 2,
                                  (point[1] + following[1]) / 2])
        return densified

    # Sort each isotherm by x on a private copy of the input.
    isotherms = [
        sorted(points, key=itemgetter(0))
        for points in copy.deepcopy(oryginal_izotherms)
    ]

    print(f'x_range_to_cut->{x_range_to_cut}')
    if x_range_to_cut:
        isotherms = [
            [pair for pair in points if pair[0] >= x_range_to_cut]
            for points in isotherms
        ]

    # Anchor every curve at the origin. An isotherm emptied by the cut gets
    # the origin too, so the size check below can report it clearly.
    for points in isotherms:
        if not points or not np.any(points[0] == [0, 0]):
            points.insert(0, [0, 0])

    izotherms_x_y_normalized = []
    for points in isotherms:
        old_size = len(points)

        if old_size < new_size:
            if old_size < 2:
                raise ValueError(
                    f"cannot interpolate an isotherm with {old_size} point(s) "
                    f"up to {new_size} points"
                )
            # Count the whole midpoint passes that still stay below new_size.
            whole_passes = 0
            size_after_passes = old_size
            while 2 * size_after_passes - 1 < new_size:
                size_after_passes = 2 * size_after_passes - 1
                whole_passes += 1
            # Always 1 <= missing <= size_after_passes - 1, so the partial
            # pass never reads past the last point.
            missing = new_size - size_after_passes

            for _ in range(whole_passes):
                points = insert_midpoints(points, len(points) - 1)
            points = insert_midpoints(points, missing)

        elif old_size > new_size:
            # Indices of the points to keep, evenly spread over the curve.
            indices = np.linspace(0, old_size - 1, new_size, dtype=int)
            points = [points[i] for i in indices]

        izotherms_x_y_normalized.append(points)

    return izotherms_x_y_normalized



def get_data(new_size,x_range_to_cut=None, flat=True):
    """Build the feature array and target vector from the loaded isotherms.

    The module-level ``oryginal_isotherms`` are copied and resampled to
    ``new_size`` points each with ``fit_shape_of_izotherms``; the module-level
    ``bet`` values become a float array.

    Args:
        new_size: Number of points per isotherm after resampling.
        x_range_to_cut: Passed through to ``fit_shape_of_izotherms``.
        flat: When true each isotherm is flattened to ``[x, y, x, y, ...]``;
            otherwise the per-point ``[x, y]`` layout is kept.

    Returns:
        ``(X, y, info, oryginal_isotherms)`` where ``info`` and
        ``oryginal_isotherms`` are the untouched module-level objects.

    Side effects:
        Prints the chosen layout and the sizes of the returned objects.
    """
    resized = fit_shape_of_izotherms(
        copy.deepcopy(oryginal_isotherms), new_size, x_range_to_cut
    )  # every isotherm now has the same number of points
    stacked = np.array(resized)
    y = np.array(bet).astype(float)

    if not flat:
        X = stacked
        print("flat=False, returned two dimension data")
    else:
        X = np.array([sample.flatten() for sample in stacked])
        print("flat=True, returned one dimension data= [x,y,x,y,x,y,...]")

    summary = (
        ("X", X.shape),
        ("y", y.shape),
        ("info", len(info)),
        ("oryginal_isotherms", len(oryginal_isotherms)),
    )
    for label, size in summary:
        print(label, size)
    print("return: X, y, info, oryginal_isotherms")

    return X, y, info, oryginal_isotherms




# elo = get_data_for_training_standarize(40)

# X1, y1, info1, oryginal_isotherms1=get_data(40,False)

# print(X1)


oryginal_isotherms: 912
oryginal_bet: 912
oryginal_info: 912 

flat=False, returned two dimension data
X (912, 40, 2)
y (912,)
info 912
oryginal_isotherms 912
return: X, y, info, oryginal_isotherms
[[[2.96306184e-02 5.09723484e+01]
  [4.64534708e-02 5.67000020e+01]
  [7.37047937e-02 5.81849493e+01]
  ...
  [9.55040327e-01 1.15680292e+02]
  [9.75344188e-01 1.22838191e+02]
  [9.96259348e-01 1.48486552e+02]]

 [[2.68530891e-02 1.22077706e+02]
  [5.06379583e-02 1.30665851e+02]
  [7.84811739e-02 1.39263335e+02]
  ...
  [9.51148147e-01 2.13094432e+02]
  [9.74933017e-01 2.21682577e+02]
  [9.95272051e-01 2.49462965e+02]]

 [[2.60331721e-02 3.21188718e+02]
  [5.16009969e-02 3.55381196e+02]
  [7.71518411e-02 3.79617990e+02]
  ...
  [9.56976107e-01 5.70799944e+02]
  [9.77279968e-01 5.77957843e+02]
  [9.95280542e-01 5.94354970e+02]]

 ...

 [[3.16255534e-02 1.90546147e+02]
  [4.17457306e-02 1.93369971e+02]
  [5.18659077e-02 1.96193795e+02]
  ...
  [8.99430740e-01 2.37799100e+02]
  [9.50664137e-01 