In [None]:
import ast
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def read_csv(file_name, data_dir='Data'):
    """Load a CSV file from the project data directory.

    Args:
        file_name: Name of the CSV file, relative to ``data_dir``.
        data_dir: Directory that holds the CSV files. Defaults to ``'Data'``
            (resolved against the current working directory), so existing
            one-argument calls behave as before.

    Returns:
        pandas.DataFrame: Whatever ``pd.read_csv`` parses from the file.
    """
    return pd.read_csv(f'{data_dir}/{file_name}')

def convert_to_tuple(df, column):
    """Parse a column of tuple-like strings into tuples of ints, in place.

    Each cell such as ``"(12, 34)"`` becomes ``(12, 34)``. The column of
    ``df`` is overwritten; nothing is returned.

    Cells that already hold a tuple or list are normalised to a tuple of ints
    instead of being re-parsed, so running the same notebook cell twice does
    not fail.

    Args:
        df: DataFrame to modify.
        column: Name of the column holding the serialized tuples.

    Raises:
        ValueError: If a component of a cell cannot be converted with ``int``.
    """
    def _parse(value):
        # Already parsed (e.g. the cell was re-run): just normalise the type.
        if isinstance(value, (tuple, list)):
            return tuple(int(item) for item in value)
        tokens = value.strip().strip('()').split(',')
        # Ignore empty tokens so "(5,)" (trailing comma) parses to (5,).
        return tuple(int(token) for token in tokens if token.strip())

    df[column] = df[column].apply(_parse)

def calculate_distance(croped_slice_shape, center_min_circle, ct_numbers_in_heart, ct_number):
    """Return six absolute distances from a point to the faces of a bounding box.

    The box runs from 0 to ``croped_slice_shape[0]`` on the first axis, from 0
    to ``croped_slice_shape[1]`` on the second axis, and from the smallest to
    the largest value of ``ct_numbers_in_heart`` on the third axis. The point
    is ``(center_min_circle[0], center_min_circle[1], ct_number)``.

    Args:
        croped_slice_shape: Two-element sequence with the upper bounds of the
            first two axes.
        center_min_circle: Two-element sequence with the point's first two
            coordinates.
        ct_numbers_in_heart: Non-empty iterable whose min/max bound the third axis.
        ct_number: The point's coordinate on the third axis.

    Returns:
        list: ``[to_x_low, to_x_high, to_y_low, to_y_high, to_z_low, to_z_high]``.
    """
    x_high, y_high = croped_slice_shape
    z_low = min(ct_numbers_in_heart)
    z_high = max(ct_numbers_in_heart)
    cx, cy = center_min_circle

    # (coordinate, bound) pairs in the order of the returned list.
    pairs = (
        (cx, 0), (cx, x_high),
        (cy, 0), (cy, y_high),
        (ct_number, z_low), (ct_number, z_high),
    )
    return [abs(coordinate - bound) for coordinate, bound in pairs]

# Columns produced by calculate_distance, in the order it returns them.
DIST_COLUMNS = ['dist_to_x1', 'dist_to_x2', 'dist_to_y1', 'dist_to_y2', 'dist_to_z1', 'dist_to_z2']
# Patients dropped from the training set.
# NOTE(review): the reason for excluding these ids is not recorded here - TODO document.
EXCLUDED_PT_IDS = [1283, 1284]

# --- Load the raw training tables ---
ct_slices_train = read_csv('ct_slices_train.csv')
manual_labels_train = read_csv('manual_labels_train.csv')
patient_overview_train = read_csv('patient_overview_train.csv')

# --- Per-slice features ---
df1 = ct_slices_train.copy()
df1['Total_score'] = df1['pt_id'].map(manual_labels_train.set_index('pt_id')['Total_score'])

# Parse the serialized pixel values once and derive every statistic from the
# parsed list. Counting whitespace-separated tokens of the raw string would
# miscount whenever the values are not separated by spaces.
pixel_values = df1['pixel_values_hu'].apply(ast.literal_eval)
df1['pixel_count'] = pixel_values.apply(len)
df1['total_pixel_count'] = df1.groupby('pt_id')['pixel_count'].transform('sum')
df1['pixel_values_list'] = pixel_values
df1['mean_pixel_value'] = pixel_values.apply(lambda values: sum(values) / len(values))
df1['max_pixel_value'] = pixel_values.apply(max)

convert_to_tuple(df1, 'croped_slice_shape')
convert_to_tuple(df1, 'center_min_circle')

patient_overview_train['ct_numbers_in_heart'] = patient_overview_train['ct_numbers_in_heart'].apply(
    lambda x: list(map(int, ast.literal_eval(x)))
)

# Explicit copy: the filtered frame gets new columns below, and writing to a
# slice of another frame triggers SettingWithCopyWarning.
df1 = df1[~df1['pt_id'].isin(EXCLUDED_PT_IDS)].copy()

# --- Distances from the circle centre to the crop / heart-range bounds ---
# Build the pt_id -> ct_numbers_in_heart lookup once instead of scanning
# patient_overview_train for every slice. drop_duplicates keeps the first row
# per patient, which is the row the per-slice scan would have picked.
heart_slices_by_patient = (
    patient_overview_train
    .drop_duplicates('pt_id')
    .set_index('pt_id')['ct_numbers_in_heart']
    .to_dict()
)

# A pt_id missing from patient_overview_train raises KeyError here.
distance_rows = [
    calculate_distance(shape, center, heart_slices_by_patient[pt_id], ct_number)
    for shape, center, pt_id, ct_number in zip(
        df1['croped_slice_shape'], df1['center_min_circle'], df1['pt_id'], df1['ct_number']
    )
]
# float64 keeps the dtype these columns had when they were pre-filled with NaN.
distances = pd.DataFrame(distance_rows, index=df1.index, columns=DIST_COLUMNS, dtype='float64')
df1 = df1.join(distances)

# --- Assemble per-patient sequences ---
df2 = df1[['pt_id', 'Total_score', 'pixel_count', 'mean_pixel_value', 'max_pixel_value'] + DIST_COLUMNS]

# One 2-D array (slices x features) per patient; groupby iterates in sorted
# pt_id order, which is also the order of Y below.
grouped = {p_id: grp.drop(['pt_id', 'Total_score'], axis=1).values for p_id, grp in df2.groupby('pt_id')}
labels = df2[['pt_id', 'Total_score']].drop_duplicates().set_index('pt_id')['Total_score'].to_dict()

X = list(grouped.values())
# Pad every patient to the longest slice sequence with trailing zero rows.
X_padded = pad_sequences(X, dtype='float32', padding='post')
Y = df2.groupby('pt_id')['Total_score'].first().values

In [None]:
# Collapse every axis after the first into a single feature vector per sample.
X_padded_flatten = X_padded.reshape(len(X_padded), -1)

In [None]:
# Display both shapes side by side to compare their leading dimensions.
(X_padded_flatten.shape, Y.shape)