In [63]:
import pandas as pd
import numpy as np
import os
import sys

# Put the shared helper directory (two levels up, "utils") at the front of the
# module search path so that local helper modules can be imported.
utils_path = os.path.abspath(os.path.join('..', '..', 'utils'))
sys.path.insert(0, utils_path)

from teeth_utils import normalise_teeth_distances

In [64]:
# Display every column and every row of a frame, and switch off the
# chained-assignment (SettingWithCopy) warning.
for option_name in ('display.max_columns', 'display.max_rows', 'mode.chained_assignment'):
    pd.set_option(option_name, None)

In [65]:
# Input files, given relative to the notebook's working directory.
# CSV sheet that is loaded into `meridian_df` with pd.read_csv.
MERIDIAN_PATH = "../../data/life_data/life_sheet_final.csv"
# JSON files with tracked tooth data; both are read with pd.read_json and
# concatenated into `teeth_df`.
TEETH_UP_PATH = "../../data/life_data/teeth_up_tracked_data.json"
TEETH_DOWN_PATH = "../../data/life_data/teeth_down_tracked_data.json"

In [66]:
# Read both tracked-teeth JSON files and stack them (up first, then down) into
# a single frame with a fresh 0..n-1 row index.
teeth_up = pd.read_json(TEETH_UP_PATH)
teeth_down = pd.read_json(TEETH_DOWN_PATH)
teeth_df = pd.concat([teeth_up, teeth_down], ignore_index=True)

In [77]:
# Number of position entries stored for the first tooth record.
len(teeth_df["data"].iloc[0]["toothPositions"])

41

In [67]:
def _with_normalised_positions(record):
    """Return a new tooth record whose positions were passed through normalise_teeth_distances."""
    return {
        'toothId': record['toothId'],
        'toothPositions': normalise_teeth_distances(record['toothPositions']),
    }


# Replace every record in the "data" column with its normalised counterpart.
teeth_df["data"] = teeth_df["data"].apply(_with_normalised_positions)

In [68]:
# Set the positions of teeth that have not grown out to 0.
# Row positions in teeth_df (which carries a 0..n-1 index) of those teeth.
# NOTE(review): presumably the four wisdom teeth — confirm against the data.
NOT_GROWN_OUT_ROWS = (0, 15, 16, 31)


def _zeroed_positions(record):
    """Return a new tooth record with the same id and every position replaced by 0.0."""
    return {
        "toothId": record["toothId"],
        "toothPositions": [0.0] * len(record["toothPositions"]),
    }


# Rebuild the column and assign it back in one step instead of writing through
# chained indexing (teeth_df["data"][i] = ...): a chained write is not
# guaranteed to reach the frame, and never does with pandas Copy-on-Write.
tooth_records = teeth_df["data"].tolist()
for row in NOT_GROWN_OUT_ROWS:
    tooth_records[row] = _zeroed_positions(tooth_records[row])
teeth_df["data"] = tooth_records


In [69]:
# Load the meridian sheet, keep only rows that have both a tooth number and a
# domain, and renumber the remaining rows from 0.
meridian_df = (
    pd.read_csv(MERIDIAN_PATH)
    .dropna(subset=["Tooth number", "Domain"])
    .reset_index(drop=True)
)

# Turn dash-separated tooth numbers (e.g. "14-24-34-44") into lists of ints.
meridian_df["Tooth number"] = meridian_df["Tooth number"].apply(
    lambda spec: [int(part) for part in spec.split("-")]
)

In [70]:
# Preview only the first rows: display.max_rows is None in this notebook, so a
# bare `meridian_df` would render every row of the sheet.
meridian_df.head(10)


Unnamed: 0,Domain,Group,Tooth number,Health condition related to tooth:,Brace 1 / 25 lugl-8 agosto,Brace 2/ 8 agosto -22 agosto,Brace 3/ 22 agosto-5 settembre,Brace 4/ 5 settembre-19 settembre,Brace 5/ 19 sett. 29 sett.,Brace 6/ 29 sett.9 ott.,Brace 7/ 9 ottobre - 19 ottobre,Brace 8/ 19 ottobre - 29 ottobre,Brace 9/ 29 ottobre- 9 novembre,Brace 10/ 9 novembre-19 novembre,Brace 11/ 19 novembre-29 novembre,Brace 12/ 29 novembre-8 dicembre,Brace 13/9 dicembre-19 dicembre,Brace 14 / 20 dec-30 dec,Brace 15/ 31 dec-10 genn,Brace 16/ 11 genn-21 genn,Brace 17/22 genn-1 febb,Brace 18/2 febbr-12 febb,Brace 19/13 febb-23 febb,Brace 20/24 febb-6 marz,Brace 21/ 7 marzo-17 marz,Brace 22/ 18 marz-28 marzo,Brace 23/ 29 marz-8 aprile
0,love,0.0,"[14, 24, 34, 44]",rheumatism,6.0,6.0,8.0,7.0,4.0,3.0,8.0,8.0,8.0,9.0,1.0,5.0,8.0,8.0,8.0,8.0,7.0,6.0,7.0,7.0,8.0,8.0,9.0
1,admiration,0.0,"[14, 24, 34, 44]",rheumatism,6.0,5.0,6.0,6.0,4.0,2.0,8.0,8.0,8.0,8.0,1.0,2.0,8.0,8.0,8.0,6.0,7.0,6.0,5.0,7.0,8.0,8.0,9.0
2,joy,0.0,"[14, 24, 34, 44]",rheumatism,6.0,7.0,9.0,8.0,4.0,2.0,8.0,8.0,8.0,8.0,1.0,4.0,8.0,8.0,8.0,7.0,7.0,6.0,7.0,7.0,8.0,8.0,9.0
3,approval,0.0,"[14, 24, 34, 44]",rheumatism,6.0,3.0,9.0,8.0,4.0,2.0,8.0,8.0,8.0,8.0,1.0,5.0,8.0,8.0,8.0,6.0,5.0,6.0,6.0,7.0,8.0,8.0,9.0
4,caring,0.0,"[41, 31, 11, 21]",,4.0,6.0,7.0,8.0,5.0,2.0,8.0,8.0,8.0,8.0,1.0,4.0,8.0,8.0,8.0,7.0,6.0,6.0,6.0,7.0,8.0,8.0,9.0
5,excitement,0.0,"[48, 38]","energy, headache, dizzyness",1.0,7.0,9.0,8.0,3.0,2.0,8.0,8.0,8.0,8.0,1.0,5.0,8.0,6.0,6.0,7.0,6.0,6.0,7.0,7.0,8.0,8.0,9.0
6,amusement,0.0,"[14, 24, 34, 44]",rheumatism,1.0,8.0,9.0,8.0,3.0,2.0,8.0,8.0,8.0,8.0,1.0,4.0,9.0,8.0,8.0,7.0,6.0,6.0,8.0,7.0,8.0,8.0,9.0
7,gratitude,0.0,"[21, 22, 23, 24, 25, 26, 27, 28]","inner world, past",6.0,8.0,9.0,8.0,3.0,2.0,8.0,8.0,8.0,8.0,1.0,5.0,9.0,7.0,7.0,7.0,6.0,5.0,8.0,7.0,8.0,8.0,9.0
8,desire,0.0,"[14, 24, 34, 44]",rheumatism,2.0,3.0,8.0,5.0,3.0,2.0,8.0,8.0,8.0,7.0,1.0,4.0,7.0,5.0,5.0,8.0,7.0,6.0,8.0,8.0,8.0,8.0,9.0
9,anger,0.0,"[13, 23, 33, 43]","focus, concentration",7.0,2.0,2.0,2.0,7.0,2.0,2.0,2.0,3.0,4.0,8.0,5.0,3.0,4.0,4.0,7.0,6.0,5.0,5.0,3.0,5.0,8.0,3.0


In [71]:
# Create the training dataset.
#
# Each row of train_df corresponds to one row of meridian_df (a domain together
# with its associated teeth); each column corresponds to one brace. Every cell
# holds a pair [x, y]:
#   x - array of shape (n_associated_teeth, 2), filled per tooth from entry
#       `brace_index` of that tooth's "toothPositions"
#   y - the domain score recorded for that brace

n_teeth = teeth_df["data"].shape[0]

# Rows are collected in a list and turned into a frame once at the end.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call.
train_rows = []

for _, row in meridian_df.iterrows():
    # Scores of this domain, one per "Brace ..." column that has a value.
    # NOTE(review): dropna() closes gaps, so after a missing score the later
    # scores are paired with an earlier toothPositions entry; toothPositions
    # also holds more entries than there are brace columns. Confirm that the
    # index-based pairing below is the intended alignment.
    braces_scores = row.filter(like='Brace').dropna()
    n_braces = braces_scores.shape[0]

    # Tooth records whose id is listed for this domain.
    teeth_ids = row["Tooth number"]
    mask = teeth_df["data"].apply(lambda record: record['toothId'] in teeth_ids)
    associated_teeth = teeth_df["data"][mask]
    # The id mask already marks the associated rows, so their positions are
    # taken from it directly rather than re-matching whole records with isin().
    associated_teeth_indexes = np.flatnonzero(mask)

    xys = []
    for brace_index in range(n_braces):
        # Start from the -1.0 placeholder; every row is overwritten below.
        x = np.full((len(associated_teeth_indexes), 2), -1.0)
        y = braces_scores.iloc[brace_index]

        for tooth_index, tooth in enumerate(associated_teeth):
            x[tooth_index] = tooth["toothPositions"][brace_index]
        xys.append([x, y])

    train_rows.append(pd.Series(xys))

train_df = pd.DataFrame(train_rows)

In [72]:
# (rows, columns) of the assembled training frame.
train_df.shape

(98, 23)

In [73]:
# Persist the training frame as output/train_df.pickle, resolved against the
# notebook's working directory.
out_dir = os.path.abspath('./output')
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(out_dir, exist_ok=True)

out_path = os.path.join(out_dir, "train_df.pickle")
train_df.to_pickle(out_path)