In [47]:
import numpy as np 
import pandas as pd 
import ast

In [10]:
# Load the per-patient table from the project's data folder (relative path).
patients_df = pd.read_csv(filepath_or_buffer="./data/all_patients.csv")
# Preview the first rows to confirm the columns that were read.
patients_df.head()

Unnamed: 0,patient id,timepoint,telo data,telo means,Q1,Q2-3,Q4
0,1,1 non irrad,"[79.18994405924711, 58.07204491719145, 95.0279...",84.796483,1195,2225,1180
1,1,2 irrad @ 4 Gy,"[149.93296075217452, 138.31843562348496, 106.6...",90.975826,724,2350,1526
2,1,3 B,"[176.32960877192357, 111.92066838585988, 123.5...",116.779989,231,1457,2912
3,1,4 C,"[144.65363114822472, 84.46927366319692, 78.133...",99.346299,372,2241,1987
4,2,1 non irrad,"[95.38700619112329, 187.82577704647716, 53.102...",119.773675,1166,2270,1164


In [17]:
# Reshape the three quartile-count columns (Q1, Q2-3, Q4) from wide to long:
# every other column is kept as an identifier, the quartile label goes into
# 'relative Q' and its count into 'Q freq counts' (stored as float64).
quartile_cols = ('Q1', 'Q2-3', 'Q4')
melted_pdf = (
    patients_df
    .melt(
        id_vars=[name for name in patients_df.columns if name not in quartile_cols],
        var_name='relative Q',
        value_name='Q freq counts',
        ignore_index=True,
    )
    .astype({"Q freq counts": 'float64'})
)
melted_pdf.head()

Unnamed: 0,patient id,timepoint,telo data,telo means,relative Q,Q freq counts
0,1,1 non irrad,"[79.18994405924711, 58.07204491719145, 95.0279...",84.796483,Q1,1195.0
1,1,2 irrad @ 4 Gy,"[149.93296075217452, 138.31843562348496, 106.6...",90.975826,Q1,724.0
2,1,3 B,"[176.32960877192357, 111.92066838585988, 123.5...",116.779989,Q1,231.0
3,1,4 C,"[144.65363114822472, 84.46927366319692, 78.133...",99.346299,Q1,372.0
4,2,1 non irrad,"[95.38700619112329, 187.82577704647716, 53.102...",119.773675,Q1,1166.0


In [26]:
# Wide table of 'telo means': one row per 'patient id', one column per 'timepoint'.
pivot_TML_df = (
    patients_df
    .pivot(index='patient id', columns='timepoint', values='telo means')
    .drop(index=13)  # patient id 13 is removed because its row contains NaN
)
pivot_TML_df.head()

timepoint,1 non irrad,2 irrad @ 4 Gy,3 B,4 C
patient id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,84.796483,90.975826,116.779989,99.346299
2,119.773675,133.199443,159.827558,108.915327
3,83.350928,87.295453,101.432564,95.669501
5,85.506106,113.09598,118.340459,97.83219
6,81.57797,86.403786,96.898929,130.11894


In [52]:
# Each 'telo data' cell holds a list of individual telomere values serialized as
# a string. Parse every cell back into a Python list (literal_eval only accepts
# literals, so no arbitrary code is executed).
telo_lists = patients_df['telo data'].apply(ast.literal_eval)

# "Explode" the lists to the right: one column per telomere position, one row per
# original row, keeping the original index so the frame can be re-joined to
# patients_df. Building the frame with a single constructor call avoids
# `.apply(pd.Series)`, which creates one Series object per row; shorter lists are
# padded with NaN in both approaches.
explode_telos_raw = pd.DataFrame(telo_lists.tolist(), index=telo_lists.index)
print(explode_telos_raw.shape)
explode_telos_raw.head(4)

(59, 4600)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4590,4591,4592,4593,4594,4595,4596,4597,4598,4599
0,79.189944,58.072045,95.027933,106.642458,55.960334,62.296089,81.301676,109.810056,123.536313,181.608938,...,71.798883,122.480447,50.681564,81.301676,40.122905,115.088234,114.032379,59.128492,63.351322,46.457636
1,149.932961,138.318436,106.642458,101.363128,101.363128,80.24581,111.921788,127.758499,102.418994,57.01676,...,47.513966,92.916201,84.469274,89.748603,47.513966,102.418994,117.201117,107.697247,92.916201,71.798883
2,176.329609,111.920668,123.535077,177.385475,117.201117,139.374302,99.251397,51.736913,145.70804,154.156424,...,127.759776,66.519553,92.916201,48.569832,145.709497,114.033519,156.268156,137.26257,141.486033,182.664804
3,144.653631,84.469274,78.133297,139.372908,98.195531,53.849162,68.631285,62.296089,76.022346,125.578131,...,112.977654,109.808958,111.920668,127.759776,105.586592,100.307262,128.814354,143.597765,157.664571,115.089385


In [85]:
# Step 1: attach the patient metadata to the exploded telomere columns by row
# index, then discard the columns that are not needed in the long table.
telos_wide = pd.merge(
    explode_telos_raw,
    patients_df,
    left_index=True,
    right_index=True,
).drop(columns=['telo data', 'Q1', 'Q2-3', 'Q4'])

# Step 2: wide -> long, one row per individual telomere measurement; the
# identifier columns are repeated on every row.
telos_long = telos_wide.melt(
    id_vars=['patient id', 'timepoint', 'telo means'],
    value_name="individual telomeres",
)

# Step 3: the melt-generated 'variable' column (source column label) is not
# needed, and rows containing NaN are removed.
telos = telos_long.drop(columns="variable").dropna()
telos.head()

Unnamed: 0,patient id,timepoint,telo means,individual telomeres
0,1,1 non irrad,84.796483,79.189944
1,1,2 irrad @ 4 Gy,90.975826,149.932961
2,1,3 B,116.779989,176.329609
3,1,4 C,99.346299,144.653631
4,2,1 non irrad,119.773675,95.387006


In [86]:
# Persist the long-format telomere table; the row index is not written out.
telos.to_csv(path_or_buf='./data/telos.csv', index=False)
