In [1]:
import pandas as pd
import numpy as np

import matplotlib.colors as mcolors
from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

## Loading patient data

In [2]:
# Tab-separated file of long-term condition (LTC) events for all patients.
LTC_BINARY = '../data/ltc_events_all_patients_ukbb45840.tsv'

ltc_patients = pd.read_csv(LTC_BINARY, delimiter='\t')

# Report the table dimensions, then preview the first five rows.
print(ltc_patients.shape)
ltc_patients.head(n=5)

(3110220, 10)


Unnamed: 0,patient_id,event_date,read_2,LTC,LTC_abbrev,sex,YOB,age_at_event,count_per_LTC,count_distinct_LTC
0,1000014,09/06/2004,N20..,Polymyalgia Rheumatica,PMR,F,1946,58,1,1
1,1000014,12/07/2011,F451.,Glaucoma,glaucoma,F,1946,65,1,2
2,1000059,27/10/1995,N05z5,Osteoarthritis (excl spine),OA,M,1942,53,1,1
3,1000059,01/09/1998,N05..,Osteoarthritis (excl spine),OA,M,1942,56,2,1
4,1000059,24/05/1999,N05z5,Osteoarthritis (excl spine),OA,M,1942,57,3,1


In [3]:
# Keep only the rows whose count_per_LTC is 1 (i.e. drop the repeated
# events of an LTC) and renumber the remaining rows from 0.
df = ltc_patients.loc[ltc_patients['count_per_LTC'].eq(1)].reset_index(drop=True)
df.shape

(1244053, 10)

## Loading trained weights of diseases

In [4]:
# Load the per-condition weights table and drop its first column.
# NOTE(review): the dropped column is presumably a saved row index - confirm.
df_weights = pd.read_csv('../data/final_x_idf.csv').iloc[:, 1:]
df_weights

Unnamed: 0,MLTC,0,1,2,3,idf,relative weight 0,relative weight 1,relative weight 2,relative weight 3,word occurence in docs
0,ADHD,0.000002,0.000004,0.000008,0.000913,3.298513,0.007428,0.015877,0.028177,3.247033,71
1,BAD,0.000002,0.000004,0.000009,0.013494,2.109351,0.000345,0.000669,0.001452,2.106885,1112
2,BPH,0.000002,0.035334,0.000007,0.000019,1.160299,0.000080,1.159369,0.000222,0.000627,9897
3,CCD,0.000003,0.004828,0.000008,0.000018,2.002031,0.001112,1.990319,0.003201,0.007399,1424
4,CHD,0.000003,0.051749,0.000008,0.022744,0.950506,0.000036,0.660205,0.000102,0.290164,16044
...,...,...,...,...,...,...,...,...,...,...,...
198,varices_portal_hypert,0.000002,0.000004,0.000006,0.004055,2.614267,0.001463,0.002823,0.004155,2.605826,347
199,venous_lymphatic,0.025181,0.010252,0.003071,0.000036,0.905719,0.591779,0.240931,0.072161,0.000848,17787
200,venous_thromb,0.004235,0.010407,0.000013,0.000062,1.427005,0.410622,1.009032,0.001314,0.006038,5355
201,vitb12_deficiency,0.005896,0.000068,0.000014,0.019971,1.454019,0.330379,0.003841,0.000805,1.118994,5032


In [5]:
# Preview the first four rows of the de-duplicated event table.
df.head(n=4)

Unnamed: 0,patient_id,event_date,read_2,LTC,LTC_abbrev,sex,YOB,age_at_event,count_per_LTC,count_distinct_LTC
0,1000014,09/06/2004,N20..,Polymyalgia Rheumatica,PMR,F,1946,58,1,1
1,1000014,12/07/2011,F451.,Glaucoma,glaucoma,F,1946,65,1,2
2,1000059,27/10/1995,N05z5,Osteoarthritis (excl spine),OA,M,1942,53,1,1
3,1000059,01/08/2002,M07z.,Chronic ulcer of the skin,skin_ulcer,M,1942,60,1,2


# Visualizing the first n rows of patient data

In [6]:
# Number of leading rows of `df` used as a small working sample.
n = 72

# Positional slice: the first n rows of the de-duplicated events.
test_1 = df.iloc[:n]
test_1

Unnamed: 0,patient_id,event_date,read_2,LTC,LTC_abbrev,sex,YOB,age_at_event,count_per_LTC,count_distinct_LTC
0,1000014,09/06/2004,N20..,Polymyalgia Rheumatica,PMR,F,1946,58,1,1
1,1000014,12/07/2011,F451.,Glaucoma,glaucoma,F,1946,65,1,2
2,1000059,27/10/1995,N05z5,Osteoarthritis (excl spine),OA,M,1942,53,1,1
3,1000059,01/08/2002,M07z.,Chronic ulcer of the skin,skin_ulcer,M,1942,60,1,2
4,1000059,01/08/2002,M07z.,Dermatitis (atopc/contact/other/unspecified),dermatitis,M,1942,60,1,3
...,...,...,...,...,...,...,...,...,...,...
67,1000296,01/01/1972,H33..,Asthma,asthma,F,1949,23,1,1
68,1000296,11/08/1977,1AZ2.,Female infertility,female_infertility,F,1949,28,1,2
69,1000296,25/09/1990,K5920,Menorrhagia and polymenorrhoea,menorrhagia,F,1949,41,1,3
70,1000296,20/11/2002,N05z7,Osteoarthritis (excl spine),OA,F,1949,53,1,4


In [6]:
# Divides each patient's history into 3 checkpoints (first condition, first
# half of the history, full history) and models the cluster trajectory
# across them.

# Columns of df_weights holding the per-cluster relative weights.
WEIGHT_COLS = ['relative weight 0', 'relative weight 1', 'relative weight 2', 'relative weight 3']


def _weights_for(conditions):
    """Return the relative-weight rows of `df_weights` for `conditions`, in order.

    A condition with no matching `MLTC` row contributes no rows. When nothing
    matches (or `conditions` is empty) an empty frame is returned.
    """
    frames = [df_weights.loc[df_weights['MLTC'] == dis, WEIGHT_COLS] for dis in conditions]
    frames = [rows for rows in frames if not rows.empty]
    # pd.concat raises ValueError on an empty list, so fall back explicitly.
    return pd.concat(frames) if frames else pd.DataFrame(columns=WEIGHT_COLS)


def _dominant_cluster(weights):
    """Return the weight column with the largest column sum, or None if `weights` has no rows."""
    return None if weights.empty else weights.sum().idxmax()


for name, group in test_1.groupby('patient_id'):
    conditions = list(group['LTC_abbrev'])
    lng = len(conditions)
    fifty = lng // 2

    print('Patient ID: ',name)
    print('Total count of MLTC\'s: ',lng, '\n')
    print(conditions)

    half_weights = _weights_for(conditions[:fifty])
    full_weights = _weights_for(conditions)

    # The first cluster is read from the first weighted condition of the FULL
    # history. Reading it from the first-half window raised IndexError for
    # patients with a single condition, because that window is empty
    # (1 // 2 == 0). For longer histories the first row is the same.
    first_cluster = None if full_weights.empty else full_weights.iloc[0].idxmax()
    # None means "no weighted condition in this window".
    half_cluster = _dominant_cluster(half_weights)
    full_cluster = _dominant_cluster(full_weights)

    print('First Cluster: ', first_cluster)
    print('Cluster after 50 perc.(', fifty ,') of MLTC\'s: ', half_cluster)
    print('Cluster after full history of MLTC\'s: ', full_cluster, '\n')

    print(first_cluster ,'----------->',half_cluster,'----------->', full_cluster,'\n')
    print('-------------------------------------------------------------------------------------------')

Patient ID:  1000014
Total count of MLTC's:  2 

['PMR', 'glaucoma']
First Cluster:  relative weight 0
Cluster after 50 perc.( 1 ) of MLTC's:  relative weight 0
Cluster after full history of MLTC's:  relative weight 1 

relative weight 0 -----------> relative weight 0 -----------> relative weight 1 

-------------------------------------------------------------------------------------------
Patient ID:  1000059
Total count of MLTC's:  5 

['OA', 'skin_ulcer', 'dermatitis', 'erectile_dysfunction', 'pri_skin']
First Cluster:  relative weight 1
Cluster after 50 perc.( 2 ) of MLTC's:  relative weight 0
Cluster after full history of MLTC's:  relative weight 0 

relative weight 1 -----------> relative weight 0 -----------> relative weight 0 

-------------------------------------------------------------------------------------------
Patient ID:  1000062
Total count of MLTC's:  7 

['spondylosis', 'obesity', 'urine_incont', 'female_genital_prolapse', 'type_2_diabetes', 'unspecified_rare_diabe

### Add Custom patient history

In [8]:
# Custom patient history: an ordered list of condition abbreviations that
# are looked up in the `MLTC` column of df_weights.
hist = ['OA', 'skin_ulcer', 'dermatitis', 'erectile_dysfunction', 'pri_skin']

# Columns of df_weights holding the per-cluster relative weights.
WEIGHT_COLS = ['relative weight 0', 'relative weight 1', 'relative weight 2', 'relative weight 3']

lng = len(hist)
fifty = lng // 2


print('Total count of MLTC\'s: ',lng, '\n')
print(hist)

# Look every condition up once; the first-half window and the full history
# both reuse these per-condition frames instead of repeating the lookups.
per_condition = [df_weights.loc[df_weights['MLTC'] == dis, WEIGHT_COLS] for dis in hist]
half_rows = [rows for rows in per_condition[:fifty] if not rows.empty]
full_rows = [rows for rows in per_condition if not rows.empty]

# Each checkpoint is None when its window holds no weighted condition. The
# first cluster is taken from the full history so that a one-element `hist`
# (empty first-half window) no longer raises IndexError.
a = pd.concat(full_rows).iloc[0].idxmax() if full_rows else None
b = pd.concat(half_rows).sum().idxmax() if half_rows else None
c = pd.concat(full_rows).sum().idxmax() if full_rows else None

print('First Cluster: ', a)
print('Cluster after 50 perc.(', fifty ,') of MLTC\'s: ', b)
print('Cluster after full history of MLTC\'s: ', c, '\n')


print(a ,'----------->',b,'----------->', c,'\n')
print('-------------------------------------------------------------------------------------------')

Total count of MLTC's:  5 

['OA', 'skin_ulcer', 'dermatitis', 'erectile_dysfunction', 'pri_skin']
First Cluster:  relative weight 1
Cluster after 50 perc.( 2 ) of MLTC's:  relative weight 0
Cluster after full history of MLTC's:  relative weight 0 

relative weight 1 -----------> relative weight 0 -----------> relative weight 0 

-------------------------------------------------------------------------------------------


### Cluster progression after each MLTC

In [16]:
# Models the trajectory after each disease: the dominant cluster is
# re-evaluated every time one more condition is added to the history.

# Alternative example histories:
# hist = ['OA', 'skin_ulcer', 'dermatitis', 'erectile_dysfunction', 'pri_skin']
hist = ['OA', 'skin_ulcer', 'dermatitis']
# hist  = ['spondylosis', 'obesity', 'urine_incont', 'female_genital_prolapse', 'type_2_diabetes', 'unspecified_rare_diabetes', 'fracture_hip']

# Columns of df_weights holding the per-cluster relative weights, and the
# label printed for each of them.
WEIGHT_COLS = ['relative weight 0', 'relative weight 1', 'relative weight 2', 'relative weight 3']
CLUSTER_LABELS = {
    'relative weight 0': 'Cluster 0',
    'relative weight 1': 'Cluster 1',
    'relative weight 2': 'Cluster 2',
    'relative weight 3': 'Cluster 3',
}
# Printed when no condition seen so far has a row in df_weights.
NO_CLUSTER = 'N/A (no weights found)'

print('Total count of LTC\'s: ',len(hist))
print(hist, '\n\n')
print('PATIENT PROGRESSION\n')

# Weight rows of the conditions seen so far that have a match in df_weights.
matched_rows = []
for i, dis in enumerate(hist):
    rows = df_weights.loc[df_weights['MLTC'] == dis, WEIGHT_COLS]
    if not rows.empty:
        matched_rows.append(rows)

    if not matched_rows:
        # Previously an unknown first condition crashed on `.iloc[0]`.
        cluster_name = NO_CLUSTER
    else:
        weights_so_far = pd.concat(matched_rows)
        # The first step uses the first weight row itself; later steps use
        # the column sums over everything accumulated so far.
        scores = weights_so_far.iloc[0] if i == 0 else weights_so_far.sum()
        cluster_name = CLUSTER_LABELS[scores.idxmax()]

    if i == 0:
        print('First Cluster: ', cluster_name)
    else:
        print('Order', i+1 ,': ', cluster_name)

print('-------------------------------------------------------------------------------------------')

Total count of LTC's:  3
['OA', 'skin_ulcer', 'dermatitis'] 


PATIENT PROGRESSION

First Cluster:  Cluster 1
Order 2 :  Cluster 0
Order 3 :  Cluster 0
-------------------------------------------------------------------------------------------


In [141]:
# Disabled scratch cell: every statement below is commented out, so running
# this cell does nothing. When enabled, it prints the first two LTC_abbrev
# values of each patient group in test_1.
# NOTE(review): the saved output under this cell appears to come from an
# earlier run made while the code was still active - confirm, then clear the
# output or remove the cell.
# temp_list = []
# for name, group in test_1.groupby('patient_id'):
#     for i in range(len(group.head(2).reset_index()['LTC_abbrev'])):
#         print((group.head(2).reset_index()['LTC_abbrev'][i]))
#     # for i in group.head(3).iterrows():
#     #     print(i.index.values)
# # temp_list

PMR
glaucoma
OA
skin_ulcer
