In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as po
import plotly.graph_objs as go
from plotly import tools
import load
import descriptions
import gensim
po.init_notebook_mode(connected=True)

## Patient pathway

In [5]:
#this was made from the cancerdata_EDA.ipynb
#NOTE: unpickling can execute arbitrary code, so only load a
#patient_pathways.pkl that was written by that trusted notebook
patient_pathways = pd.read_pickle('patient_pathways.pkl')

#columns that are removed before events are built
#(every column name is listed exactly once)
irrelevant_columns = ['PATIENTID',
                      'MERGED_PATIENT_ID_x',
                      'MERGED_TUMOUR_ID_x',
                      'MERGED_REGIMEN_ID_x',
                      'MERGED_PATIENT_ID_y',
                      'MERGED_OUTCOME_ID',
                      'MERGED_TUMOUR_ID_y',
                      'MERGED_CYCLE_ID',
                      'MERGED_DRUG_DETAIL_ID',
                      'MERGED_PATIENT_ID',
                      'MERGED_REGIMEN_ID_y',
                      'LINKNUMBER']

#remove irrelevant columns
patient_pathways = patient_pathways.drop(columns=irrelevant_columns)

print("shape of patient_pathways ",patient_pathways.shape)

shape of patient_pathways  (2666148, 37)


# Assigning events

Use the columns `'PRIMARY_DIAGNOSIS'`, `'BENCHMARK_GROUP'` and `'DRUG_GROUP'` as the features of an event (`'CYCLE_NUMBER'` was also considered, but it is not used in the code below).

find all unique events and label that event by a number

In [6]:
#all these operations are done to get a dataframe of the format displayed below
#which has all the events

#columns whose unique combinations define an event
event_columns = ['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','DRUG_GROUP']

#choose some basic columns of patient pathways; the explicit copy makes
#pp_df an independent frame, so the column assignment below no longer
#raises SettingWithCopyWarning
pp_df = patient_pathways[event_columns].copy()

#only 3 character for icd10 code
pp_df['PRIMARY_DIAGNOSIS'] = pp_df['PRIMARY_DIAGNOSIS'].apply(lambda code: str(code)[:3])

#only include top20 frequent cancers
#(value_counts is sorted by descending frequency)
top20 = pp_df['PRIMARY_DIAGNOSIS'].value_counts().index[:20]
pp_df = pp_df.loc[pp_df['PRIMARY_DIAGNOSIS'].isin(top20)]

#get all unique rows (unique events); PATIENTID is currently the index,
#it is discarded here so that the fresh 0,1,2,... index can label the events
events_df = pp_df.drop_duplicates().reset_index(drop=True)

#label each unique event as '0','1','2','3','4',...
#(stored as strings because the labels are later used as word2vec tokens)
events_df['EVENT'] = events_df.index
events_df['EVENT'] = events_df['EVENT'].astype(str)

#make patientid into column
pp_df = pp_df.reset_index()

#merge pp_df and events_df to assign each row in pp_df an
#event label from events_df, this way of merging is so that
#the row order in pp_df remains the same, which we want because
#we want the sequence of events to be in the right order
pp_event_df = pd.merge(pp_df, events_df, how='left', on=event_columns)

pp_event_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,PATIENTID,PRIMARY_DIAGNOSIS,BENCHMARK_GROUP,DRUG_GROUP,EVENT
0,10000283,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
1,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO,1
2,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,ABIRATERONE,2
3,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,RITUXIMAB,3
4,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,CYCLOPHOSPHAMIDE,4


In [7]:
#shapes of the labelled pathways, the unique events and the filtered pathways
tuple(frame.shape for frame in (pp_event_df, events_df, pp_df))

((2215580, 5), (91733, 4), (2215580, 4))

There are 91733 unique events, each a unique combination of `'PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','DRUG_GROUP'`, labelled with the numbers 0 to 91732.

## word2vec

In [10]:
#Collect, for every (patient, diagnosis) pair, the list of event labels in
#row order; PRIMARY_DIAGNOSIS is kept in the index for colour coding
sequences = (
    pp_event_df
    .groupby(['PATIENTID','PRIMARY_DIAGNOSIS'])['EVENT']
    .agg(list)
)

In [16]:
#examples of sequences of events
#.iloc selects by position; the index is (PATIENTID, PRIMARY_DIAGNOSIS),
#so a bare integer key would be ambiguous between label and position
print(sequences.iloc[9])
print(sequences.iloc[3])

['10', '9', '43', '44', '45', '45', '9', '9']
['15', '15', '16', '15', '17', '16', '18', '15', '19', '16', '16', '20', '21', '22', '23', '24', '24', '25', '22', '26', '27', '28']


In [17]:
#train word2vec on the event sequences so that every event label
#is represented by a 100-dimensional vector
model = gensim.models.Word2Vec(
    sentences=sequences.tolist(),
    size=100,
    window=5,
    workers=4,
)

In [18]:
#the vector for event 10, read from the keyed vectors (model.wv);
#indexing the model object directly is deprecated
model.wv['10']


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



array([ 1.13413823e+00,  4.57815565e-02,  6.07052267e-01,  1.41900325e+00,
        1.67970157e+00, -2.45227289e+00, -6.22312069e-01,  1.67265654e-01,
        1.27980828e+00, -3.33334476e-01,  1.39579451e+00,  9.05743718e-01,
        9.39975321e-01,  2.09922123e+00, -1.66812789e+00, -9.15059626e-01,
       -6.50744975e-01, -1.28156066e+00,  1.41370189e+00,  1.70315444e+00,
        1.23102176e+00,  2.57500619e-01,  3.87859643e-01, -1.62180877e+00,
       -1.28323901e+00, -8.73741448e-01,  8.00156713e-01,  9.75477636e-01,
        1.78240943e+00,  2.52466500e-01, -1.40482724e+00, -1.06742096e+00,
        2.92635411e-01,  1.23804557e+00, -1.21151090e+00, -2.56061405e-01,
        1.99553120e+00,  7.60461837e-02, -4.11325902e-01, -1.24247551e+00,
       -1.30183256e+00,  6.45274371e-02, -1.91065609e+00, -8.39364290e-01,
       -7.86997020e-01,  2.38226041e-01,  5.67539573e-01,  9.48587025e-04,
        1.10864973e+00,  2.85510659e+00, -1.15664530e+00, -3.61104876e-01,
       -1.88087076e-01,  

In [19]:
#word2vec keeps only the events frequent enough to enter its vocabulary
#(events below gensim's min_count threshold, 5 by default, are dropped);
#these dataframes only include the events that were kept
event5_df = pd.DataFrame({'EVENT': list(model.wv.vocab)})

#one row per (diagnosis, event) pair for the events in the vocabulary
pp_event5_df = (
    pd.merge(pp_event_df[['PRIMARY_DIAGNOSIS','EVENT']], event5_df, on='EVENT')
    .drop_duplicates()
)

## Dimension reduction

Use PCA to reduce the size of the vectors and project them onto a 3D space where clusters of individual events may be identified.

In [28]:
from sklearn.decomposition import PCA

In [21]:
#stack the vectors of all vocabulary events, one row per event in
#vocabulary order; read them through model.wv because indexing the
#model object directly is deprecated
X = model.wv[list(model.wv.vocab)]

#project the event vectors onto their first 3 principal components
pca = PCA(n_components=3)
result = pca.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [22]:
#the rows of `result` follow the word2vec vocabulary order, which is not
#guaranteed to match the row order of pp_event5_df, so the coordinates are
#looked up by EVENT label instead of being assigned positionally
event_coords = pd.DataFrame(result, index=list(model.wv.vocab), columns=['x','y','z'])
aligned_coords = event_coords.reindex(pp_event5_df['EVENT'].values)

for axis in ['x','y','z']:
    #.values drops the EVENT index so the assignment is row-by-row
    pp_event5_df[axis] = aligned_coords[axis].values

In [23]:
#distinct diagnosis codes, in order of first appearance
uniqcancer = pd.unique(pp_event5_df['PRIMARY_DIAGNOSIS'])

In [24]:
#plot the events, colour coded by diagnosis

#fix the seed so that every run assigns the same colour to each diagnosis
#(np.random.seed returns None, so its result is not kept)
np.random.seed(seed=20)

data=[]
for cancer in uniqcancer:
    #one random RGB triple per diagnosis
    color = np.random.randint(255, size=(1, 3))[0]

    #select the events of this diagnosis once and reuse them for all axes
    cancer_events = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]

    #legend entry: first description returned for this icd code
    desc = descriptions.get_descriptions(cancer, 'icd')[0]

    trace = go.Scatter3d( x = cancer_events['x'],
                        y = cancer_events['y'],
                        z = cancer_events['z'],
                        mode = 'markers',
                        name = desc,
                        marker = dict(size = 3,
                                      color = 'rgb({}, {}, {})'.format(*color)) )
    data.append(trace)

#NOTE(review): Scatter3d axes are normally configured under layout['scene'];
#these top-level xaxis/yaxis settings presumably do not affect the 3D plot - confirm
layout = dict(title = 'Visualisation of events',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)

Double click on one of the codes in the legend to look at a specific cancer

the events corresponding to breast and lung cancer seem to be distinctly separated in this 3D space

### patient pathway visualization

Use word2vec with vector size = 3

Cumulatively add up the vectors of the events in each sequence and plot the resulting path as the patient's pathway.

In [26]:
#word2vec with 3-dimensional vectors so that events can be plotted directly;
#min_count=1 keeps every event, so each event in a sequence has a vector
model2d = gensim.models.Word2Vec(
    sentences=sequences.tolist(),
    size=3,
    window=5,
    workers=4,
    min_count=1,
)

In [27]:
data=[]

#color code by cancer type: one random RGB triple per diagnosis
#(sized from uniqcancer instead of a hard-coded 20)
colors = np.random.randint(255, size=(len(uniqcancer), 3))
cancercolor = dict(zip(uniqcancer, colors))

#diagnosis of every sequence (second level of the sequences index)
cancers = sequences.index.get_level_values(1)

#number of pathways to draw
npatients = 1000

for s,c in zip(sequences.iloc[:npatients], cancers[:npatients]):
    #running sum of the event vectors: row i is the position reached after
    #event i; model2d.wv replaces the deprecated model2d[...] lookup and the
    #sum is accumulated in float64
    p = np.cumsum(model2d.wv[s], axis=0, dtype=np.float64)

    col = cancercolor[c]

    trace = go.Scatter3d( x = p[:,0],
                        y = p[:,1],
                        z = p[:,2],
                        mode = 'lines',
                        name = c,
                        line=dict(width=3,color='rgb({}, {}, {})'.format(*col)) )
    data.append(trace)

layout = dict(title = 'Visualisation of pathways',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



The pathways look like they're clustered well according to the type of cancer, which means a clustering algorithm should work well.