In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as po
import plotly.graph_objs as go
from plotly import tools
import load
import descriptions
import gensim
po.init_notebook_mode(connected=True)

## Patient pathway

In [2]:
#this was made from the cancerdata_EDA.ipynb
patient_pathways = pd.read_pickle('patient_pathways.pkl')

#remove irrelevant columns
patient_pathways = patient_pathways.drop(columns=['PATIENTID',
                                                  'MERGED_PATIENT_ID_x',
                                                  'MERGED_TUMOUR_ID_x',
                                                  'MERGED_REGIMEN_ID_x',
                                                  'MERGED_PATIENT_ID_y',
                                                  'MERGED_OUTCOME_ID',
                                                  'MERGED_PATIENT_ID_y',
                                                  'MERGED_TUMOUR_ID_y',
                                                  'MERGED_CYCLE_ID',
                                                  'MERGED_DRUG_DETAIL_ID',
                                                  'MERGED_PATIENT_ID',
                                                  'MERGED_REGIMEN_ID_y',
                                                  'LINKNUMBER'])

print("shape of patient_pathways ",patient_pathways.shape)

shape of patient_pathways  (2666148, 37)


# Assigning events

use the columns 'PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','CYCLE_NUMBER','DRUG_GROUP' for features of an event

find all unique events and label that event by a number

In [3]:
#all these operations are done to get a dataframe of the format displayed below
#which has all the events
pp_df = patient_pathways[['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','CYCLE_NUMBER','DRUG_GROUP']]
pp_df['PRIMARY_DIAGNOSIS']=pp_df['PRIMARY_DIAGNOSIS'].apply(lambda x: str(x)[:3])
pp_df = pp_df.drop(columns=['CYCLE_NUMBER'])
top20 = pp_df['PRIMARY_DIAGNOSIS'].value_counts()[:20].keys()
pp_df = pp_df.loc[pp_df['PRIMARY_DIAGNOSIS'].isin(top20)]
events_df = pp_df.drop_duplicates()

# some comments to understand code here would be useful
pp_df = pp_df.reset_index()
events_df = events_df.reset_index()
events_df = events_df.drop(columns=['PATIENTID'])
events_df['EVENT'] = events_df.index
events_df['EVENT'] = events_df['EVENT'].astype(str)
pp_event_df = pd.merge(pp_df,events_df,on=['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','DRUG_GROUP'])
pp_event_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,PATIENTID,PRIMARY_DIAGNOSIS,BENCHMARK_GROUP,DRUG_GROUP,EVENT
0,10000283,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
1,10289804,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
2,10289804,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
3,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO,1
4,10151044,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO,1


In [4]:
pp_event_df.shape,events_df.shape,pp_df.shape

((2215580, 5), (91733, 4), (2215580, 4))

there are 91733 events, labelled as numbers from 0 to 91733

## word2vec

In [5]:
sequences = pp_event_df.groupby(['PATIENTID','PRIMARY_DIAGNOSIS'])['EVENT'].agg(lambda x: list(x))

In [21]:
sequences[3] #example sequence of events

['15',
 '15',
 '15',
 '15',
 '16',
 '16',
 '16',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '22',
 '23',
 '24',
 '24',
 '25',
 '26',
 '27',
 '28']

In [6]:
#converts all events to vectors
model = gensim.models.Word2Vec(sentences = list(sequences), size=100, window=5, workers =4)

In [7]:
model['10'] #the vector for event 10


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



array([-0.6349146 ,  1.4527951 , -0.52461696, -0.9683653 , -1.6050581 ,
       -0.7216421 ,  1.7144918 ,  2.1986208 ,  0.6096469 , -1.754399  ,
        0.16292363, -0.7134382 ,  0.18598525, -0.6202344 ,  0.49318978,
        0.66922635, -1.2863034 , -0.00446886,  2.1931973 , -0.9273    ,
       -2.573157  , -0.69492185,  2.3083227 , -0.04286405, -1.795769  ,
        3.091147  ,  2.4440084 , -0.39755625, -1.2112429 , -0.07731058,
       -1.5257987 , -0.1602935 ,  0.85970867,  2.0228639 ,  0.82392985,
       -1.7847369 , -1.0129341 , -0.02323162,  1.4103146 ,  0.8664886 ,
        1.2214876 ,  0.85917133,  1.6136327 ,  0.249656  , -0.2766041 ,
       -1.098001  , -0.1523043 ,  0.15152632,  0.01353143,  1.8275948 ,
        1.4243797 ,  1.1307838 ,  1.8390946 ,  0.02115394, -0.72782964,
        0.29911262, -0.27320588,  1.9106115 ,  1.2415824 ,  1.3297783 ,
       -1.1364828 , -1.0239254 ,  0.8122054 ,  0.5937442 ,  0.08182777,
       -1.10098   ,  1.0577959 , -1.5940266 ,  0.39835715,  0.86

In [8]:
#word2vec only includes events that appeared more than 5 times
#these dataframes only include those events
event5_df = pd.DataFrame(list(model.wv.vocab.keys()), columns=['EVENT'])
pp_event5_df = pd.merge(pp_event_df[['PRIMARY_DIAGNOSIS','EVENT']],event5_df, on='EVENT')
pp_event5_df = pp_event5_df.drop_duplicates()

## Dimension reduction

Use PCA to reduce the size of the vectors and project onto an optimal 3D dimension where clusters of individual events may be identified

In [9]:
from sklearn.decomposition import PCA

In [10]:
X = model[model.wv.vocab]
pca = PCA(n_components=3)
result = pca.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [11]:
pp_event5_df['x'] = result[:, 0]
pp_event5_df['y'] = result[:, 1]
pp_event5_df['z'] = result[:, 2]

In [12]:
uniqcancer = pp_event5_df['PRIMARY_DIAGNOSIS'].unique()

In [13]:

data=[]
color = np.random.seed(seed=20)
desc = [d for d in descriptions.get_descriptions(uniqcancer, 'icd')]
for cancer in uniqcancer:
    color=np.random.randint(255, size=(1, 3))[0]
    x = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['x']
    y = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['y']
    z = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['z']
    desc = descriptions.get_descriptions(cancer, 'icd')[0]
    trace = go.Scatter3d( x = x,
                        y = y,
                        z = z,
                        mode = 'markers',
                        name = desc,
                        marker = dict(size = 3,
                                      color = 'rgb({}, {}, {})'.format(*color)) )
    data.append(trace)

layout = dict(title = 'Visualisation of events',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)

Double click on one of the codes in the legend to look at a specific cancer

the events corresponding to breast and lung cancer seem to be distinctly separated in this 3D space

### patient pathway visualization

Use word2vec with vector size = 3

Add up all the events for each sequence and plot the pathway

In [14]:
model2d = gensim.models.Word2Vec(sentences = list(sequences), size=3, window=5, workers =4,min_count=1)

In [16]:
data=[]

#color code by cancer type
colors = np.random.randint(255, size=(20, 3))
cancercolor = {cancer:color for cancer,color in zip(uniqcancer,colors)}
cancers = sequences.index.get_level_values(1)

npatients = 1000

for s,c in zip(sequences[:npatients], cancers[:npatients]):
    pathway = []
    coord = np.zeros(3)
    
    
    for i in range(len(s)):
        coord += model2d[s[i]]
        
        pathway.append(list(coord))
    p = np.array(pathway)
    
    col = cancercolor[c]
    
    trace = go.Scatter3d( x = p[:,0],
                        y = p[:,1],
                        z = p[:,2],
                        mode = 'lines',
                        name = c,
                        line=dict(width=3,color='rgb({}, {}, {})'.format(*col)) )
    data.append(trace)

layout = dict(title = 'Visualisation of pathways',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



The pathways look like they're clustered well according to the type of cancer, which means a clustering algorithm should work well.