In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as po
import plotly.graph_objs as go
from plotly import tools
import load
import descriptions
import gensim
po.init_notebook_mode(connected=True)

## Patient pathway

In [68]:
#this was made from the cancerdata_EDA.ipynb
patient_pathways = pd.read_pickle('patient_pathways.pkl')

#remove irrelevant columns
patient_pathways = patient_pathways.drop(columns=['PATIENTID',
                                                  'MERGED_PATIENT_ID_x',
                                                  'MERGED_TUMOUR_ID_x',
                                                  'MERGED_REGIMEN_ID_x',
                                                  'MERGED_PATIENT_ID_y',
                                                  'MERGED_OUTCOME_ID',
                                                  'MERGED_PATIENT_ID_y',
                                                  'MERGED_TUMOUR_ID_y',
                                                  'MERGED_CYCLE_ID',
                                                  'MERGED_DRUG_DETAIL_ID',
                                                  'MERGED_PATIENT_ID',
                                                  'MERGED_REGIMEN_ID_y',
                                                  'LINKNUMBER'])

print("shape of patient_pathways ",patient_pathways.shape)

shape of patient_pathways  (2666148, 46)


# Assigning events

use the columns 'PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','CYCLE_NUMBER','DRUG_GROUP' for features of an event

find all unique events and label that event by a number

In [92]:
#all these operations are done to get a dataframe of the format displayed below
#which has all the events
pp_df = patient_pathways[['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','CYCLE_NUMBER','DRUG_GROUP']]
pp_df['PRIMARY_DIAGNOSIS']=pp_df['PRIMARY_DIAGNOSIS'].apply(lambda x: str(x)[:3])
pp_df = pp_df.drop(columns=['CYCLE_NUMBER'])
top20 = pp_df['PRIMARY_DIAGNOSIS'].value_counts()[:20].keys()
pp_df = pp_df.loc[pp_df['PRIMARY_DIAGNOSIS'].isin(top20)]
events_df = pp_df.drop_duplicates()

# some comments to understand code here would be useful
pp_df = pp_df.reset_index()
events_df = events_df.reset_index()
events_df = events_df.drop(columns=['PATIENTID'])
events_df['EVENT'] = events_df.index
events_df['EVENT'] = events_df['EVENT'].astype(str)

# add medical event number to pp_df (a patient can have the same event number more than once)
pp_event_df = pd.merge(pp_df, events_df, 
                       how='left', # preserve order of pp_df
                       on=['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','DRUG_GROUP'])
pp_event_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,PATIENTID,PRIMARY_DIAGNOSIS,BENCHMARK_GROUP,DRUG_GROUP,EVENT
0,10000283,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
1,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO,1
2,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,ABIRATERONE,2
3,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,RITUXIMAB,3
4,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,CYCLOPHOSPHAMIDE,4


In [87]:
pp_df.head()

Unnamed: 0,PATIENTID,PRIMARY_DIAGNOSIS,BENCHMARK_GROUP,DRUG_GROUP
0,10000283,C61,ZOLEDRONIC ACID,CHLORAMBUCIL
1,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO
2,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,ABIRATERONE
3,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,RITUXIMAB
4,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,CYCLOPHOSPHAMIDE


In [93]:
pp_event_df.shape,events_df.shape,pp_df.shape

((2215580, 5), (91733, 4), (2215580, 4))

there are 91733 events, labelled as numbers from 0 to 91733

## word2vec

In [94]:
sequences = pp_event_df.groupby(['PATIENTID','PRIMARY_DIAGNOSIS'])['EVENT'].agg(lambda x: list(x))

In [95]:
sequences[3] #example sequence of events

['15',
 '15',
 '16',
 '15',
 '17',
 '16',
 '18',
 '15',
 '19',
 '16',
 '16',
 '20',
 '21',
 '22',
 '23',
 '24',
 '24',
 '25',
 '22',
 '26',
 '27',
 '28']

In [96]:
#converts all events to vectors
model = gensim.models.Word2Vec(sentences = list(sequences), size=100, window=5, workers =4)

In [97]:
model['10'] #the vector for event 10


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



array([ 3.1500861e-01,  7.2295435e-02, -4.9471113e-01,  1.4954216e+00,
        1.2623416e+00,  1.3976128e+00,  1.6599312e+00, -1.0113187e+00,
        3.0863936e+00, -1.6054591e-02, -1.2652625e+00,  1.0033858e+00,
       -1.7524244e+00, -1.5878116e+00,  1.1844056e+00, -6.2697864e-01,
       -6.3417470e-01,  7.2640306e-01, -3.4100798e-01, -6.8081456e-01,
       -2.9588094e-01, -6.1914361e-01, -7.1626806e-01,  6.3908666e-01,
       -2.0048306e+00,  2.9872847e-01, -1.6571507e+00,  4.1623363e-01,
        8.8586077e-02, -2.3977489e+00,  3.0049026e-01, -1.4233612e+00,
       -1.6180042e+00,  1.2411234e+00,  2.1040231e-01,  1.0662171e+00,
       -1.0294812e+00,  1.4815892e+00,  1.0885090e+00, -1.0116651e+00,
       -2.3947719e-01,  1.6433041e+00,  1.7557422e+00, -5.6736308e-01,
        5.7995570e-01,  4.6351096e-01, -9.3725795e-01, -2.6907375e+00,
       -8.7932479e-01, -6.9069248e-01, -1.8033792e-01,  4.9713394e-01,
       -2.9289719e-01, -5.6344384e-01, -5.7697082e-01, -3.0040470e-01,
      

In [98]:
#word2vec only includes events that appeared more than 5 times
#these dataframes only include those events
event5_df = pd.DataFrame(list(model.wv.vocab.keys()), columns=['EVENT'])
pp_event5_df = pd.merge(pp_event_df[['PRIMARY_DIAGNOSIS','EVENT']],event5_df, on='EVENT')
pp_event5_df = pp_event5_df.drop_duplicates()

## Dimension reduction

Use PCA to reduce the size of the vectors and project onto an optimal 3D dimension where clusters of individual events may be identified

In [99]:
from sklearn.decomposition import PCA

In [100]:
X = model[model.wv.vocab]
pca = PCA(n_components=3)
result = pca.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [101]:
pp_event5_df['x'] = result[:, 0]
pp_event5_df['y'] = result[:, 1]
pp_event5_df['z'] = result[:, 2]

In [102]:
uniqcancer = pp_event5_df['PRIMARY_DIAGNOSIS'].unique()

In [104]:

data=[]
color = np.random.seed(seed=20)
desc = [d for d in descriptions.get_descriptions(uniqcancer, 'icd')]
for cancer in uniqcancer:
    color=np.random.randint(255, size=(1, 3))[0]
    x = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['x']
    y = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['y']
    z = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['z']
    desc = descriptions.get_descriptions(cancer, 'icd')[0]
    trace = go.Scatter3d( x = x,
                        y = y,
                        z = z,
                        mode = 'markers',
                        name = desc,
                        marker = dict(size = 3,
                                      color = 'rgb({}, {}, {})'.format(*color)) )
    data.append(trace)

layout = dict(title = 'Visualisation of events',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)

Double click on one of the codes in the legend to look at a specific cancer

the events corresponding to breast and lung cancer seem to be distinctly separated in this 3D space

### patient pathway visualization

Use word2vec with vector size = 3

Add up all the events for each sequence and plot the pathway

In [105]:
model2d = gensim.models.Word2Vec(sentences = list(sequences), size=3, window=5, workers =4,min_count=1)

In [106]:
data=[]

#color code by cancer type
colors = np.random.randint(255, size=(20, 3))
cancercolor = {cancer:color for cancer,color in zip(uniqcancer,colors)}
cancers = sequences.index.get_level_values(1)

npatients = 1000

for s,c in zip(sequences[:npatients], cancers[:npatients]):
    pathway = []
    coord = np.zeros(3)
    
    
    for i in range(len(s)):
        coord += model2d[s[i]]
        
        pathway.append(list(coord))
    p = np.array(pathway)
    
    col = cancercolor[c]
    
    trace = go.Scatter3d( x = p[:,0],
                        y = p[:,1],
                        z = p[:,2],
                        mode = 'lines',
                        name = c,
                        line=dict(width=3,color='rgb({}, {}, {})'.format(*col)) )
    data.append(trace)

layout = dict(title = 'Visualisation of pathways',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



The pathways look like they're clustered well according to the type of cancer, which means a clustering algorithm should work well.