In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as po
import plotly.graph_objs as go
from plotly import tools
import load
import descriptions
import gensim
po.init_notebook_mode(connected=True)

## Patient pathway

In [2]:
#this was made from the cancerdata_EDA.ipynb
patient_pathways = pd.read_pickle('patient_pathways.pkl')

#remove irrelevant columns
patient_pathways = patient_pathways.drop(columns=['PATIENTID',
                                                  'MERGED_PATIENT_ID_x',
                                                  'MERGED_TUMOUR_ID_x',
                                                  'MERGED_REGIMEN_ID_x',
                                                  'MERGED_PATIENT_ID_y',
                                                  'MERGED_OUTCOME_ID',
                                                  'MERGED_PATIENT_ID_y',
                                                  'MERGED_TUMOUR_ID_y',
                                                  'MERGED_CYCLE_ID',
                                                  'MERGED_DRUG_DETAIL_ID',
                                                  'MERGED_PATIENT_ID',
                                                  'MERGED_REGIMEN_ID_y',
                                                  'LINKNUMBER'])

print("shape of patient_pathways ",patient_pathways.shape)

shape of patient_pathways  (2666148, 37)


# Assigning events

use the columns 'PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','CYCLE_NUMBER','DRUG_GROUP' for features of an event

find all unique events and label that event by a number

In [3]:
pp_df = patient_pathways[['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','CYCLE_NUMBER','DRUG_GROUP']]
pp_df['PRIMARY_DIAGNOSIS']=pp_df['PRIMARY_DIAGNOSIS'].apply(lambda x: str(x)[:3])
pp_df = pp_df.drop(columns=['CYCLE_NUMBER'])
top20 = pp_df['PRIMARY_DIAGNOSIS'].value_counts()[:20].keys()
pp_df = pp_df.loc[pp_df['PRIMARY_DIAGNOSIS'].isin(top20)]
events_df = pp_df.drop_duplicates()

pp_df = pp_df.reset_index()
events_df = events_df.reset_index()
events_df = events_df.drop(columns=['PATIENTID'])
events_df['EVENT'] = events_df.index
events_df['EVENT'] = events_df['EVENT'].astype(str)
pp_event_df = pd.merge(pp_df,events_df,on=['PRIMARY_DIAGNOSIS','BENCHMARK_GROUP','DRUG_GROUP'])
pp_event_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,PATIENTID,PRIMARY_DIAGNOSIS,BENCHMARK_GROUP,DRUG_GROUP,EVENT
0,10000283,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
1,10289804,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
2,10289804,C61,ZOLEDRONIC ACID,CHLORAMBUCIL,0
3,10000283,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO,1
4,10151044,C61,FLUOROURACIL + MITOMYCIN + RT,NOT CHEMO,1


In [4]:
len(pp_event_df),len(events_df),len(pp_df)

(2215580, 91733, 2215580)

## word2vec

In [5]:
sequences = pp_event_df.groupby('PATIENTID')['EVENT'].agg(lambda x: list(x))

In [6]:
model = gensim.models.Word2Vec(sentences = list(sequences), size=100, window=5, workers =4)

In [7]:
model['10'] #the vector for event 10


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



array([-2.070517  ,  2.1879508 ,  1.1154172 ,  0.35213077,  1.0673561 ,
        1.4193308 ,  0.84004897,  1.0083255 ,  1.6338832 ,  0.50554574,
       -2.6780105 ,  1.5354341 ,  0.6448955 , -1.369434  ,  1.475701  ,
        0.47440708,  0.37967268,  0.38750654, -1.0180875 , -0.8660226 ,
        0.6240822 , -1.7118249 , -0.20602542,  0.5371956 , -0.06996764,
       -0.4879148 ,  0.25816095, -0.04329139, -1.4241064 , -0.29733062,
       -0.25143945, -0.0339114 , -0.16636772,  0.6360401 ,  1.3258208 ,
       -1.244749  , -1.2894332 ,  0.5003014 , -1.6456642 , -1.9607785 ,
        0.08774464,  0.17658359, -0.44989017, -1.2542211 , -1.01085   ,
        1.3134317 ,  0.700614  ,  0.07089689,  1.7683525 , -0.07408204,
       -1.4409796 , -1.6694467 , -2.0074744 , -0.14236467, -1.5577667 ,
        0.8357121 ,  1.560373  , -1.4060416 , -0.39185703,  0.54024273,
        0.47439316,  0.5604671 ,  0.29455724, -1.0488075 , -1.1535045 ,
        1.1271003 , -0.48722482, -1.2999704 , -1.4232239 ,  0.10

In [9]:
#word2vec only includes events that appeared more than 5 times
#these dataframes only include those events
event5_df = pd.DataFrame(list(model.wv.vocab.keys()), columns=['EVENT'])
pp_event5_df = pd.merge(pp_event_df[['PRIMARY_DIAGNOSIS','EVENT']],event5_df, on='EVENT')
pp_event5_df = pp_event5_df.drop_duplicates()

## Dimension reduction

In [10]:
from sklearn.decomposition import PCA

In [11]:
X = model[model.wv.vocab]
pca = PCA(n_components=3)
result = pca.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [13]:
pp_event5_df['x'] = result[:, 0]
pp_event5_df['y'] = result[:, 1]
pp_event5_df['z'] = result[:, 2]

In [14]:
uniqcancer = pp_event5_df['PRIMARY_DIAGNOSIS'].unique()

In [15]:

data=[]
color = np.random.seed(seed=20)
for cancer in uniqcancer:
    color=np.random.randint(255, size=(1, 3))[0]
    x = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['x']
    y = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['y']
    z = pp_event5_df[pp_event5_df['PRIMARY_DIAGNOSIS']==cancer]['z']
    trace = go.Scatter3d( x = x,
                        y = y,
                        z = z,
                        mode = 'markers',
                        name = cancer,
                        marker = dict(size = 3,
                                      color = 'rgb({}, {}, {})'.format(*color)) )
    data.append(trace)

layout = dict(title = 'Events colour coded by cancer',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
po.iplot(fig)

Double click on one of the codes in the legend to look at a specific cancer

In [16]:
desc = descriptions.get_descriptions(uniqcancer, 'icd')
for i,j in zip(uniqcancer,desc):
    print(i," ",j)

C61   MALIGNANT NEOPLASM OF PROSTATE
C50   MALIGNANT NEOPLASM OF BREAST
C15   MALIGNANT NEOPLASM OF OESOPHAGUS
C83   NON-HODGKIN"S LYMPHOMA
C34   MALIGNANT NEOPLASM OF LUNG
C91   LEUKAEMIA
C82   NON-HODGKIN"S LYMPHOMA
C56   MALIGNANT NEOPLASM OF OVARY
C90   MYELOMA
C25   MALIGNANT NEOPLASM OF PANCREAS
C16   MALIGNANT NEOPLASM OF STOMACH
C67   MALIGNANT NEOPLASM OF BLADDER
C85   NON-HODGKIN"S LYMPHOMA
C18   MALIGNANT NEOPLASM OF COLON
C20   MALIGNANT NEOPLASM OF RECTUM
C92   LEUKAEMIA
C81   HODGKIN"S DISEASE
C45   MESOTHELIOMA
C71   MALIGNANT NEOPLASM OF BRAIN
C53   MALIGNANT NEOPLASM OF CERVIX UTERI
