# Correlations

This notebook looks at various methods of investigating correlation within the data.

In [116]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls

import load
import descriptions as desc

Load the datasets and show the ten most common and the ten deadliest cancers as a useful reference.

In [117]:
# Patient table; add_descriptions=True adds columns of descriptions to the codes
df_av_patient = load.load_table("av_patient", add_descriptions=True)

In [118]:
# Tumour table, loaded with the same add_descriptions=True option
df_av_tumour = load.load_table("av_tumour", add_descriptions=True)

In [119]:
# Keep only the patients whose NEWVITALSTATUS is "D"; every later cell that
# works on deaths starts from this frame.
df_dead = df_av_patient.loc[lambda patients: patients["NEWVITALSTATUS"] == "D"]

# Number of those patients per death location code
vc_loc_dead = df_dead["DEATHLOCATIONCODE"].value_counts()

In [120]:
# Count tumours per site code; value_counts() orders the result from most
# to least frequent, so the first ten entries are the ten most common.
vc_cancers = df_av_tumour["SITE_ICD10_O2_3CHAR"].value_counts()
topTenCancers = vc_cancers.head(10).index

print(
    "---------------------------",
    "Top ten most common cancers",
    "---------------------------",
    sep="\n",
)
for site_code in topTenCancers:
    # The lookup returns a sequence; its first element is the description text
    site_description = desc.get_deathcause_description(site_code)[0]
    print(" -- ".join((site_code, site_description)))

---------------------------
Top ten most common cancers
---------------------------
C44 -- Other malignant neoplasms of skin
C50 -- Malignant neoplasm of breast
C61 -- Malignant neoplasm of prostate
C34 -- Malignant neoplasm of bronchus and lung
D06 -- Carcinoma in situ of cervix uteri
C18 -- Malignant neoplasm of colon
C43 -- Malignant melanoma of skin
C20 -- Malignant neoplasm of rectum
C64 -- Malignant neoplasm of kidney, except renal pelvis
D09 -- Carcinoma in situ of other and unspecified sites


In [121]:
# Count deaths per underlying cause code (missing codes are excluded);
# the first ten entries are the ten most frequent causes of death.
vc_dead = df_dead["DEATHCAUSECODE_UNDERLYING"].value_counts(dropna=True)
topTenDeadliest = vc_dead.head(10).index

print(
    "---------------------------",
    "Top ten deadliest cancers",
    "---------------------------",
    sep="\n",
)
for cause_code in topTenDeadliest:
    cause_description = desc.get_deathcause_description(cause_code)[0]
    print(" -- ".join((cause_code, cause_description)))

# C80 appears twice, once as C800 (site unknown) and the
# other as C809 (site unspecified).

---------------------------
Top ten deadliest cancers
---------------------------
C349 -- Bronchus or lung, unspecified
C259 -- Pancreas, unspecified
C159 -- Oesophagus, unspecified
C800 -- 
C719 -- Brain, unspecified
C169 -- Stomach, unspecified
C189 -- Colon, unspecified
C221 -- Intrahepatic bile duct carcinoma
C679 -- Bladder, unspecified
C809 -- 


Define some functions

In [122]:
def fisher(dataset, var1, var2):

    """
    Description
    -----------
    Perform Fisher's Exact test to see if there is an association between
    var1[0] having value var1[1] and var2[0] having value var2[1].

    The rows of dataset are counted into a 2x2 contingency table. Cell names
    follow the convention used here:
        https://www.sheffield.ac.uk/polopoly_fs/1.43998!/file/tutorial-9-fishers.pdf

                         var1 == value   var1 != value
        var2 == value          a               b
        var2 != value          c               d

    Arguments
    -----------
    dataset [df] - DataFrame containing the columns named in var1[0] and var2[0]
    var1  [list] - Name and value of variable 1 (e.g. ["DEATHCAUSECODE_UNDERLYING", "C30"] )
    var2  [list] - Similar to var 1 (e.g. ["DEATHLOCATIONCODE", "5"])

    Returns
    -----------
    The p-value reported by scipy.stats.fisher_exact (two-sided by default)
    for the table [[a, b], [c, d]], or the string "NaN" when any of the four
    cells is empty (the test is deliberately skipped in that case).

    Example Usage
    -----------
    fisher(df_dead, ["DEATHCAUSECODE_UNDERLYING", "C259"], ["DEATHLOCATIONCODE", "2"])
    """

    var1_name, var1_value = var1[0], var1[1]
    var2_name, var2_value = var2[0], var2[1]

    # Evaluate each condition once and reuse the boolean masks for all four
    # cells, rather than re-running the comparisons and materialising four
    # filtered copies of the frame just to count their rows.
    is_var1 = dataset[var1_name] == var1_value
    is_var2 = dataset[var2_name] == var2_value

    a = int((is_var1 & is_var2).sum())
    b = int((~is_var1 & is_var2).sum())
    c = int((is_var1 & ~is_var2).sum())
    d = int((~is_var1 & ~is_var2).sum())

    # Callers print this sentinel as-is, so it stays a string rather than a
    # float nan to keep existing output unchanged.
    if 0 in (a, b, c, d):
        return "NaN"

    _, pvalue = stats.fisher_exact([[a, b], [c, d]])
    return pvalue

In [123]:
# Investigate death location of pancreatic cancer patients (C259):
# for each location code, test whether dying of C259 is associated with
# dying at that location.
# NOTE(review): the C259 value counts later in this notebook also contain
# location code "6", which is not in this list - confirm whether it should
# be tested too.
location_codes = [
    "1", "2", "3", "4",
    "4077", "4087", "4097", "4107", "4117", "4127", "4137",
    "5", "X",
]

for location in location_codes:
    pvalue = fisher(
        df_dead,
        ["DEATHCAUSECODE_UNDERLYING", "C259"],
        ["DEATHLOCATIONCODE", location],
    )
    location_name = desc.get_descriptions(location, "deathlocation")[0]
    print("Location: {} ({})".format(location_name, location))
    print("p-value:", pvalue)
    print("--------")

Location: HOSPITAL (1)
p-value: 1.3352191287300447e-234
--------
Location: PRIVATE HOME (2)
p-value: 0.0
--------
Location: HOSPICE NOS (3)
p-value: 0.7952674113427535
--------
Location: NURSING HOME (4)
p-value: 0.0
--------
Location: NHS HOSPICE / SPECIALIST PALLIATIVE CARE UNIT (4077)
p-value: NaN
--------
Location: VOLUNTARY HOSPICE / SPECIALIST PALLIATIVE CARE UNIT (4087)
p-value: NaN
--------
Location: CARE HOME (4097)
p-value: NaN
--------
Location: PATIENT"S OWN HOME (4107)
p-value: NaN
--------
Location: OTHER PRIVATE RESIDENCE (E.G. RELATIVES HOME, CARERS HOME) (4117)
p-value: NaN
--------
Location: CARE HOME WITH NURSING (4127)
p-value: NaN
--------
Location: CARE HOME WITHOUT NURSING (4137)
p-value: NaN
--------
Location: OTHER (5)
p-value: 1.1524342982259484e-16
--------
Location: UNKNOWN (X)
p-value: 4.382751374553357e-137
--------


In [124]:
def cancers_by_deathlocation(locationCode, dataset=None):

    """
    Description
    -----------
    Returns the number of patients who died of different cancers at locationCode

    i.e. cancers_by_deathlocation("2") returns number of deaths by cancer for
    patients who died at PRIVATE HOME

    Arguments
    -----------
    locationCode [str] - Location code, refer to lookup_tables/zdeathlocation.csv
    dataset       [df] - Optional DataFrame with the columns "DEATHLOCATIONCODE"
                         and "DEATHCAUSECODE_UNDERLYING". Defaults to the
                         notebook-level df_dead, so existing calls behave as before.

    Returns
    -----------
    Series indexed by underlying death cause code, holding the number of rows
    with that cause at locationCode (most frequent first; rows with a missing
    cause code are not counted). Empty if no row matches locationCode.
    """

    # Fall back to the notebook global only when no frame is supplied, so the
    # function can also be used (and checked) on any other frame.
    if dataset is None:
        dataset = df_dead

    died_at_location = dataset[dataset["DEATHLOCATIONCODE"] == locationCode]
    return died_at_location["DEATHCAUSECODE_UNDERLYING"].value_counts(dropna=True)

In [125]:
# Number of C259 deaths recorded at each death location code
df_dead.loc[
    df_dead["DEATHCAUSECODE_UNDERLYING"] == "C259", "DEATHLOCATIONCODE"
].value_counts()

4    4815
1    4306
2    2311
X    1186
5     477
6     239
3      14
Name: DEATHLOCATIONCODE, dtype: int64