# Correlations

This notebook is for looking at various methods of investigating correlation within the data.

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [9]:
# Load the patient table from the Simulacrum v1.1.0 release directory.
# The path is relative, so it resolves against the notebook's working directory.
av_patient = pd.read_csv('./simulacrum_release_v1.1.0/sim_av_patient.csv')

In [10]:
# Keep only the patients whose vital status is "D" (taken to mean deceased,
# per the variable name), then tally how often each death location code occurs.
dead = av_patient.loc[av_patient["NEWVITALSTATUS"].eq("D")]
loc_dead_vc = dead.loc[:, "DEATHLOCATIONCODE"].value_counts()

In [12]:
# Tally underlying cause-of-death codes across every patient record and
# keep the labels of the ten most frequent codes.
vc = av_patient.loc[:, 'DEATHCAUSECODE_UNDERLYING'].value_counts()
topTenCancers = vc.iloc[:10].index
print(topTenCancers)

Index(['C349', 'C259', 'C159', 'C61', 'C189', 'C509', 'C679', 'C800', 'C169',
       'C719'],
      dtype='object')


In [13]:
# Same tally restricted to the `dead` subset (missing codes excluded);
# dead_vc is reused later as the "general population" reference counts.
dead_vc = dead.loc[:, "DEATHCAUSECODE_UNDERLYING"].value_counts(dropna=True)
topTenDeadliest = dead_vc.head(10).index
print(topTenDeadliest)

Index(['C349', 'C259', 'C159', 'C800', 'C719', 'C169', 'C189', 'C221', 'C679',
       'C809'],
      dtype='object')


In [18]:
def cancers_by_deathlocation(locationCode, deaths=None):
    """
    Description
    -----------
    Returns the number of patients who died of different cancers at locationCode

    i.e. cancers_by_deathlocation("2") returns number of deaths by cancer for
    patients who died at PRIVATE HOME

    Arguments
    -----------
    locationCode [str] - Location code, refer to lookup_tables/zdeathlocation.csv
    deaths [DataFrame, optional] - Table of deceased patients containing the
        DEATHLOCATIONCODE and DEATHCAUSECODE_UNDERLYING columns. When omitted,
        the notebook-level `dead` frame is used.

    Returns
    -----------
    [value_counts] - Count of each DEATHCAUSECODE_UNDERLYING value among the
        patients whose DEATHLOCATIONCODE equals locationCode; missing cause
        codes are excluded. Empty if no patient matches.
    """
    if deaths is None:
        # Fall back to the notebook global so existing one-argument calls keep working.
        deaths = dead

    # The comparison is an exact match, so locationCode must have the same
    # type as the values stored in the column (the example above uses a string).
    diedAtLoc = deaths[deaths["DEATHLOCATIONCODE"] == locationCode]
    return diedAtLoc["DEATHCAUSECODE_UNDERLYING"].value_counts(dropna=True)

In [27]:
def deviation(subset_vc, genPop_vc):
    """
    Description
    -----------
    Function to investigate correlations/differences between data in a specified subset
    and the same data in the general population.

    A z-score is calculated using sqrt(n_counts) as the standard deviation. ***NOTE*** - this is not meaningful if
    the number of patients in a given subset (see "Patients in subset: ") is low.

    Example usage:- deviation(cancers_by_deathlocation("5"), dead_vc) compares the rate of incidence of cancers
    among those who died in NURSING HOME to the rate of incidence of cancer in the general population
    (i.e. everyone who died)

    The results are printed, one line per category of subset_vc; nothing is returned.

    Arguments
    -----------
    subset_vc [value_counts] - Value counts in subset of interest
    genPop_vc [value_counts] - Value counts in general population. A category that
        appears in subset_vc but not here is treated as having a count of zero.

    Raises
    -----------
    ZeroDivisionError - if the counts in genPop_vc sum to zero
    """
    n_subset = sum(subset_vc.tolist())  # number of people in subset e.g. people who died in nursing home
    n_genPop = sum(genPop_vc.tolist())  # number of people in "General Population" e.g. all people who died
    rescaleFactor = float(n_subset) / float(n_genPop)

    print("Total size of subset: " + str(n_subset))
    print("Total size of general population: " + str(n_genPop))
    print("--------------------------------------")

    for category, subset in subset_vc.items():
        # subset: number of people in the subset matching this category.
        # Look the category up by label; a category absent from the general
        # population contributes an expected count of zero instead of a KeyError.
        if category in genPop_vc.index:
            genPop_count = genPop_vc.loc[category]
        else:
            genPop_count = 0

        # number of people in genPop matching the category, rescaled to the size of the subset
        rescaled_genPop = genPop_count * rescaleFactor
        stdDev = np.sqrt(subset)  # poissonian standard deviation

        if stdDev != 0:
            z_score = (subset - rescaled_genPop) / stdDev
        else:
            z_score = "NaN"

        # !s keeps the plain str() rendering and lets non-string labels be printed too.
        print(f"{category!s}---- Patients in subset: {subset!s}, z-score: {z_score!s}")

In [38]:
# Compare cause-of-death counts for patients who died at location code "5"
# against the counts for everyone in `dead`, printing one z-score per cause code.
deviation(cancers_by_deathlocation("5"), dead_vc)

Total size of subset: 7055
Total size of general population: 148399
--------------------------------------
C349---- Patients in subset: 2049, z-score: -0.7269182478869998
C259---- Patients in subset: 477, z-score: -7.236583081601007
C719---- Patients in subset: 450, z-score: 10.890731271635458
C159---- Patients in subset: 413, z-score: -2.057944328517344
C800---- Patients in subset: 262, z-score: 0.44955311178567464
C189---- Patients in subset: 231, z-score: 4.119478839163136
C679---- Patients in subset: 203, z-score: 3.6170595150743234
C169---- Patients in subset: 203, z-score: -0.8407805470173629
C221---- Patients in subset: 143, z-score: -0.9424229205668775
C80---- Patients in subset: 133, z-score: 0.3858472543921058
C61---- Patients in subset: 131, z-score: 3.8443349407684617
C809---- Patients in subset: 125, z-score: -0.9681214966512746
C64---- Patients in subset: 110, z-score: 1.5674735044393888
C260---- Patients in subset: 106, z-score: 2.7782327033096217
C56---- Patients in sub