# Healthcare Stats, data analysis

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Read the patient records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./DATA/DATA copy.csv')
# Preview the first five rows (five is also the default for head()).
df.head(n=5)

Unnamed: 0,LastName,FirstName,DOB,Age,Vocation,Smoke,HR,sBP,CholesterolBefore,Cholesterol_b4_level,CholesterolAfter,TAG,Survey,Delta,Group,year
0,Patton,Dylan,1981-10-07,45,Energy manager,0,47,145,1.2,low,0.7,1.2,1,0.5,1,1981-10-07
1,Howard,Sandra,1993-01-27,55,Tax adviser,0,51,115,1.2,low,1.0,0.6,3,0.2,1,1993-01-27
2,Williams,Samantha,1973-12-21,35,IT consultant,0,54,120,2.0,low,1.7,1.3,3,0.3,1,1973-12-21
3,Hensley,Ashley,1981-12-01,45,"Nurse, children's",0,54,103,2.1,low,2.1,1.6,4,0.0,1,1981-12-01
4,Wilson,Robert,1964-06-23,48,Clinical embryologist,0,61,138,2.8,low,2.8,2.1,5,0.0,1,1964-06-23


In [4]:
# Remove the 'year' column; in the preview above its values repeat DOB.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# once 'year' has already been removed.
df = df.drop(columns=['year'], errors='ignore')

In [6]:
# Display the (rows, columns) dimensions of the DataFrame.
df.shape

(200, 15)

In [7]:
# Display the column labels of the DataFrame.
df.columns

Index(['LastName', 'FirstName', 'DOB', 'Age', 'Vocation', 'Smoke', 'HR', 'sBP',
       'CholesterolBefore', 'Cholesterol_b4_level', 'CholesterolAfter', 'TAG',
       'Survey', 'Delta', 'Group'],
      dtype='object')

In [8]:
# Distinct codes present in the Group column.
df['Group'].unique()

array([1, 0])

### The Group column encodes the study arm: 0 = Control (placebo), 1 = Active (drug).

In [32]:
# Count the patients per Group code (1 = Active, 0 = Control).
groups = df['Group'].value_counts()
print('Active group {}'.format(groups.loc[1]))
print('Ctrl group {}'.format(groups.loc[0]))

Active group 100
Ctrl group 100


### Convert group column counts to Relative Frequency using <code>normalize=True</code>

In [36]:
# Share of patients per Group code, scaled from a fraction to a percentage.
relfreq = df['Group'].value_counts(normalize=True).mul(100)

print('Active group normalized {}'.format(relfreq.loc[1]))
print('Ctrl group normalized   {}'.format(relfreq.loc[0]))

Active group normalized 50.0
Ctrl group normalized   50.0


### Convert Smoke column values to normalized value counts

In [41]:
# Percentage of patients per Smoke code
# (0 = non-smoker, 1 = smoker, 2 = ex-smoker).
smoke_relfreq = df['Smoke'].value_counts(normalize=True).mul(100)

print("non-smokers {}".format(smoke_relfreq.loc[0]))
print("smokers     {}".format(smoke_relfreq.loc[1]))
print("ex-smokers  {}".format(smoke_relfreq.loc[2]))

non-smokers 44.0
smokers     42.5
ex-smokers  13.5


## Pandas crosstab() function

In [49]:
# Contingency table: rows are Group codes, columns are Survey values,
# and each cell is the number of patients with that combination.
pd.crosstab(index=df['Group'], columns=df['Survey'])

Survey,1,2,3,4,5
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,17,32,13,14,24
1,21,18,17,23,21


# Measures of Central Tendency (point estimates)

## Mean() or Average. 
- What is the mean age of all patients?

In [52]:
# Arithmetic mean of the Age column over every patient.
avg_age = df['Age'].mean()
print(" Avg age of all patients is {}".format(avg_age))

 Avg age of all patients is 55.07


- what is the average heart rate of all the patients?

In [53]:
# Arithmetic mean of the HR (heart rate) column over every patient.
avg_HR = df['HR'].mean()
print(" Avg heart rate of all patients is {}".format(avg_HR))

 Avg heart rate of all patients is 74.655


- what is the mean age of patients who smoke (1 in column) ?

In [56]:
# Select the Age values of rows with Smoke == 1 (smokers), then average them.
avg_smoker_age = df.loc[df['Smoke'] == 1, 'Age'].mean()
print(" Avg age of smoker patients is {:.2f}".format(avg_smoker_age))

 Avg age of smoker patients is 58.16


- what is the mean age of patients who do not smoke (0 in column) ?

In [57]:
# Select the Age values of rows with Smoke == 0 (non-smokers), then average them.
avg_NONsmoker_age = df.loc[df['Smoke'] == 0, 'Age'].mean()
print(" Avg age of non-smoker patients is {:.2f}".format(avg_NONsmoker_age))

 Avg age of non-smoker patients is 52.09


- what is the mean age for patients who are ex-smokers (2 in column) ?

In [58]:
# Select the Age values of rows with Smoke == 2 (ex-smokers), then average them.
avg_Xsmoker_age = df.loc[df['Smoke'] == 2, 'Age'].mean()
print(" Avg age of ex-smoker patients is {:.2f}".format(avg_Xsmoker_age))

 Avg age of ex-smoker patients is 55.04


### Save time, use __groupby()__ method 

In [59]:
# Mean Age for every Smoke code in a single call, replacing the three
# separate filtered means computed above.
df.groupby(by='Smoke')['Age'].mean()

Smoke
0    52.090909
1    58.164706
2    55.037037
Name: Age, dtype: float64

## Median 

To find the **median**, put all the values in sorted order.  
- If there is an odd number of values, then the median is the middle value.  
- If there is an even number of values, then the mean of the middle two values is taken.

- what is the median heart rate of patients older than 50?

In [64]:
# Median HR restricted to patients whose Age is strictly greater than 50.
median_HR_50plus = df.loc[df['Age'] > 50, 'HR'].median()
print("median HR for patients 50+ is {}".format(median_HR_50plus))

median HR for patients 50+ is 79.0


- what is the median age of patients who smoke (1) and have a heart rate greater than 70 ?

In [74]:
# Median Age of the rows that satisfy both conditions:
# Smoke == 1 (smoker) and HR strictly above 70.
smokers_median = df[(df['Smoke'] == 1) & (df['HR'] > 70)]['Age'].median()

print("median age of patients who smoke and have HR >70 is {}".format(smokers_median))

median age of patients who smoke and have HR >70 is 60.0


## Mode (most frequently occurring value)

- what is the mode of the smoking variable?

In [75]:
# Most frequent Smoke code; mode() returns a Series because ties are possible.
df['Smoke'].mode()

0    0
dtype: int64

In [79]:
# Raw patient counts per Smoke code (0 = non-smoker, 1 = smoker, 2 = ex-smoker),
# which shows how the mode reported above was determined.
smoker_groups = df['Smoke'].value_counts()

print("non-smokers count {}".format(smoker_groups.loc[0]))
print("smokers count     {}".format(smoker_groups.loc[1]))
print("ex-smokers count  {}".format(smoker_groups.loc[2]))

non-smokers count 88
smokers count     85
ex-smokers count  27


# Standard Deviation & Variance

The **standard deviation** can be understood as the typical (roughly average) difference between each value of a continuous numerical variable and the mean of that variable.  Difference implies subtraction.

**Variances** are very useful in statistics, but a variance is expressed in the squared units of the variable.  For the spread to make sense as a summary statistic, we need to express it in the same units as our variable. We take the square root of the variance to get the standard deviation, which is expressed in the same units as the variable and measures the typical difference between the values and the mean.

## using __std()__ method and __var()__ method with __groupby()__

- what is the standard deviation of the age of patients who smoke vs those who do not smoke?

In [83]:
# Standard deviation of Age within each Smoke code
# (0 = non-smoker, 1 = smoker, 2 = ex-smoker).
smoker_std = df.groupby(by='Smoke')['Age'].std()

print("non-smokers age std {:.2f}".format(smoker_std.loc[0]))
print("smokers age std     {:.2f}".format(smoker_std.loc[1]))
print("ex-smokers age std  {:.2f}".format(smoker_std.loc[2]))

non-smokers age std 12.03
smokers age std     12.36
ex-smokers age std  12.92


- what is the variance of the age of patients who smoke vs those who do not smoke?

In [86]:
# Variance of Age within each Smoke code
# (0 = non-smoker, 1 = smoker, 2 = ex-smoker).
smoker_variance = df.groupby(by='Smoke')['Age'].var()

print("non-smokers age variance {:.2f}".format(smoker_variance.loc[0]))
print("smokers age variance     {:.2f}".format(smoker_variance.loc[1]))
print("ex-smokers age variance  {:.2f}".format(smoker_variance.loc[2]))

non-smokers age variance 144.61
smokers age variance     152.85
ex-smokers age variance  167.04


## Range

The **range** is the difference between the minimum and the maximum value of a continuous numerical variable.  The `min()` and the `max()` methods for series objects give these values.

- What is the minimum age of all the patients?
- what is the maximum age of all the patients?
- what is the age range of all the patients?

In [91]:
# Compute the extremes of Age first, then their difference (the range).
minAge = df['Age'].min()
maxAge = df['Age'].max()
age_range = maxAge - minAge

# Report the three values in the same order as the questions above.
print(" min age: {}".format(minAge))
print(" max age: {}".format(maxAge))
print(" age range is: {}".format(age_range))

 min age: 32
 max age: 77
 age range is: 45


## Quantile | Quartiles 

Three values, the **quartiles**, divide the sorted data into four equal parts.  The lowest of these three values (the **first quartile**) divides the data into two parts, with a quarter of the values being lower than it and three-quarters being higher.  The second divides the data values equally (the median or **second quartile**).  The third is a value that has three-quarters of the values less than it and a quarter more than it (the **third quartile**).

### selecting quartiles for the __quantile()__ method

In [109]:
# First, second (median) and third quartiles of Age, returned as a Series
# indexed by the requested quantile levels.
ageQuant = df['Age'].quantile(q=[0.25, 0.5, 0.75])
print("quartile Age")
# print() renders the Series via str(), the same text '{}'.format() produced.
print(ageQuant)

quartile Age
0.25    45.0
0.50    56.0
0.75    66.0
Name: Age, dtype: float64


- what are the 95<sup>th</sup> percentile values of age for patients who smoke vs non-smokers?

In [114]:
# 95th percentile of Age within each Smoke code
# (0 = non-smoker, 1 = smoker, 2 = ex-smoker).
smoker95 = df.groupby(by='Smoke')['Age'].quantile(q=0.95)

print('non-smoker 95% : {}'.format(smoker95.loc[0]))
print('smoker 95%     : {}'.format(smoker95.loc[1]))
print('ex-smoker 95%  : {}'.format(smoker95.loc[2]))

non-smoker 95% : 74.0
smoker 95%     : 75.0
ex-smoker 95%  : 73.7


## Interquartile range (IQR) = 3rd quartile - 1st quartile

In [115]:
# Interquartile range of Age: third quartile (0.75) minus first quartile (0.25).
IQR = df['Age'].quantile(q=0.75) - df['Age'].quantile(q=0.25)
print(" the IQR is {}".format(IQR))

 the IQR is 21.0
