# Clinical Data Visualization - Plotly

In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = 'plotly_dark'
import plotly.express as px

In [2]:
# Load the clinical data set (path is relative to the notebook) and preview it.
data_path = './DATA/DATA copy.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,LastName,FirstName,DOB,Age,Vocation,Smoke,HR,sBP,CholesterolBefore,Cholesterol_b4_level,CholesterolAfter,TAG,Survey,Delta,Group,year
0,Patton,Dylan,1981-10-07,45,Energy manager,0,47,145,1.2,low,0.7,1.2,1,0.5,1,1981-10-07
1,Howard,Sandra,1993-01-27,55,Tax adviser,0,51,115,1.2,low,1.0,0.6,3,0.2,1,1993-01-27
2,Williams,Samantha,1973-12-21,35,IT consultant,0,54,120,2.0,low,1.7,1.3,3,0.3,1,1973-12-21
3,Hensley,Ashley,1981-12-01,45,"Nurse, children's",0,54,103,2.1,low,2.1,1.6,4,0.0,1,1981-12-01
4,Wilson,Robert,1964-06-23,48,Clinical embryologist,0,61,138,2.8,low,2.8,2.1,5,0.0,1,1964-06-23


# Barplot for frequency counts

- Create a bar plot of the number of patients in each smoking group (0 = non-smoker, 1 = smoker, 2 = ex-smoker).

In [3]:
# Number of patients per smoking code.
df['Smoke'].value_counts()

0    88
1    85
2    27
Name: Smoke, dtype: int64

### Use a list for the y values instead of entering them manually

In [9]:
# Per-group patient counts as a plain Python list (same order as value_counts()).
smoke_group_counts = df['Smoke'].value_counts()
y = smoke_group_counts.tolist()
y

[88, 85, 27]

In [13]:
# Simple bar plot of the number of patients in each smoking group.

# Smoke codes mapped to their display labels, in the order the bars should appear.
smoke_labels = {0: 'Non-smokers', 1: 'Smokers', 2: 'Ex-smokers'}

# value_counts() orders by frequency, not by code, so reindex by the codes to
# guarantee every count lines up with its label and colour. A group that is
# absent from the data gets a count of 0 instead of shifting the other bars.
smoke_counts = df.Smoke.value_counts().reindex(list(smoke_labels), fill_value=0)

smokers = go.Figure()

smokers.add_trace(go.Bar(
    x=list(smoke_labels.values()),
    y=smoke_counts.to_list(),
    marker={'color': ['green', 'red', 'orange']}
))

# Title and axis labels
smokers.update_layout(
    title='Number of non-smokers vs smokers vs ex-smokers',
    xaxis=dict(title='Groups of smokers'),
    yaxis=dict(title='Counts'))

smokers.show()

In [14]:
# Relative frequency of each smoking group, expressed as a percentage.
df['Smoke'].value_counts(normalize=True).mul(100)

0    44.0
1    42.5
2    13.5
Name: Smoke, dtype: float64

In [21]:
# Bar plot of smoking groups with count labels, hover text and bar outlines.

# Smoke codes mapped to their display labels, in the order the bars should appear.
smoke_labels = {0: 'Non-smokers', 1: 'Smokers', 2: 'Ex-smokers'}

# value_counts() orders by frequency, not by code; reindexing by code keeps
# counts, labels, colours and hover text aligned whatever the data looks like.
smoke_counts = df.Smoke.value_counts().reindex(list(smoke_labels), fill_value=0)
bar_heights = smoke_counts.to_list()

# Percentages are derived from the counts rather than typed in by hand, so the
# hover text cannot drift from the data. ':g' drops a trailing '.0' (44.0 -> 44).
smoke_percentages = (smoke_counts / smoke_counts.sum() * 100).round(1).to_list()
hover_labels = [
    f'{pct:g}% of patients are {label.lower()}'
    for pct, label in zip(smoke_percentages, smoke_labels.values())
]

smokers = go.Figure()

smokers.add_trace(go.Bar(
    x=list(smoke_labels.values()),
    y=bar_heights,
    text=bar_heights,          # show the count above each bar
    textposition='outside',
    hovertext=hover_labels,
    marker={'color': ['green', 'rgba(255, 0, 0, 1)', 'orange'],
            'line': {'color': 'black', 'width': 1}},
))

smokers.update_layout(
    title='Number of non-smokers vs smokers vs ex-smokers',
    xaxis=dict(title='Groups of smokers'),
    xaxis_tickangle=-25,
    yaxis=dict(title='Counts'))

smokers.show()

In [23]:
# Contingency table: treatment group (rows) against survey answer (columns).
pd.crosstab(df['Group'], df['Survey'])

Survey,1,2,3,4,5
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,17,32,13,14,24
1,21,18,17,23,21


In [24]:
# Grouped bar plot of survey answers for each treatment group.

# Build the counts from the data instead of copying them from the crosstab
# output by hand, so the figure stays correct if the data set changes.
# reindex() keeps both group rows present even if one is missing from the data.
survey_by_group = pd.crosstab(df.Group, df.Survey).reindex([0, 1], fill_value=0)

# Survey answers as strings so Plotly treats the x axis as categorical.
survey_answers = survey_by_group.columns.astype(str).to_list()

surv_grp_fig = go.Figure()

# (Group code, legend name, bar colour); Group 1 is the active arm, 0 the control.
group_styles = [
    (1, 'Active group', 'orange'),
    (0, 'Control group', 'deepskyblue'),
]

for group_code, group_name, bar_colour in group_styles:
    group_counts = survey_by_group.loc[group_code].to_list()
    surv_grp_fig.add_trace(go.Bar(
        x=survey_answers,
        y=group_counts,
        text=group_counts,       # show the count above each bar
        textposition='outside',
        name=group_name,
        marker={'color': bar_colour, 'opacity': 0.7}
    ))

surv_grp_fig.update_layout(
    title='Survey frequencies by treatment group',
    xaxis=dict(title='Survey answer'),
    yaxis=dict(title='Counts'),
    barmode='group')

surv_grp_fig.show()

# Histograms for the distribution of numerical variables

In [30]:
# Quick histogram of patient ages; Plotly Express chooses the bins.
age_hist = px.histogram(df, x='Age')
age_hist.show()

In [31]:
# Overlaid age histograms for non-smokers (Smoke == 0) and smokers (Smoke == 1).

# Both traces use the same 5-year bins from 10 to 90 so the bars are comparable.
age_bins = dict(start=10, end=90, size=5)

ages_smoke = go.Figure()

for smoke_code, trace_name, trace_colour in [
    (0, 'non-smokers', 'orange'),
    (1, 'smokers', 'deepskyblue'),
]:
    ages_smoke.add_trace(go.Histogram(
        x=df[df.Smoke == smoke_code]['Age'],
        name=trace_name,
        marker_color=trace_colour,
        xbins=age_bins
    ))

ages_smoke.update_layout(
    barmode='overlay',
    title='Age distribution of non-smokers and smokers',
    xaxis=dict(title='Age'),
    yaxis=dict(title='Count'))

# Semi-transparent bars so the overlapping histograms stay visible.
ages_smoke.update_traces(opacity=0.75)

ages_smoke.show()

In [38]:
# Box plot of age for each Smoke code, built with Plotly Express.
box_options = dict(
    x='Smoke',
    y='Age',
    title='Distribution of age in non-smokers and smokers',
)
ages_smoke_box_px = px.box(df, **box_options)

ages_smoke_box_px.show()

In [39]:
# Ages of each smoking group as plain Python lists (0 = non-smoker, 1 = smoker, 2 = ex-smoker).
non_smoker_age = df.loc[df['Smoke'] == 0, 'Age'].tolist()
smoker_age = df.loc[df['Smoke'] == 1, 'Age'].tolist()
ex_smoker_age = df.loc[df['Smoke'] == 2, 'Age'].tolist()


In [40]:
# Box plots of age per smoking group, one trace per group so that each can be
# styled individually. Relies on the age lists built in the previous cell.
ages_smoke_box = go.Figure()

# boxmean=True draws the mean; boxmean='sd' draws the mean and standard deviation.
# boxpoints='all' shows every individual observation next to its box.
ages_smoke_box.add_trace(go.Box(
    y=non_smoker_age,
    name='non-smokers',
    marker_color='green',
    boxmean=True,
    boxpoints='all'
))

ages_smoke_box.add_trace(go.Box(
    y=smoker_age,
    name='smokers',
    marker_color='red',
    boxmean='sd',
    boxpoints='all'
))

ages_smoke_box.add_trace(go.Box(
    y=ex_smoker_age,
    name='ex-smokers',
    marker_color='orange',
    boxmean='sd',
    boxpoints='all'
))

# The y axis shows ages, not counts, so it is labelled accordingly.
ages_smoke_box.update_layout(title='Distribution of ages',
                             xaxis={'title': 'Group'},
                             yaxis={'title': 'Age'})

ages_smoke_box.show()

# Scatterplots for relationships between numerical variables

In [47]:
# Scatter plot: age on the x axis, systolic blood pressure on the y axis.

age_sbp = go.Figure()

age_sbp.add_trace(go.Scatter(
    x=df.Age,
    y=df.sBP,
    mode='markers',  # points only, no connecting lines
))

age_sbp.update_layout(title="Age vs systolic blood pressure",
                      xaxis=dict(title="Age"),
                      yaxis=dict(title="Systolic blood pressure"))

age_sbp.show()

In [52]:
# Imported explicitly because the OLS trendline below needs statsmodels;
# a missing install then fails here with a clear ImportError.
import statsmodels

# Scatter plot of age vs systolic BP, split by treatment group, with marginal
# box plots on both axes and an OLS trendline per group.
#
# Group is stored as an integer, which Plotly Express would colour on a
# continuous scale (a single colour bar, one trendline, one marginal box).
# Casting it to string makes it a discrete variable so each group gets its own
# colour, trendline and marginal box plots, as the title promises.
scatter_df = df.assign(Group=df.Group.astype(str))

age_sbp_group_px = px.scatter(
    scatter_df,
    x='Age',
    y='sBP',
    size='HR',       # marker size scales with heart rate
    color='Group',   # one colour (and trendline) per treatment group
    marginal_y='box',
    marginal_x='box',
    trendline='ols',
    title='Comparing age vs systolic BP for each of the two groups',
    labels={'sBP': 'Systolic BP'}  # display name used in place of the column name
)
age_sbp_group_px.show()

In [53]:
# Age vs systolic BP in one panel per treatment group; markers are coloured by
# heart rate on the Viridis scale and each panel gets an OLS trendline.
age_sbp_group_px_facet = px.scatter(
    df,
    x='Age',
    y='sBP',
    color='HR',
    facet_col='Group',
    trendline='ols',
    title='Separate scatter plots per group',
    labels={'sBP': 'Systolic BP'},  # display name used in place of the column name
    color_continuous_scale=px.colors.sequential.Viridis)
age_sbp_group_px_facet.show()