# Load packages, read-in pre-processed data

In [1]:
conda list

# packages in environment at /opt/anaconda3:
#
# Name                    Version                   Build  Channel
_anaconda_depends         2022.10                  py38_2  
_ipyw_jlab_nb_ext_conf    0.1.0            py38hecd8cb5_1  
affinegap                 1.12                     pypi_0    pypi
aiohttp                   3.8.4                    pypi_0    pypi
aiosignal                 1.3.1                    pypi_0    pypi
alabaster                 0.7.12             pyhd3eb1b0_0  
anaconda                  custom                   py38_1  
anaconda-client           1.11.2           py38hecd8cb5_0  
anaconda-navigator        2.3.0            py38hecd8cb5_0  
anyio                     3.5.0            py38hecd8cb5_0  
appdirs                   1.4.4              pyhd3eb1b0_0  
applaunchservices         0.3.0            py38hecd8cb5_0  
appnope                   0.1.2           py38hecd8cb5_1001  
appscript                 1.1.2            py38h9ed2024_0  
argon2-cf


Note: you may need to restart the kernel to use updated packages.


In [2]:
#Import relevant packages

import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

from bokeh.io import show, curdoc, output_notebook
from bokeh.layouts import column
from bokeh.models import (
    ColumnDataSource,
    Label,
    LabelSet,
    CheckboxGroup,
    CustomJS,
    Button,
)
from bokeh.models.annotations import LabelSet
from bokeh.palettes import Category10
from bokeh.plotting import figure

import numpy as np

In [3]:
# Read in the pre-processed language / SO non-response data.

# df: table without the urban-rural classification (it carries Shannon_idx instead).
# NOTE(review): absolute, machine-specific path — this cell only runs where the file
# exists at exactly this location; consider a configurable data directory.
df = pd.read_csv('/Users/loucap/Documents/GitWork/InteractiveGender/Data/cleaned_lang_SO.csv')

# Preview the first five rows.

df.head()

Unnamed: 0,LA_code,LA_name,SO_code,SO_categories,Observation_x,Percentages,Non-response_rate,region_x,Observation_y,Observation_total,Percentage,region_y,Shannon_idx
0,E06000001,Hartlepool,5,Not answered,4554,6.097528,6.097528,North East,1875,92337,2.030605,North East,0.931876
1,E06000002,Middlesbrough,5,Not answered,8298,7.282908,7.282908,North East,10510,143923,7.302516,North East,1.181237
2,E06000003,Redcar and Cleveland,5,Not answered,7046,6.27192,6.27192,North East,1460,136533,1.069339,North East,0.831589
3,E06000004,Stockton-on-Tees,5,Not answered,9268,5.865452,5.865452,North East,5674,196603,2.886019,North East,1.00165
4,E06000005,Darlington,5,Not answered,5010,5.686332,5.686332,North East,4403,107800,4.084416,North East,0.966864


In [3]:
# df2: table with the urban-rural classification (Urb_Rur column).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df2 = pd.read_csv('/Users/loucap/Documents/GitWork/InteractiveGender/Data/urban_rural_SO.csv')

# Preview the first five rows.
# IMPORTANT: the urban-rural classification is only available for English local authorities.
df2.head()

Unnamed: 0,LA_code,LA_name,SO_code,SO_categories,Observation_x,Percentages,Non-response_rate,region_x,Observation_y,Observation_total,Percentage,region_y,Urb_Rur
0,E06000001,Hartlepool,5,Not answered,4554,6.097528,6.097528,North East,1875,92337,2.030605,North East,Predominantly Urban
1,E06000002,Middlesbrough,5,Not answered,8298,7.282908,7.282908,North East,10510,143923,7.302516,North East,Predominantly Urban
2,E06000003,Redcar and Cleveland,5,Not answered,7046,6.27192,6.27192,North East,1460,136533,1.069339,North East,Urban with Significant Rural
3,E06000004,Stockton-on-Tees,5,Not answered,9268,5.865452,5.865452,North East,5674,196603,2.886019,North East,Predominantly Urban
4,E06000005,Darlington,5,Not answered,5010,5.686332,5.686332,North East,4403,107800,4.084416,North East,Predominantly Urban


# Interactive scatterplots

## Relationship between the percentage of non-English speakers and the non-response rate across the 331 Local Authorities in England and Wales

### COLOURED BY REGION

In [4]:
# Attach the urban-rural classification to df by local-authority code.
# Matching on LA_code (instead of relying on both frames sharing the same row order
# and index) keeps the labels correct even if either CSV is re-sorted or filtered.
# drop_duplicates keeps the first classification per code so the lookup index is unique.
urb_rur_by_la = df2.drop_duplicates('LA_code').set_index('LA_code')['Urb_Rur']
df['Urb_Rur'] = df['LA_code'].map(urb_rur_by_la)

In [5]:
# Display the frame to check that the Urb_Rur column was added.
df

Unnamed: 0,LA_code,LA_name,SO_code,SO_categories,Observation_x,Percentages,Non-response_rate,region_x,Observation_y,Observation_total,Percentage,region_y,Shannon_idx,Urb_Rur
0,E06000001,Hartlepool,5,Not answered,4554,6.097528,6.097528,North East,1875,92337,2.030605,North East,0.931876,Predominantly Urban
1,E06000002,Middlesbrough,5,Not answered,8298,7.282908,7.282908,North East,10510,143923,7.302516,North East,1.181237,Predominantly Urban
2,E06000003,Redcar and Cleveland,5,Not answered,7046,6.271920,6.271920,North East,1460,136533,1.069339,North East,0.831589,Urban with Significant Rural
3,E06000004,Stockton-on-Tees,5,Not answered,9268,5.865452,5.865452,North East,5674,196603,2.886019,North East,1.001650,Predominantly Urban
4,E06000005,Darlington,5,Not answered,5010,5.686332,5.686332,North East,4403,107800,4.084416,North East,0.966864,Predominantly Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,W06000020,Torfaen,5,Not answered,5233,6.966837,6.966837,Wales,1318,92273,1.428370,Wales,0.850490,Welsh/not specified
327,W06000021,Monmouthshire,5,Not answered,5423,6.931237,6.931237,Wales,1457,92955,1.567425,Wales,0.855804,Welsh/not specified
328,W06000022,Newport,5,Not answered,8721,6.844133,6.844133,Wales,10035,159590,6.287988,Wales,1.085068,Welsh/not specified
329,W06000023,Powys,5,Not answered,9832,8.736837,8.736837,Wales,2496,133173,1.874254,Wales,0.861874,Welsh/not specified


In [6]:
# List the distinct urban-rural categories; each one becomes a legend entry in the plot below.
df.Urb_Rur.unique()

array(['Predominantly Urban', 'Urban with Significant Rural',
       'Predominantly Rural', 'Not specified', 'Welsh/not specified'],
      dtype=object)

In [7]:
import pandas as pd
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, Select
from bokeh.plotting import figure, curdoc
from bokeh.palettes import Category10
from bokeh.transform import factor_cmap
from bokeh.models import ColorBar, BasicTicker, PrintfTickFormatter, LogColorMapper


output_notebook()

# Prepare data sources
# Cast to str so every Urb_Rur value is a plain string that can be used as a legend label.
df['Urb_Rur'] = df['Urb_Rur'].astype(str)
source = ColumnDataSource(df)


# Define tooltips (shared by all four figures)
tool = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("name", "@LA_name"),
]

# Plot 0 (Default, single colour)
p0 = figure(title = "Relationship between Non-response Rate and Non-English Speakers", x_axis_label = "Percentage of Non-English Speakers",
           y_axis_label = "Non-response rate", tooltips = tool)

p0.scatter("Percentage", "Non-response_rate", source=source, fill_alpha=0.5, size=10)

# Plot 1 (By Region)
p1 = figure(title="Relationship between Non-response Rate and Non-English Speakers",
            x_axis_label="Percentage of Non-English Speakers",
            y_axis_label="Non-response Rate",
            tooltips=tool)

# One renderer per region so each region gets its own colour and legend entry.
# zip() stops at the shorter input, so at most 10 regions (the palette size) are drawn.
# scatter(size=...) is used instead of circle(size=...), matching p0/p3; the latter
# form is deprecated in recent Bokeh releases.
for region, color in zip(df.region_x.unique(), Category10[10]):
    p1.scatter(x='Percentage', y='Non-response_rate', size=10, alpha=0.5, color=color,
               legend_label=region, muted_color=color, muted_alpha=0.1,
               source=ColumnDataSource(df[df.region_x == region]))

p1.legend.location = "bottom_right"
p1.legend.click_policy = "hide"
p1.legend.title = "Regions"

# Plot 2 (Urban vs Rural)
p_2 = figure(title="Relationship between Non-response Rate and Non-English Speakers",
            x_axis_label="Percentage of Non-English Speakers",
            y_axis_label="Non-response Rate",
            tooltips=tool)

# Keep one ColumnDataSource per urban-rural category, keyed by the category label.
urban_rural_sources = {
    urb_rur: ColumnDataSource(df[df.Urb_Rur == urb_rur])
    for urb_rur in df.Urb_Rur.unique()
}

for urb_rur, color in zip(df.Urb_Rur.unique(), Category10[10]):
    p_2.scatter(x='Percentage', y='Non-response_rate', size=10, alpha=0.5, color=color,
                legend_label=urb_rur, muted_color=color, muted_alpha=0.1,
                source=urban_rural_sources[urb_rur])

p_2.legend.location = "bottom_right"
p_2.legend.click_policy = "hide"
p_2.legend.title = "Urban-Rural"



# Plot 3 (Shannon Index)
# Colour scale spans the observed range of the Shannon index.
color_map = LogColorMapper(palette="Viridis256", low=df.Shannon_idx.min(), high=df.Shannon_idx.max())

# Axis labels follow the plotted columns: "Percentage" on x, "Non-response_rate" on y
# (they were previously swapped for this figure only).
p3 = figure(title="Relationship between Non-response Rate and Non-English Speakers",
            x_axis_label="Percentage of Non-English Speakers",
            y_axis_label="Non-response Rate",
            tooltips=tool)

p3.scatter("Percentage", "Non-response_rate", source=source, fill_alpha=0.5, size=10,
           color={'field': 'Shannon_idx', 'transform': color_map})

color_bar = ColorBar(color_mapper=color_map,
                     title='Shannon Index',
                     ticker=BasicTicker(desired_num_ticks=5),
                     formatter=PrintfTickFormatter(format='%.2f'))

p3.add_layout(color_bar, 'right')

# Dropdown menu
# The initial value must be one of the options; "Default" matches the figure (p0)
# that is visible at start-up.
dropdown = Select(title="Color By:", value="Default", options=["Default", "Region", "Urban", "Shannon Index"])

# Define the update function
def update_scatterplots(attr, old, new):
    """Show only the figure that matches the dropdown's current value.

    Has the (attr, old, new) signature of a Bokeh ``on_change`` callback; the
    three arguments are accepted but not used — the selection is read from
    ``dropdown.value``. A value that matches none of the four views leaves
    the visibility of every figure untouched.
    """
    views = {
        "Default": p0,
        "Region": p1,
        "Urban": p_2,
        "Shannon Index": p3,
    }
    chosen = dropdown.value
    if chosen not in views:
        return
    for label, fig in views.items():
        fig.visible = (label == chosen)

# Set initial visibility: start on the single-colour view, hide the coloured ones
p0.visible = True
p1.visible = p_2.visible = p3.visible = False

# Add the callback to the dropdown menu
# NOTE(review): on_change registers a Python callback, which only fires when a Bokeh
# server is running this document — confirm how this cell is meant to be run.
dropdown.on_change('value', update_scatterplots)

# Create a layout with the dropdown menu and the scatterplots
layout = column(dropdown, p0, p1, p_2, p3)

# Add the layout to the document
curdoc().add_root(layout)

In [16]:
# Read in the pre-processed data for religion.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.

rel = pd.read_csv('/Users/loucap/Documents/GitWork/InteractiveGender/Data/cleaned_religion_SO.csv')

In [17]:
# Display the religion table (pandas truncates the rows and columns it renders).
rel

Unnamed: 0,LA_code,LA_name,SO_code,SO_categories,Religion_code,Religion_categories,Observation,Group_Percentages_No religion,Total_LA_counts,No religion_%,...,Observation_Hindu,Group_Percentages_Hindu,Total_LA_counts_Hindu,Hindu_%,Religion_code_Sikh,Religion_categories_Sikh,Observation_Sikh,Group_Percentages_Sikh,Total_LA_counts_Sikh,Sikh_%
0,E06000001,Hartlepool,5,Not answered,1,No religion,1139,1.61,70898,38.76,...,23,0.03,70898,0.23,7,Sikh,15,0.02,70898,0.19
1,E06000002,Middlesbrough,5,Not answered,1,No religion,1751,1.63,107747,35.98,...,86,0.08,107747,1.20,7,Sikh,33,0.03,107747,0.41
2,E06000003,Redcar and Cleveland,5,Not answered,1,No religion,1824,1.71,106453,38.58,...,9,0.01,106453,0.10,7,Sikh,4,0.00,106453,0.04
3,E06000004,Stockton-on-Tees,5,Not answered,1,No religion,2249,1.50,150094,38.48,...,26,0.02,150094,0.41,7,Sikh,25,0.02,150094,0.40
4,E06000005,Darlington,5,Not answered,1,No religion,1256,1.50,83761,39.32,...,21,0.03,83761,0.41,7,Sikh,30,0.04,83761,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,W06000020,Torfaen,5,Not answered,1,No religion,1495,2.12,70598,50.32,...,12,0.02,70598,0.32,7,Sikh,7,0.01,70598,0.06
327,W06000021,Monmouthshire,5,Not answered,1,No religion,1458,1.98,73676,43.34,...,14,0.02,73676,0.22,7,Sikh,6,0.01,73676,0.11
328,W06000022,Newport,5,Not answered,1,No religion,2386,1.98,120275,43.65,...,42,0.03,120275,0.48,7,Sikh,11,0.01,120275,0.27
329,W06000023,Powys,5,Not answered,1,No religion,2541,2.43,104686,42.47,...,25,0.02,104686,0.23,7,Sikh,3,0.00,104686,0.05


In [18]:
# List every column name; the plot below reads the '<religion>_%' and
# 'Group_Percentages_<religion>' columns.
rel.columns

Index(['LA_code', 'LA_name', 'SO_code', 'SO_categories', 'Religion_code',
       'Religion_categories', 'Observation', 'Group_Percentages_No religion',
       'Total_LA_counts', 'No religion_%', 'Religion_code_Christian',
       'Religion_categories_Christian', 'Observation_Christian',
       'Group_Percentages_Christian', 'Total_LA_counts_Christian',
       'Christian_%', 'Religion_code_Muslim', 'Religion_categories_Muslim',
       'Observation_Muslim', 'Group_Percentages_Muslim',
       'Total_LA_counts_Muslim', 'Muslim_%', 'Religion_code_Other',
       'Religion_categories_Other', 'Observation_Other',
       'Group_Percentages_Other', 'Total_LA_counts_Other', 'Other_%',
       'Religion_code_Buddhist', 'Religion_categories_Buddhist',
       'Observation_Buddhist', 'Group_Percentages_Buddhist',
       'Total_LA_counts_Buddhist', 'Buddhist_%', 'Religion_code_Jewish',
       'Religion_categories_Jewish', 'Observation_Jewish',
       'Group_Percentages_Jewish', 'Total_LA_counts_Jewish',

In [19]:
# Read in the overall totals by religion.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.

totals = pd.read_csv('/Users/loucap/Documents/GitWork/InteractiveGender/Data/gen_totals_SO.csv')

In [20]:
# Preview the first five rows of the totals table.
totals.head()

Unnamed: 0,Religion_categories,Observation,Percent_of_survey_respondents
0,Buddhist,245514,0.54
1,Christian,23656564,51.78
2,Hindu,823132,1.8
3,Jewish,213617,0.47
4,Muslim,2664709,5.83


In [21]:
# Order religions from the largest to the smallest share of survey respondents.
totals = totals.sort_values("Percent_of_survey_respondents", ascending=False)

In [22]:
# Read in the non-response table by religion.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.

nr_totals = pd.read_csv('/Users/loucap/Documents/GitWork/InteractiveGender/Data/nr_totals_SO.csv')

nr_totals.head()

Unnamed: 0,Religion_categories,Observation,Non_response_rate,Contribution_to_overall_non_response_rate
0,Buddhist,18331,7.47,0.82
1,Christian,1054454,4.46,47.25
2,Hindu,61226,7.44,2.74
3,Jewish,18054,8.45,0.81
4,Muslim,217981,8.18,9.77


In [23]:
# Order religions from the largest to the smallest contribution to overall non-response.
nr_totals = nr_totals.sort_values("Contribution_to_overall_non_response_rate", ascending=False)

In [24]:
import pandas as pd
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Select, HTMLTemplateFormatter
from bokeh.models.widgets import DataTable, TableColumn, Div
from bokeh.plotting import figure, show, curdoc
from bokeh.io import output_notebook

# Cell template for the data tables: a cell is rendered bold, and additionally red
# when its row's Religion_categories equals the selected religion. The bare token
# `selected_religion` is a placeholder substituted by create_formatter().
template = """
<% if (Religion_categories == selected_religion) { %>
    <span style="color: red; font-weight: bold"><%= value %></span>
<% } else { %>
    <span style="font-weight: bold"><%= value %></span>
<% } %>
"""

def create_formatter(selected_religion):
    """Return an HTMLTemplateFormatter that highlights rows of ``selected_religion``.

    The placeholder ``selected_religion`` in ``template`` is replaced by the given
    name wrapped in single quotes, so the template compares each row's
    ``Religion_categories`` against that literal.
    """
    quoted_name = "'{}'".format(selected_religion)
    cell_template = template.replace("selected_religion", quoted_name)
    return HTMLTemplateFormatter(template=cell_template)

# Create DataTable for layout1 (overall totals by religion)
source1 = ColumnDataSource(totals)

# Each column gets its own formatter instance, initially highlighting 'Christian'.
columns1 = [
    TableColumn(field=field_name, title=header, formatter=create_formatter('Christian'))
    for field_name, header in [
        ("Religion_categories", "Religion"),
        ("Observation", "Observation"),
        ("Percent_of_survey_respondents", "% of respondents"),
    ]
]

heading1 = Div(text="<h1>Totals</h1>", width=300)

data_table1 = DataTable(
    source=source1,
    columns=columns1,
    editable=False,
    width=500,
    index_position=None,
)

layout1 = column(heading1, data_table1)

# Create DataTable for layout2 (non-response by religion)
source2 = ColumnDataSource(nr_totals)

columns2 = [
    TableColumn(field=field_name, title=header, formatter=create_formatter('Christian'))
    for field_name, header in [
        ("Religion_categories", "Religion"),
        ("Observation", "Observation"),
        ("Non_response_rate", "Non response rate"),
        ("Contribution_to_overall_non_response_rate", "% of total Non-response rate"),
    ]
]

heading2 = Div(text="<h1>Non-response rates</h1>", width=300)

data_table2 = DataTable(
    source=source2,
    columns=columns2,
    editable=False,
    width=700,
    index_position=None,
)

layout2 = column(heading2, data_table2)

# Scatter plot
output_notebook()

# Prepare data: the plot always reads these two helper columns, which start out as
# copies of the Christian columns (the default religion).
rel['selected_religion'] = rel['Christian_%']
rel['selected_percentages'] = rel['Group_Percentages_Christian']

source = ColumnDataSource(rel)

# Define tooltips
tool = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("name", "@LA_name"),
]

# Create figure
p4 = figure(
    title="Relationship between % of religious group in given LA, and their non-response rate",
    y_axis_label="Non-response Rate",
    x_axis_label="Percentage of religious group in given LA",
    tooltips=tool,
)

# Scatter plot: x = selected_religion, y = selected_percentages
p4.scatter("selected_religion", "selected_percentages", source=source, fill_alpha=0.5, size=10)

def update_highlighted_rows(selected_religion):
    """Point every column of both data tables at a formatter for ``selected_religion``.

    A single formatter is built and shared by all columns in ``columns1`` and
    ``columns2``; the column lists are then assigned back to their tables
    (presumably to prompt a re-render — TODO confirm this is needed).
    """
    shared_formatter = create_formatter(selected_religion)
    for table_columns in (columns1, columns2):
        for column_spec in table_columns:
            column_spec.formatter = shared_formatter
    data_table1.columns = columns1
    data_table2.columns = columns2

# Callback that switches the scatter plot and table highlighting to another religion
def update_plot(attr, old, new):
    """Re-point the plot at the religion currently chosen in ``select_religion``.

    Has the (attr, old, new) signature of a Bokeh ``on_change`` callback; the
    arguments are unused and the selection is read from ``select_religion.value``.
    Copies that religion's '<name>_%' and 'Group_Percentages_<name>' columns of
    ``rel`` into the two helper columns the plot reads, replaces ``source.data``
    with the refreshed frame, and updates the table highlighting.
    """
    chosen = select_religion.value
    share_column = f'{chosen}_%'
    rate_column = f'Group_Percentages_{chosen}'
    rel['selected_religion'] = rel[share_column]
    rel['selected_percentages'] = rel[rate_column]
    source.data = source.from_df(rel)
    update_highlighted_rows(chosen)

# Create select widget; the first option ('Christian') is the initial selection
options = [
    'Christian',
    'No religion',
    'Muslim',
    'Jewish',
    'Buddhist',
    'Hindu',
    'Sikh',
    'Other',
]
select_religion = Select(title="Religious Group:", value=options[0], options=options)
select_religion.on_change('value', update_plot)

# Initial update of the highlighted rows
update_highlighted_rows(select_religion.value)

# Layout: selector above the scatter plot, with the two tables side by side underneath
l = row(layout1, layout2)
layout = column(select_religion, p4)

# Add everything to the current document
curdoc().add_root(column(layout, l))