In [None]:
import sqlalchemy
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import folium
import os
import squarify

# Connection string and output directory can be overridden through the
# environment (UK_DATA_DB_URI, UK_DATA_OUTPUT_DIR) so the notebook can run on
# another machine without editing code. The fallbacks keep the original values.
# NOTE(review): the fallback URI still embeds a password; remove it from the
# notebook (and rotate it) once UK_DATA_DB_URI is set wherever this runs.
uri = os.environ.get('UK_DATA_DB_URI', 'mysql://uk-project:rchi2019@localhost/uk-data')
path = os.environ.get('UK_DATA_OUTPUT_DIR', 'C:/Users/jbutl20/Desktop/')

# Tick labels used on the x axis of the monthly heatmaps.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [None]:
# Three font-size tiers shared by every figure in the notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 8, 10, 12

# (rc group, keyword settings) pairs, applied in order through plt.rc.
_font_rc_settings = [
    ('font', {'size': SMALL_SIZE}),           # default text size
    ('axes', {'titlesize': SMALL_SIZE}),      # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),     # x and y axis labels
    ('xtick', {'labelsize': SMALL_SIZE}),     # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),     # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),     # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),   # figure-level title
]
for _rc_group, _rc_kwargs in _font_rc_settings:
    plt.rc(_rc_group, **_rc_kwargs)

# Helper Functions

In [None]:
 def treemap(df, maxnum):
     """Draw a squarify treemap of the first ``maxnum`` rows of ``df``.

     Parameters
     ----------
     df : pandas.DataFrame
         Frame with a ``'Total'`` column that sets each tile's area. The index
         value of each row is used as the tile name. Rows are taken in their
         existing order, so sort the frame before calling.
     maxnum : int or None
         Number of leading rows to draw (plain slice semantics, ``df[:maxnum]``).

     Returns
     -------
     None
         The figure is shown with ``plt.show()``.
     """
     # Slice once so labels, sizes and colours all describe the same rows.
     # Labels used to be built from the whole frame, which also meant the
     # colour ramp was sized from every row rather than the rows drawn.
     top = df.iloc[:maxnum]
     if top.empty:
         # Nothing to lay out; squarify cannot normalise an empty size list.
         return

     sizes = top['Total'].tolist()
     labels = [
         str(name) + "\n (" + '{0:,.2f}'.format(total) + ")"
         for name, total in zip(top.index, sizes)
     ]
     # Spread the Spectral colormap evenly across the tiles actually drawn.
     colors = [plt.cm.Spectral(i / float(len(labels))) for i in range(len(labels))]

     # Draw Plot
     plt.figure(figsize=(15, 10), dpi=80)
     squarify.plot(sizes=sizes, label=labels, color=colors, alpha=.8)

     # Decorate
     plt.axis('off')
     plt.show()

# Load data

In [None]:
# One query per source table/view; each result is loaded into its own frame.
top10_query = "select * from top_10_by_practice"
bnf_query = "select * from bnf_code_9"
# Monthly totals, with the name looked up from bnf_code_9 via a left join.
monthly_query = "select a.name, b.* from total_rx_by_month b left join bnf_code_9 a on a.bnf_code_9=b.bnf_code_9"

df = pd.read_sql(top10_query, uri)
bnf_code_df = pd.read_sql(bnf_query, uri)
total_rx_df = pd.read_sql(monthly_query, uri)

In [None]:
# Index the BNF lookup by its code, then attach its columns to each row of df
# whose 'bnf_code_9' matches.
bnf_lookup = bnf_code_df.set_index('bnf_code_9')
df_labeled = df.join(bnf_lookup, on='bnf_code_9')
df_labeled.head()

### Save dataframe to CSV

In [None]:
# Write both frames under `path`, without the row index.
for _frame, _csv_name in (
    (df_labeled, r'top-10-rx.csv'),
    (total_rx_df, r'total-rx-by-month.csv'),
):
    _frame.to_csv(os.path.join(path, _csv_name), index=False)

# Visually inspect the two dataframes

## Top 10 Drugs prescribed by practice

In [None]:
# Column dtypes and non-null counts for the frame loaded from top_10_by_practice.
df.info()

In [None]:
# Summary statistics for the numeric columns of the top_10_by_practice frame.
df.describe()

In [None]:
# Summary (count, unique, top, freq) for the non-numeric columns only.
df.describe(exclude='number')

## Monthly Total by Drug

In [None]:
# Column dtypes and non-null counts for the monthly totals frame.
total_rx_df.info()

In [None]:
# Summary statistics for the numeric columns of the monthly totals frame.
total_rx_df.describe()

In [None]:
# Summary (count, unique, top, freq) for the non-numeric columns only.
total_rx_df.describe(exclude='number')

### Top 20 Most Prescribed Drugs per Practice (Total Items / Number of Practices)

In [None]:
# The 20 rows with the highest items_per_practice, largest first.
# NOTE(review): rows are presumably one per drug and period, so the same drug
# may appear more than once — confirm against the total_rx_by_month schema.
total_rx_df.sort_values(by='items_per_practice', ascending=False).head(20)

### Reshape monthly total rx prescribed to wide data format

In [None]:
# Wide table of num_practice: one row per drug name, one column per period,
# plus 'Total' margins holding the sums along each axis.
total_rx_prac_wide_df = total_rx_df.pivot_table(
    index='name',
    columns='period',
    values='num_practice',
    aggfunc='sum',  # string alias; passing np.sum is deprecated in recent pandas
    margins=True,
    margins_name='Total',
)
# Written to its own file: 'total_rx_by_month_wide.csv' is also the target of
# the total-items table and would overwrite this one.
total_rx_prac_wide_df.to_csv(os.path.join(path, r'total_rx_prac_by_month_wide.csv'))
total_rx_prac_wide_df.head(10)

In [None]:
# Wide table of items_per_practice: one row per drug name, one column per
# period. The margins use the same aggregation, so the 'Total' row/column here
# holds means, not sums.
total_rx_ipp_wide_df = total_rx_df.pivot_table(
    index='name',
    columns='period',
    values='items_per_practice',
    aggfunc='mean',  # string alias; passing np.mean is deprecated in recent pandas
    margins=True,
    margins_name='Total',
)
# Written to its own file: 'total_rx_by_month_wide.csv' is also the target of
# the total-items table and would overwrite this one.
total_rx_ipp_wide_df.to_csv(os.path.join(path, r'total_rx_ipp_by_month_wide.csv'))
total_rx_ipp_wide_df.head(10)

In [None]:
# Wide table of total_items: one row per drug name, one column per period,
# plus 'Total' margins holding the sums along each axis.
total_rx_items_wide_df = total_rx_df.pivot_table(
    index='name',
    columns='period',
    values='total_items',
    aggfunc='sum',  # string alias; passing np.sum is deprecated in recent pandas
    margins=True,
    margins_name='Total',
)
total_rx_items_wide_df.to_csv(os.path.join(path, r'total_rx_by_month_wide.csv'))
total_rx_items_wide_df.head(10)

### Filter the above to keep the top 100 drugs

In [None]:
# For each wide table: remove the 'Total' margin row, order the drugs by their
# 'Total' column (largest first) and keep the first 100.
top100_rx_items = (
    total_rx_items_wide_df
    .drop(index=['Total'])
    .sort_values(by='Total', ascending=False)
    .head(100)
)

top100_rx_ipp = (
    total_rx_ipp_wide_df
    .drop(index=['Total'])
    .sort_values(by='Total', ascending=False)
    .head(100)
)

top100_rx_prac = (
    total_rx_prac_wide_df
    .drop(index=['Total'])
    .sort_values(by='Total', ascending=False)
    .head(100)
)

# Treemap

### Top 30 Most Prescribed Drugs

In [None]:
# top100_rx_items is sorted by 'Total' (largest first), so the first 30 rows
# are the 30 drugs with the most items.
treemap(top100_rx_items, 30)

### Top 30 Drugs by Average Items Prescribed per Practice

In [None]:
# 'Total' in top100_rx_ipp is the pivot's mean margin, so tiles are sized by
# average items per practice; the frame is already sorted largest first.
treemap(top100_rx_ipp, 30)

# Heatmap

### Heatmap of Top 100 Drugs Prescribed during 2017-2018

In [None]:
# 20 evenly spaced samples of the Spectral colormap, used as a discrete palette
# (this name is reused by the heatmap cells that follow).
colors = [plt.cm.Spectral(step / 20.0) for step in range(20)]

# Per-period values only (margin column removed), rows ordered by drug name.
items_by_period = top100_rx_items.drop(columns=['Total']).sort_values(by='name')

plt.figure(figsize=(8, 25))
sns.heatmap(
    items_by_period,
    cmap=colors,
    linecolor='black',
    linewidth=0.3,
    xticklabels=month_labels,
)

### Heatmap of Top 100 Drugs Prescribed per Practice (total_items / num_of_practice)

In [None]:
# Per-period items-per-practice values (margin column removed), rows ordered
# by drug name. `colors` is the Spectral palette built in the first heatmap cell.
ipp_by_period = top100_rx_ipp.drop(columns=['Total']).sort_values(by='name')

plt.figure(figsize=(8, 25))
sns.heatmap(
    ipp_by_period,
    cmap=colors,
    linecolor='black',
    linewidth=0.3,
    xticklabels=month_labels,
)

### Heatmap of Number of Prescribing Practices for the Top 100 Drugs

In [None]:
# Per-period practice counts (margin column removed), rows ordered by drug
# name. `colors` is the Spectral palette built in the first heatmap cell.
practices_by_period = top100_rx_prac.drop(columns=['Total']).sort_values(by='name')

plt.figure(figsize=(8, 25))
sns.heatmap(
    practices_by_period,
    cmap=colors,
    linecolor='black',
    linewidth=0.3,
    xticklabels=month_labels,
)

## Histogram of Most Common Top 10 Prescribed Drugs

X-axis represents practice rank, Y-axis represents frequency.

In [None]:
# Keep only rows with more than 100 items. To plot every row instead, use
# df_labeled in place of sliced_df below.
sliced_df = df_labeled[df_labeled.total_items > 100]

# Half-integer edges give each rank its own bin. Integer edges 1..10 produce
# only nine bins, and because the last histogram bin is closed on both sides,
# ranks 9 and 10 were counted together.
# NOTE(review): assumes practice_rank holds integer ranks 1-10 — confirm.
rank_bin_edges = np.arange(0.5, 11.0, 1.0)

# One small histogram of practice_rank per drug name. The trailing semicolon
# suppresses the echoed array of Axes.
sliced_df[['practice_rank','name']].hist(by=sliced_df['name'], figsize=(20,160), rot=30, layout=(82,8), sharex=True, bins=rank_bin_edges);

### Choropleth

In [None]:
# Load uk_ccg.geojson (path relative to the working directory) and preview the
# first three rows.
ccg_gdf = gpd.read_file('uk_ccg.geojson')
ccg_gdf.head(3)

In [None]:
# Load new_uk_ccg_2018.geojson (path relative to the working directory) and
# preview the first rows.
new_gdf = gpd.read_file('new_uk_ccg_2018.geojson')
new_gdf.head()

In [None]:
# Column dtypes and non-null counts for the frame read from new_uk_ccg_2018.geojson.
new_gdf.info()

In [None]:
# Static plot of the geometries in new_gdf on a single large axes, with each
# shape outlined in black.
fig, ax = plt.subplots(figsize=(20,30))
new_gdf.plot(ax=ax, edgecolor='black')

In [None]:
# [latitude, longitude] on which the interactive map is initially centred.
uk_centroid = [53.8060835, -1.6057716]

# Base map, with the GeoJSON file added as a layer named 'geojson'.
m = folium.Map(location=uk_centroid, zoom_start=6)
ccg_layer = folium.GeoJson('new_uk_ccg_2018.geojson', name='geojson')
ccg_layer.add_to(m)

# Last expression of the cell: renders the map inline.
m