# UK Data Exploration

An initial exploration of the UK data will give us a better idea of how the data are structured and how we can work with them.

In [None]:
import sqlalchemy
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import os

# Database connection URI handed to pd.read_sql by the cells below.
# Set the UK_DATA_DB_URI environment variable to point at another server or to
# keep real credentials out of the notebook; when it is unset, the original
# local development URI is used unchanged.
uri = os.environ.get('UK_DATA_DB_URI', 'mysql://uk-project:rchi2019@localhost/uk-data')

### Fetching prescription data summary from the database

In [None]:
# Aggregate prescriptions per (bnf_code_9, period): sum items and quantity in a
# sub-query, attach the BNF name with a LEFT JOIN, and sort by total_items,
# largest first. The adjacent literals concatenate into one single-line query.
sql = (
    'SELECT a.*, bc.name FROM '
    '(SELECT SUM(rp.items) AS total_items, SUM(rp.quantity) AS total_quantity, '
    'rp.bnf_code_9, rp.period from rx_prescribed rp GROUP BY bnf_code_9, period) a '
    'LEFT JOIN bnf_code_9 bc ON a.bnf_code_9=bc.bnf_code_9 '
    'order by total_items desc'
)
df = pd.read_sql(sql=sql, con=uri)

In [None]:
# Load every row of the bnf_code_9 table; later cells join it on bnf_code_9
# and read its 'name' column for labels.
sql = 'select * from bnf_code_9'
bnf_code_df = pd.read_sql(sql=sql, con=uri)
bnf_code_df.head(n=5)

### Visually inspect the dataframe

In [None]:
# Show the column labels of the prescription summary frame.
df.columns

In [None]:
# Size of the summary frame as a (rows, columns) tuple.
df.shape

In [None]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to measure
# the real memory held by object columns rather than a shallow estimate.
df.info(memory_usage='deep')

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Preview the last five rows.
df.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Save the dataframe to CSV for faster data load

In [None]:
# Cache the aggregated summary on disk; index=False leaves the row index out of the file.
# NOTE(review): the destination is an absolute path on one user's Windows Desktop;
# adjust it before running on another machine.
df.to_csv('C:/Users/jbutl20/Desktop/rx-summary.csv', index=False)

In [None]:
# Shortcut: reload the summary from the CSV written by the to_csv cell instead of
# re-running the long SQL query above. If the query has been changed, run the
# query and the to_csv cell once more first so the file reflects the new result.

df = pd.read_csv('C:/Users/jbutl20/Desktop/rx-summary.csv')
df.head()

In [None]:
# Keep the first 150 rows of the summary (the SQL query orders it by
# total_items, largest first).
# .iloc is end-exclusive, whereas the label slice .loc[:150] also includes
# label 150 and therefore returned 151 rows. .copy() detaches the result from
# df so later column assignments neither warn nor risk writing back into df.
top_150 = df.iloc[:150].copy()

In [None]:
# Replace total_items with its base-10 logarithm.
# assign() builds a new frame instead of writing a column into what may be a
# slice of df, which avoids pandas' SettingWithCopyWarning; np.log10 is applied
# to the whole column at once.
# NOTE: re-running this cell takes the logarithm again; rebuild top_150 first.
top_150 = top_150.assign(total_items=np.log10(top_150['total_items']))
top_150.head(10)

In [None]:
# Reshape long -> wide: one row per bnf_code_9, one column per period,
# each cell holding that code's total_items for the period.
wide_df = df.pivot(values='total_items', index='bnf_code_9', columns='period')

# Same table with the bnf_code_9 lookup columns attached, matched on the code.
code_lookup = bnf_code_df.set_index('bnf_code_9')
wide_df_labeled = wide_df.join(code_lookup, on='bnf_code_9')
wide_df.head()

In [None]:
# Mean-normalise every period column: subtract the column mean, then divide by
# the column's range (max - min).
period_mean = wide_df.mean()
period_range = wide_df.max() - wide_df.min()
normalize_df = (wide_df - period_mean) / period_range
normalize_df.head()

In [None]:
# Log-scale the per-period item counts while keeping the label columns.
# np.log10 cannot be applied to the whole labelled frame: the columns joined in
# from bnf_code_df (e.g. 'name', presumably text) are not numeric and make the
# ufunc raise. Only the period columns produced by the pivot are transformed;
# the label columns are carried over untouched.
period_cols = list(wide_df.columns)
log_df = wide_df_labeled.copy()
log_df[period_cols] = np.log10(wide_df_labeled[period_cols])
log_df.head()

In [None]:
# Scratch check: np.log is the natural logarithm, so this prints ln(10)
# (about 2.3026), not log10(10) == 1.
print(np.log(10))

In [None]:
# Attach the bnf_code_9 lookup columns to the pivoted frame so each heatmap row
# can be labelled with its 'name'.
labeled_df = wide_df.join(bnf_code_df.set_index('bnf_code_9'), on='bnf_code_9')
tick_labels = labeled_df['name']

# Draw the first 200 rows on a tall, narrow canvas (8 x 50 inches).
# seaborn's documented keyword for the cell border width is `linewidths`;
# the previous `linewidth` was forwarded to matplotlib alongside seaborn's own
# default `linewidths=0`, leaving the border width ambiguous.
# NOTE(review): 'tab20c' is a qualitative palette; a sequential colormap may
# represent magnitudes more faithfully. Left unchanged here.
plt.figure(figsize=(8,50))
sns.heatmap(wide_df[:200], cmap='tab20c', linecolor='black', linewidths=0.3, yticklabels=tick_labels[:200])