
## Data Preparation

In [None]:
# Standard packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Machine Learning tools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Connection string and output directory can be overridden through environment
# variables so secrets and machine-specific paths need not be edited into the notebook.
# The fallbacks are the original literals, so behaviour is unchanged when nothing is set.
# NOTE(review): the fallback URI still embeds a password; rotate it and remove the
# default once UK_DATA_URI is configured wherever this notebook runs.
uri = os.environ.get('UK_DATA_URI', 'mysql://uk-project:rchi2019@localhost/uk-data')
path = os.environ.get('UK_DATA_OUTPUT_DIR', 'C:/Users/jbutl20/Desktop/')
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Year suffix used in the rx_by_practice_<year> table name and in the exported CSV file name.
year = '2018'

In [None]:
# Three font-size tiers shared by every figure in the notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 8, 10, 12

# Global matplotlib defaults, grouped by rc section.
plt.rc('font', size=SMALL_SIZE)                                # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)    # axes title / x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                          # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                          # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                          # legend text
plt.rc('figure', titlesize=BIGGER_SIZE)                        # figure-level title

# Helper Class

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Each selected column is replaced by the integer codes produced by a fresh
    ``LabelEncoder``. The encoders are created inside ``transform`` and are not
    kept, so the mapping is re-learned on every call and cannot be inverted later.

    Parameters
    ----------
    columns : iterable of column labels, optional
        Columns to encode. ``None`` (the default) encodes every column.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Do nothing and return ``self``; present only to mirror the estimator API."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns a copy of X; the input frame is left untouched.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
            # same (label, Series) pairs.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X,y).transform(X)

# Import the data

In [None]:
# Prescription rows for the selected year, each joined to its `name` from bnf_code_9.
sql = "select a.name, b.* from rx_by_practice_{} b left join bnf_code_9 a on a.bnf_code_9=b.bnf_code_9".format(year)
df = pd.read_sql(sql=sql, con=uri)

# Descriptive attributes of every practice.
sql = "select org_code, name, nat_group, hlhg, post_code, practice_setting_id, num_practitioners from practice"
practice_df = pd.read_sql(sql=sql, con=uri)

# Practitioner head-count per practice on its own.
sql = "select org_code, num_practitioners from practice"
num_practitioners_df = pd.read_sql(sql=sql, con=uri)

In [None]:
# Summary statistics of the raw prescription table loaded from SQL.
df.describe()

In [None]:
# Peek at the first rows of the raw (long-format) prescription table.
df.head()

# Reshape Dataframe to Wide Format

In [None]:
# Reshape to one row per practice and one column per `name`, each cell holding the
# summed total_items. The string alias 'sum' replaces np.sum: pandas 2.1+ emits a
# FutureWarning for the callable and recommends the string to keep the same behaviour.
wide_df = df.pivot_table(index='practice', columns='name', values='total_items', aggfunc='sum')
# Keep a copy of the wide table on disk under the configured output directory.
wide_df.to_csv(os.path.join(path,r'rx_by_practice_wide.csv'))
wide_df.head()

In [None]:
# Column count, dtypes and memory footprint of the wide table.
wide_df.info()

In [None]:
# Attach each practice's prescription columns (wide_df index matched against
# practice_df.org_code, inner join) and replace every missing value with 0.
final_df = practice_df.join(wide_df, on='org_code', how='inner').fillna(0)
final_df.head()

In [None]:
# Persist the joined, zero-filled practice table; the file name carries the year.
final_df.to_csv(os.path.join(path, r'rx-by-practice-{}.csv'.format(year)))

# Encode Text Labels

In [None]:
# Replace the three text-valued practice attributes with integer codes;
# fit_transform returns a copy, so final_df itself is not modified here.
encoder = MultiColumnLabelEncoder(columns=['nat_group', 'hlhg', 'post_code'])
encoded_df = encoder.fit_transform(final_df)
encoded_df.head()

In [None]:
# Discard the first two columns of the encoded frame before modelling.
# presumably these are org_code and name (the leading columns selected from `practice`) — verify
numerical_df = encoded_df.drop(columns=encoded_df.columns[:2])
numerical_df.head()

In [None]:
# Preview of the Normalizer output on its own. Normalizer rescales each row (sample)
# to unit norm; it does not standardise columns. The resulting frame gets a fresh
# default index because only the column labels are carried over.
scaler = Normalizer()
scaled_df = pd.DataFrame(scaler.fit_transform(numerical_df), columns=numerical_df.columns)
scaled_df.head()

# Set up pipeline

In [None]:
# Three-stage pipeline: row-wise normalisation, PCA keeping enough components for
# 90% of the variance, then a t-SNE embedding as the final step.
pipe = Pipeline(steps=[
    ('scaler', Normalizer()),
    ('reducer', PCA(n_components=0.90)),
    ('tsne', TSNE(perplexity=95, learning_rate=70, random_state=17)),
])

# Execute the pipeline and display t-SNE graph

In [None]:
# Fit every stage on the numeric frame; `pc` is the output of the last step (t-SNE),
# i.e. embedding coordinates rather than raw principal components.
pc = pipe.fit_transform(numerical_df)

In [None]:
# Human-readable legend label derived from the practice setting id.
final_df['practice_setting'] = final_df['practice_setting_id'].map('Setting {}'.format)

In [None]:
# Store the two embedding coordinates next to the practice metadata.
final_df['PC 1'], final_df['PC 2'] = pc[:, 0], pc[:, 1]

# Scatter of the embedding, coloured by practice setting and sized by head-count.
# NOTE(review): both `size=` and `s=` are supplied — confirm which one takes effect.
fig, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(
    x='PC 1',
    y='PC 2',
    hue='practice_setting',
    size='num_practitioners',
    data=final_df,
    ax=ax,
    alpha=0.4,
    s=5,
    legend='brief',
)

In [None]:
# Hex-binned joint plot of the two embedding coordinates stored on final_df.
g = sns.jointplot(x="PC 1", y="PC 2", data=final_df, kind="hex", height=10)

In [None]:
# Per-component explained variance ratio of the fitted PCA ('reducer') step.
var = pipe.named_steps['reducer'].explained_variance_ratio_

In [None]:
# Cumulative share of variance captured by the first k retained PCA components.
# The axis labels now match what is drawn: the curve is the running total, not the
# per-component ratio, and "Principal" is spelled correctly.
plt.plot(var.cumsum())
plt.xlabel('Principal component index')
plt.ylabel('Cumulative explained variance ratio')
plt.show()

In [None]:
# Running total of the explained variance ratio across the retained components.
print(var.cumsum())

# K-Means clustering

In [None]:
# K-Means with a fixed seed on the numeric feature frame (not on the 2-D embedding).
cluster = KMeans(n_clusters=8, random_state=31)
# Fit once; the earlier version fitted the same model twice (fit, then fit_predict).
kmeans = cluster.fit(numerical_df)
# Assign the labels positionally as a plain array. Wrapping them in a DataFrame gave
# them a fresh 0..n-1 index, and pandas aligns on index during column assignment;
# final_df keeps practice_df's index after the inner join, so any gap in that index
# would attach labels to the wrong rows or leave them NaN.
final_df['label'] = kmeans.labels_

In [None]:
# PCA exposes no `value` attribute, so the previous expression raised AttributeError
# and stopped a full run. Show the number of components the fitted PCA retained.
# NOTE(review): this is a guess at the original intent — swap in another fitted
# attribute if something else was meant.
pipe.steps[1][1].n_components_

In [None]:
# Turn each cluster label into display text for the plot legend.
# Not idempotent: re-running this cell prefixes the text again.
final_df['label'] = final_df['label'].map("Cluster {}".format)
final_df.head()

In [None]:
# Embedding coordinates coloured by K-Means cluster, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(x='PC 1', y='PC 2', hue='label', data=final_df, ax=ax)