### LSE Data Analytics Online Career Accelerator

# Course 2: Data Analytics using Python


# 

# Assignment activity 1

In [None]:
# My GitHub repository.
# https://github.com/Tamagh

# Assignment activity 2

In [None]:
# Import the Pandas and DateTime libraries.
import pandas as pd
import datetime
import seaborn as sns

# Load the data set.
ad = pd.read_csv('actual_duration.csv')
ar = pd.read_csv('appointments_regional.csv')
nc = pd.read_excel('national_categories.xlsx')

In [None]:
# Set up matplotlib.
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams["font.size"] = 18
plt.plot([1,2,3], label = 'label')
plt.xlabel('xaxis')
plt.ylabel('yaxis')
plt.legend()
plt.xticks()
plt.yticks()
plt.title('PLOT')
plt.gcf().set_tight_layout(True) # To prevent the xlabel being cut off
plt.show()

### 1. Explore the actual duration data.

In [None]:
# View and validate the 'ad' dataframe.
print(ad.shape)
print(ad.dtypes)
ad.head()

In [None]:
#View descriptive statistics
ad.describe()

In [None]:
# Check for missing data in all files.
# Keep only the rows that contain at least one missing value.
ad_na = ad.loc[ad.isna().any(axis=1)]
ar_na = ar.loc[ar.isna().any(axis=1)]
nc_na = nc.loc[nc.isna().any(axis=1)]

# A row count of zero means the DataFrame has no missing values.
for rows_with_na in (ad_na, ar_na, nc_na):
    print(rows_with_na.shape)

In [None]:
# Find out the number of locations based on the 'ad' dataframe.
ad_locations = ad['sub_icb_location_name'].value_counts()

print(ad_locations.shape)

In [None]:
# Find out the number of sub location codes.
print(ad['sub_icb_location_code'].value_counts())

In [None]:
# Find out the number of ons locations.
ad_locations_ons = ad['icb_ons_code'].value_counts()
print(ad_locations_ons.shape)

In [None]:
# Use value_counts() to see which locations have the highest and the lowest number of records.
print(ad['sub_icb_location_name'].value_counts())

In [None]:
# Check the number of records.
ad['count_of_appointments'].count()

In [None]:
## Instead of working with the number of records, explore number of appointments.
# Calculate the number of appointments.
ad['count_of_appointments'].sum()

In [None]:
print(ad['count_of_appointments'].min())
print(ad['count_of_appointments'].max())

In [None]:
# Which location had the highest number of appointments?
# Total the appointments per sub-ICB ONS code, largest first.
group_sub_location = (
    ad.groupby('sub_icb_location_ons_code')[['count_of_appointments']]
    .sum()
    .sort_values(by='count_of_appointments', ascending=False)
)
group_sub_location

In [None]:
# List the top 10 location names by appointments.
group_sub_location_name = ad.groupby('sub_icb_location_name')[['count_of_appointments']].agg('sum')
group_sub_location_name.sort_values(by=['count_of_appointments'], inplace=True, ascending=False)
# Keep the full ranking in the variable (it is described and plotted further
# down), but display only the ten busiest locations as the heading promises.
group_sub_location_name.head(10)

In [None]:
# Analyse locations.
group_sub_location_name.describe()

In [None]:
# Show distribution of locations.
sns.displot(data=group_sub_location_name, x='count_of_appointments')
plt.savefig('location_distribution.png')

In [None]:
# Group the data based on duration.
print(ad['actual_duration'].value_counts())

In [None]:
# Sort actual duration by the number of appointments.
# Total the appointments per duration category, then order largest first.
group_actual_duration = (
    ad.groupby('actual_duration')[['count_of_appointments']]
    .sum()
    .reset_index()
    .sort_values(by='count_of_appointments', ascending=False)
)
group_actual_duration

In [None]:
# What share (as a fraction of 1) of all appointments has an unknown duration?
# Derive both figures from the grouped data rather than hard-coding them, so
# the result cannot drift from the loaded file.
# NOTE(review): assumes the unknown category label contains 'Unknown' - confirm
# against the categories listed in group_actual_duration.
is_unknown = group_actual_duration['actual_duration'].str.contains('Unknown', na=False)
unknown_appointments = group_actual_duration.loc[is_unknown, 'count_of_appointments'].sum()
print(unknown_appointments / group_actual_duration['count_of_appointments'].sum())

In [None]:
# Analyse the dataframe.
group_actual_duration.info()

In [None]:
# Show count of appointments in actual duration categories.
sns.barplot(y='actual_duration',
            x='count_of_appointments',
            data=group_actual_duration,
            palette='light:b_r')
plt.savefig('aduration_categories2.png')

In [None]:
# Change the datatype for the appointment date column.
ad['appointment_date'] = pd.to_datetime(ad['appointment_date']).dt.date

In [None]:
# Find the earliest and latest records.
print(ad['appointment_date'].min())
print(ad['appointment_date'].max())

In [None]:
# Find the highest and lowest number of records by date.
print(ad['appointment_date'].value_counts())

### 2. Explore the appointment regional data.

In [None]:
# View and validate the ar dataframe.
print(ar.shape)
print(ar.dtypes)
ar.head()

In [None]:
# Find out the number of ons locations.
ar_locations_ons = ar['icb_ons_code'].value_counts()

# value_counts() has one entry per distinct code, so the shape of the ar
# result is the number of ons locations in the ar DataFrame.
print(ar_locations_ons.shape)

In [None]:
# Calculate the number of appointments
ar['count_of_appointments'].sum()

In [None]:
# Sense check the data.
ar.describe()

In [None]:
# Sort the dataframe by count of appointments.
sorted_ar = ar.sort_values(by=['count_of_appointments'], ascending=False)
sorted_ar

In [None]:
# Group records by month.
ar_month = (ar['appointment_month'].value_counts(sort=False))
ar_month

In [None]:
# Group the data based on appointment status.
ar_status = (ar['appointment_status'].value_counts())
ar_status

In [None]:
# Count appointment status types.
print('Number of appointment statuses:', (ar_status.count()))

In [None]:
# Count the number of records per appointment status within each month.
ar_monthly = ar.groupby('appointment_month')[['appointment_status']].agg('value_counts')
ar_monthly[0:20]

In [None]:
# Total the appointments for every month and appointment status combination.
ar_monthly_count = (
    ar.groupby(['appointment_month', 'appointment_status'])[['count_of_appointments']]
    .sum()
    .reset_index()
)
ar_monthly_count

In [None]:
# Create a lineplot to display appointment status over time.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             data=ar_monthly_count,
             hue='appointment_status')

In [None]:
# Pivot the dataframe to show values of appointment status monthly.
ar_pivot = pd.pivot_table(ar_monthly_count, 
                          values = 'count_of_appointments', 
                          index='appointment_month', 
                          columns = 'appointment_status')
ar_pivot

In [None]:
# Calculate total monthly and attendance ratio columns.
ar_pivot['total_appointments'] = ar_pivot['Attended'] + ar_pivot['DNA'] + ar_pivot['Unknown']
ar_pivot['attendance_ratio'] = ar_pivot['Attended']/ar_pivot['total_appointments']
ar_pivot

In [None]:
# Sense check pivot data.
ar_monthly_apps = ar.groupby('appointment_month')[['count_of_appointments']].agg('sum')
# Add the grand total while the month is still the index: summing after
# reset_index() would also "sum" the month strings by concatenating them.
ar_monthly_apps.loc['total'] = ar_monthly_apps['count_of_appointments'].sum()
ar_monthly_apps.reset_index(inplace=True)
ar_monthly_apps

In [None]:
# Plot the attendance ratio (left axis) against total appointments (right axis).
ax = sns.lineplot(x='appointment_month', y='attendance_ratio', data=ar_pivot,
                  color='orange', label='attendance')
# Draw the totals on the twin axis explicitly instead of relying on whichever
# axes matplotlib currently treats as the active one.
ax2 = ax.twinx()
sns.lineplot(x='appointment_month', y='total_appointments', data=ar_pivot,
             color='b', label='appointments', ax=ax2)
ax.set_title('')
ax.set(ylabel='Attendance')
ax.set(xlabel='')
# Label only every fifth month to keep the x axis readable.
ax.set_xticks([0, 5, 10, 15, 20, 25, 29])
ax.set_xticks([1, 2, 3, 4, 6, 7, 8, 9], minor=True)
# Each axis carries its own legend, so place them apart.
ax.legend(loc='upper right', fontsize='small')
ax2.legend(loc='lower right', fontsize='small')
plt.savefig('attendance.png')

In [None]:
# Check the appointment numbers alone.
ax2 = sns.lineplot(x='appointment_month', y='total_appointments', data=ar_pivot, color='b', label='appointments')

In [None]:
# Group the data based on hcp type.
ar_hcp = (ar['hcp_type'].value_counts())
ar_hcp

In [None]:
# Group the data by month, then hcp type.
ar_monthly_hcp = ar.groupby(['appointment_month', 'hcp_type'])[['count_of_appointments']].agg('sum')
ar_monthly_hcp.reset_index(inplace=True)
ar_monthly_hcp

In [None]:
# Create a lineplot to display appointments by hcp type over time.
sns.lineplot(x='appointment_month',
             y='count_of_appointments', 
             data=ar_monthly_hcp, 
             hue='hcp_type')
plt.legend(bbox_to_anchor = (1.25, 0.6), loc='center right', fontsize='small')

In [None]:
# Group the data based on appointment mode.
ar_mode = (ar['appointment_mode'].value_counts())
ar_mode

In [None]:
# Group the data by month, then appointment mode.
ar_monthly_amod = ar.groupby(['appointment_month', 'appointment_mode'])[['count_of_appointments']].agg('sum')
ar_monthly_amod.reset_index(inplace=True)
ar_monthly_amod

In [None]:
# Create a lineplot to display appointment mode over time.
ax=sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             data=ar_monthly_amod,
             hue='appointment_mode')
ax.set_title('')
ax.set(ylabel='count_of_appointments')
ax.set(xlabel='')
ax.set_xticks([0, 5, 10, 15, 20, 25, 29])
ax.set_xticks([1, 2, 3, 4, 6, 7, 8, 9], minor=True)
plt.legend(bbox_to_anchor = (1.25, 0.6), loc='center right', fontsize='small')
plt.savefig('monthly_appointment_mode.png')

In [None]:
# Check the time between booking and appointment category.
ar_book = (ar['time_between_book_and_appointment'].value_counts())
ar_book

In [None]:
# Group the data by month, then time between booking and appointment.
ar_monthly_book = ar.groupby(['appointment_month', 'time_between_book_and_appointment'])[['count_of_appointments']].agg('sum')
ar_monthly_book.reset_index(inplace=True)
ar_monthly_book

In [None]:
# Create a lineplot to display time between booking and appointment.
ax=sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             data=ar_monthly_book,
             hue='time_between_book_and_appointment')
ax.set_title('')
ax.set(ylabel='count_of_appointments')
ax.set(xlabel='')
ax.set_xticks([0, 5, 10, 15, 20, 25, 29])
ax.set_xticks([1, 2, 3, 4, 6, 7, 8, 9], minor=True)
plt.legend(bbox_to_anchor = (1.25, 0.6), loc='center right', fontsize='small')
plt.savefig('monthly_booking.png')

### 3. Explore the national categories data.

In [None]:
# view and validate the dataframe.
print(nc.shape)
print(nc.dtypes)
nc.head()

In [None]:
# Find out the number of ons locations.
nc_locations_ons = nc['icb_ons_code'].value_counts()
print(nc_locations_ons.shape)

In [None]:
# Find and print out the number of locations based on sub location name.
nc_locations_sub = nc['sub_icb_location_name'].value_counts()

print("Count of locations: ")
print(nc_locations_sub.shape)

In [None]:
# Calculate the number of appointments
nc['count_of_appointments'].sum()

In [None]:
# Sense check the data.
nc.describe()

In [None]:
# Find out which five locations have the highest number of records.
print(nc['sub_icb_location_name'].value_counts())

In [None]:
# Find out which location has the highest number of appointments.
nc_group_sub_location_name = nc.groupby('sub_icb_location_name')[['count_of_appointments']].agg('sum')
nc_group_sub_location_name.sort_values(by=['count_of_appointments'], inplace=True, ascending=False)
nc_group_sub_location_name[0:106]

In [None]:
# Compare distribution to the one plotted for the ad dataframe.
sns.displot(data=nc_group_sub_location_name, x='count_of_appointments')

In [None]:
# Find the highest and lowest number of records by date.
print(nc['appointment_date'].value_counts())

In [None]:
# Change the datatype for the appointment date column.
nc['appointment_date'] = pd.to_datetime(nc['appointment_date']).dt.date

In [None]:
# Find the earliest and latest records.
print(nc['appointment_date'].min())
print(nc['appointment_date'].max())

In [None]:
# Group the data based on appointment month.
nc_month = (nc['appointment_month'].value_counts(sort=False))
nc_month

In [None]:
# Group monthly using the number of appointments.
nc_monthly = nc.groupby('appointment_month')[['count_of_appointments']].agg('sum')
nc_monthly.reset_index(inplace=True)
nc_monthly

In [None]:
# Create a visual of the monthly data in the nc dataframe.
sns.lineplot(x='appointment_month', y='count_of_appointments', data=nc_monthly)

In [None]:
# Group the data based on service setting.
nc_service = (nc['service_setting'].value_counts(sort=False))
nc_service

In [None]:
# Count service settings.
print('Number of service settings:', (nc_service.count()))

In [None]:
# Group the data based on context type.
nc_context = (nc['context_type'].value_counts(sort=False))
nc_context

In [None]:
# Count context types.
print('Number of context types:', (nc_context.count()))

In [None]:
# Group the data based on national category.
nc_category = (nc['national_category'].value_counts(sort=False))

In [None]:
# Count national category types.
print('Number of national categories:', (nc_category.count()))

In [None]:
# Group the data based on national category, sorted by number of records.
nc_category = (nc['national_category'].value_counts(sort=True))
nc_category

In [None]:
nc_category_apps = nc.groupby('national_category')[['count_of_appointments']].agg('sum')
nc_category_apps.reset_index(inplace=True)
nc_category_apps.sort_values(('count_of_appointments'), ascending=False)

In [None]:
ax = sns.barplot(x='count_of_appointments',
                 y='national_category',
                 data=nc_category_apps,
                 order=nc_category_apps.sort_values('count_of_appointments',
                                                    ascending=False).national_category, 
                 palette='Dark2')
ax.set_title('Appointment numbers by national category')
ax.set(xlabel='count_of_appointments')
ax.set(ylabel='')
plt.savefig('nc_categories_appointments.png')

# Assignment activity 3

In [None]:
# Check datatypes in columns.
print(ar.dtypes)

In [None]:
# Find the earliest and latest months covered by ar.
print('The ar DataFrame covers appointments from', 
      (ar['appointment_month'].min()), 
      'to', (ar['appointment_month'].max()))

In [None]:
# Find the earliest and latest records for ad, already done in activity 2.
print('The ad DataFrame covers appointments from', 
      (ad['appointment_date'].min()), 
      'to', (ad['appointment_date'].max()))

In [None]:
# Find the earliest and latest records for nc, already done in activity 2.
print('The nc DataFrame covers appointments from', 
      (nc['appointment_date'].min()), 
      'to', (nc['appointment_date'].max()))

In [None]:
# Create a subset to examine one location in a certain period.
nc_subset = nc[nc['sub_icb_location_name'] == 'NHS North West London ICB - W2U3Z']
nc_subset

In [None]:
# Filter the date values.
nc_sub_filtered = nc_subset[(nc_subset['appointment_date']>=datetime.date(2022,1,1))
                            & (nc_subset['appointment_date']<=datetime.date(2022,6,1))] 
nc_sub_filtered

In [None]:
# Group by service setting.
# Aggregate only the appointment counts: summing every column would also try
# to add up the text and date columns of the subset.
nc_final = nc_sub_filtered.groupby('service_setting')[['count_of_appointments']].agg('sum')
nc_final.reset_index(inplace=True)
nc_final.sort_values(by='count_of_appointments', ascending=False)

In [None]:
# Sense check the grouped data with descriptive statistics.
nc_final.describe()

In [None]:
# Compare weight of Unmapped service settings to related visuals.
sns.barplot(x='service_setting', y='count_of_appointments', data=nc_final)

In [None]:
# Repeat the grouping by context type.
# Aggregate only the appointment counts: summing every column would also try
# to add up the text and date columns of the subset.
nc_final2 = nc_sub_filtered.groupby('context_type')[['count_of_appointments']].agg('sum')
# Append a grand-total row below the per-type sums.
nc_final2.loc['total'] = nc_final2.sum()
nc_final2.sort_values(by='context_type')

In [None]:
# Repeat the grouping by national category.
# Aggregate only the appointment counts: summing every column would also try
# to add up the text and date columns of the subset.
nc_final3 = nc_sub_filtered.groupby('national_category')[['count_of_appointments']].agg('sum')
nc_final3.sort_values(by='count_of_appointments', ascending=False)

In [None]:
## Leaving the local subset and using the nc dataset again.
# Number of appointments per day == sum of count_of_appointments by date.
nc_daily_apps = nc.groupby('appointment_date')[['count_of_appointments']].agg('sum')
nc_daily_apps.sort_values(by='count_of_appointments', ascending=False)

In [None]:
# Check datatypes in columns.
print(nc.dtypes)

In [None]:
# Change the datatype for the appointment date column.
nc['appointment_date'] = pd.to_datetime(nc['appointment_date'])

print(nc.dtypes)

In [None]:
# Calculate the number of appointments per month.
# Sum only the appointment counts: the remaining columns are text or dates and
# cannot be meaningfully summed. The two grouping keys are renamed so the
# index levels are distinguishable instead of both being 'appointment_date'.
nc_monthly_apps = nc.groupby([nc['appointment_date'].dt.year.rename('year'),
                              nc['appointment_date'].dt.month.rename('month')])[['count_of_appointments']].agg('sum')
# Append a grand-total row below the monthly sums.
nc_monthly_apps.loc['total'] = nc_monthly_apps.sum()
nc_monthly_apps

In [None]:
#Calculate the number of records per month.
nc_monthly_apps = nc.groupby([nc['appointment_date'].dt.year, nc['appointment_date'].dt.month]).agg('count')
nc_monthly_apps.loc['total'] = nc_monthly_apps.sum()
nc_monthly_apps.iloc[:,[0]]

# Assignment activity 4

In [None]:
# Import the libraries.
import pandas as pd
import datetime
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Load the data set.
ad = pd.read_csv('actual_duration.csv')
ar = pd.read_csv('appointments_regional.csv')
nc = pd.read_excel('national_categories.xlsx')

In [None]:
# View and validate the 'ad' dataframe.
print(ad.shape)
print(ad.dtypes)
ad.head()

In [None]:
# Set figure size.
sns.set(rc={'figure.figsize':(15, 12)})

# Set the plot style as white.
sns.set_style('white')

In [None]:
# Validate the nc dataframe.
nc

In [None]:
# Change column data type for visualisation.
nc[["appointment_month"]] = nc[["appointment_month"]].astype(str) 

In [None]:
print(nc.dtypes)

In [None]:
# Group by service setting.
nc_ss = nc.groupby(['appointment_month','service_setting'])[['count_of_appointments']].agg('sum')
nc_ss.reset_index(inplace=True)
nc_ss.head()      

In [None]:
# Create a lineplot to show change of service settings over time.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='service_setting',
             data = nc_ss,
             ci=None)
plt.savefig('monthly_appointments.png')

In [None]:
# Group by context_type.
nc_ct = nc.groupby(['appointment_month','context_type'])[['count_of_appointments']].agg('sum')
nc_ct.reset_index(inplace=True)
nc_ct[0:5]  

In [None]:
# Create a lineplot to show change of context type over time.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='context_type', 
             data = nc_ct, ci=None)

plt.savefig('context_type.png')

In [None]:
# Group by national categories.
nc_nc = nc.groupby(['appointment_month','national_category'])[['count_of_appointments']].agg('sum')
nc_nc.reset_index(inplace=True)
nc_nc.head()

In [None]:
# Create a lineplot to show change of national categories over time.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='national_category', 
             data = nc_nc,
             palette='Dark2',
             ci=None)
plt.legend(bbox_to_anchor=(1,1), loc="upper left", fontsize=18)
plt.savefig('monthly_ncategories.png')

In [None]:
# Explore daily figures for service setting.
nc_ss_day = nc.groupby(['appointment_month','appointment_date','service_setting'])[['count_of_appointments']].agg('sum')
nc_ss_day.reset_index(inplace=True)
nc_ss_day

In [None]:
print(nc_ss_day.dtypes)
print(nc_ss_day.shape)

In [None]:
# Show daily figures on visual.
sns.lineplot(x='appointment_date',
             y='count_of_appointments',
             hue='service_setting', 
             data=nc_ss_day,
             ci=None)

### Objective 2

**Create four visuals for the different seasons**

In [None]:
# Show August 2021.
sns.lineplot(x='appointment_date',
             y='count_of_appointments',
             hue = 'service_setting',
             data = nc_ss_day[nc_ss_day['appointment_month']=='2021-08-01'],
             ci=None)

plt.savefig('service_setting_summer.png')

In [None]:
# Show October 2021.
sns.lineplot(x='appointment_date',
             y='count_of_appointments',
             hue = 'service_setting',
             data = nc_ss_day[nc_ss_day['appointment_month']=='2021-10-01'],
             ci=None)

plt.savefig('service_setting_autumn.png')

In [None]:
# Show January 2022.
sns.lineplot(x='appointment_date',
             y='count_of_appointments',
             hue = 'service_setting',
             data = nc_ss_day[nc_ss_day['appointment_month']=='2022-01-01'],
             ci=None)

plt.savefig('service_setting_winter.png')

In [None]:
# Show April 2022.
sns.lineplot(x='appointment_date',
             y='count_of_appointments',
             hue = 'service_setting',
             data = nc_ss_day[nc_ss_day['appointment_month']=='2022-04-01'],
             ci=None)

plt.savefig('service_setting_spring.png')

In [None]:
# Test another format of visuals.
sns.barplot(x='appointment_date',
             y='count_of_appointments',
             hue = 'service_setting',
             data = nc_ss_day[nc_ss_day['appointment_month']=='2022-04-01'],
             ci=None)

# Assignment activity 5

In [None]:
# Libraries and settings needed for analysis
# Import the libraries.
import pandas as pd
import datetime
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Set figure size.
sns.set(rc={'figure.figsize':(15, 12)})

# Set the plot style as white.
sns.set_style('white')

# Maximum column width to display
pd.options.display.max_colwidth = 200

In [None]:
# Load the csv file.
tweets = pd.read_csv('tweets.csv')
# Call head() so the first rows are shown rather than the bound method object.
tweets.head()

In [None]:
# Explore the file.
tweets.info()

In [None]:
# Analyse the file.
tweets.describe()

In [None]:
# Find number of retweets.
tweets['tweet_retweet_count'].value_counts()

In [None]:
# Find number of favorites.
tweets['tweet_favorite_count'].value_counts()

In [None]:
# Load the text only.
tweets_text = tweets[['tweet_full_text']]
# Call head() so the first rows are shown rather than the bound method object.
tweets_text.head()

In [None]:
# Collect the hashtags from the text.
tags = []
# Skip missing tweet texts; they have nothing to split.
for text in tweets['tweet_full_text'].dropna():
    # Split on any whitespace so hashtags separated by newlines or tabs are
    # not glued together into a single token.
    for token in str(text).split():
        if '#' in token:
            # Change to lowercase.
            tags.append(token.lower())

print(tags[:30])

In [None]:
# Create a series object.
tags = pd.Series(tags)
tags

In [None]:
# View the first 30 hashtags.
print(tags[:30])

In [None]:
# Count how many times each hashtag appears.
tags.value_counts()

In [None]:
# Create a new dataframe.
data = pd.DataFrame(tags.value_counts().reset_index().values, columns=['word','count'])

In [None]:
data.head(30)

In [None]:
data.info()

In [None]:
# Change column data type.
data['count'] = data['count'].astype(int)

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
# Create a barplot from the hashtags.
ax = sns.barplot(x='word', y='count', data=data[data['count']>10])

In [None]:
# Improve the barplot.
ax = sns.barplot(x='count', y='word', data=data[data['count']>10])

In [None]:
# View overrepresented hashtags.
data.head()

In [None]:
# Remove two outliers.
data_filtered = data.iloc[2:]
data_filtered.head()

In [None]:
# Filter data to show the top trending hashtags with more than 11 tweets.
ax = sns.barplot(x='count',
                 y='word',
                 data=data_filtered[data_filtered['count']>11],
                 palette='light:b_r')
ax.set_title('Top trending on Twitter')
ax.set(xlabel='Hashtag counts')
ax.set(ylabel='')
sns.set_style('white')
plt.savefig('twitter_trend.png')

# Assignment activity 6

In [None]:
# Import the Pandas and DateTime libraries.
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data set.
ad = pd.read_csv('actual_duration.csv')
ar = pd.read_csv('appointments_regional.csv')

In [None]:
print(ar.shape)
print(ar.dtypes)

In [None]:
# Make a note of the number of records for this period. Compare to nc
ar = ar[ar['appointment_month']>='2021-08']
print(ar.shape)

In [None]:
# View the first five rows of the dataframe.
ar[0:5]

In [None]:
# Find the earliest and latest records for the subset of ar.
print('The ar DataFrame covers appointments from', 
      (ar['appointment_month'].min()), 
      'to', (ar['appointment_month'].max()))

In [None]:
# View the dataframe.
ar

In [None]:
# Create new aggregate dataframe.
ar_agg = ar.groupby(['appointment_month',
            'hcp_type',
            'appointment_status',
            'appointment_mode',
            'time_between_book_and_appointment'])[['count_of_appointments']].agg('sum')
ar_agg.reset_index(inplace=True)
ar_agg

In [None]:
# Create dataframe to view appointments monthly.
ar_df= ar.groupby(['appointment_month'])[['count_of_appointments']].agg('sum')
ar_df.reset_index(inplace=True)
ar_df

In [None]:
# Calculate utilisation.
# NOTE(review): 30 looks like an assumed number of days per month and 12000
# a capacity divisor chosen to yield a percentage - confirm both figures.
daily_average = ar_df['count_of_appointments'] / 30
ar_df['utilisation'] = daily_average
ar_df['utilisation_percent'] = daily_average / 12000
# Round the numeric columns to one decimal place for display.
ar_df = ar_df.round(1)
ar_df

In [None]:
ar_df.describe()

In [None]:
ar_df[["appointment_month"]] = ar_df[["appointment_month"]].astype(str)
ar_agg[["appointment_month"]] = ar_agg[["appointment_month"]].astype(str)

In [None]:
ar_agg

In [None]:
# Create visual with monthly appointments.
# ar_agg holds many rows per month, so add them up; the default estimator
# would plot the mean of those rows instead of the monthly total.
sns.lineplot(x='appointment_month', y='count_of_appointments', data=ar_agg,
             estimator='sum', ci=None)

plt.savefig('analyse_appointments.png')

In [None]:
# Create visual of utilisation.
sns.lineplot(x='appointment_month', y='utilisation', data=ar_df)

plt.savefig('analyse_utilisation.png')

In [None]:
# Create visual of utilisation as a percentage.
# Seaborn styles only apply to axes created after they are set, so scope the
# 'whitegrid' style around the call that creates the axes. This also keeps the
# grid style from leaking into later plots.
with sns.axes_style('whitegrid'):
    ax = sns.lineplot(x='appointment_month', y='utilisation_percent', data=ar_df, color='lightgrey')
# Reference lines at 0 and at 100 percent.
ax.axhline(y=0, linewidth=1, color='lightgrey')
ax.axhline(y=100, linewidth=3, color='r')
# Shade the area under the utilisation curve.
plt.fill_between(ar_df.appointment_month.values, ar_df.utilisation_percent.values, color='lightgrey')
ax.set_title('Utilisation')
ax.set(xlabel='')
ax.set(ylabel='percent')
ax.set_xticks([0, 5, 10])
ax.set_xticks([1, 2, 3, 4, 6, 7, 8, 9], minor=True)

plt.savefig('analyse_util_percent.png')

In [None]:
# Create a visual on hcp type.
# Set the style first: it only affects axes created afterwards.
sns.set_style('white')
# ar_agg holds several rows per month and hcp type, so add them up; the
# default estimator would plot their mean instead of the monthly total.
sns.lineplot(x='appointment_month', y='count_of_appointments', hue='hcp_type',
             data=ar_agg, estimator='sum', ci=None)
plt.savefig('analyse_hcp.png')

In [None]:
# Create a visual on appointment status.
# ar_agg holds several rows per month and status, so add them up; the default
# estimator would plot their mean instead of the monthly total.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='appointment_status',
             data=ar_agg,
             estimator='sum',
             ci=None)

plt.savefig('analyse_appointment_status.png')

In [None]:
# Create a visual on appointment mode.
# ar_agg holds several rows per month and mode, so add them up; the default
# estimator would plot their mean instead of the monthly total.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='appointment_mode',
             data=ar_agg,
             estimator='sum',
             ci=None)

plt.savefig('analyse_appointment_mode.png')

In [None]:
# Analyse time between booking and appointments.
# ar_agg holds several rows per month and booking interval, so add them up;
# the default estimator would plot their mean instead of the monthly total.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='time_between_book_and_appointment',
             data=ar_agg,
             estimator='sum',
             ci=None)

plt.savefig('analyse_booking_appointments.png')

In [None]:
# Explore monthly figures for service setting.
nc_with_gp = nc.groupby(['appointment_month','service_setting'])[['count_of_appointments']].agg('sum')
nc_with_gp.reset_index(inplace=True)
nc_with_gp.head()

In [None]:
# Create a visual from this monthly service setting data.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='service_setting',
             data = nc_with_gp, ci=None)

plt.savefig('monthly_gp_appointments.png')

In [None]:
# View monthly service setting data without the GP appointments.
sns.lineplot(x='appointment_month',
             y='count_of_appointments',
             hue='service_setting',
             data = nc_with_gp[nc_with_gp['service_setting'].isin(['Other','Unmapped',
                                                                   'Primary Care Network',
                                                                   'Extended Access Provision'])],
             ci=None)

plt.savefig('monthly_nogp_appointments.png')