In [None]:
# Import the Pandas and DateTime libraries.
import pandas as pd
import datetime

# Load the three source data sets into DataFrames: two CSV files and one
# Excel workbook. The relative paths are resolved against the working directory.
ad = pd.read_csv('actual_duration.csv')
ar = pd.read_csv('appointments_regional.csv')
nc = pd.read_excel('national_categories.xlsx')

# 1. Explore the actual duration data.

In [None]:
# View and validate the 'ad' dataframe: print its dimensions and column
# dtypes, then display the first five rows.
print(ad.shape)
print(ad.dtypes)
ad.head()

In [None]:
# Find out the number of locations based on the 'ad' dataframe.
# value_counts() yields one entry per distinct location name, so the length
# in the printed shape is the number of distinct names.
ad_locations = ad['sub_icb_location_name'].value_counts()

print(ad_locations.shape)

In [None]:
# Print the number of records for each sub-ICB location code
# (one entry per distinct code).
print(ad['sub_icb_location_code'].value_counts())

In [None]:
# Find out the number of ONS locations in 'ad': one entry per distinct
# icb_ons_code, so the printed shape gives the number of distinct codes.
ad_locations_ons = ad['icb_ons_code'].value_counts()
print(ad_locations_ons.shape)

In [None]:
# Calculate the total number of appointments across all 'ad' records.
ad['count_of_appointments'].sum()

In [None]:
# Smallest and largest count_of_appointments value held by a single 'ad' record.
print(ad['count_of_appointments'].min())
print(ad['count_of_appointments'].max())

In [None]:
# Which location had the highest number of appointments?
# Total the appointments per sub-ICB ONS code and order them largest first.
group_sub_location = (
    ad.groupby('sub_icb_location_ons_code')[['count_of_appointments']]
    .sum()
    .sort_values(by='count_of_appointments', ascending=False)
)
group_sub_location

In [None]:
# List the top 10 location names by appointments.
# Total the appointments per location name, order them largest first and
# show the first ten rows.
group_sub_location_name = (
    ad.groupby('sub_icb_location_name')[['count_of_appointments']]
    .sum()
    .sort_values(by='count_of_appointments', ascending=False)
)
group_sub_location_name.head(10)

In [None]:
# Find out which location has the highest number of records.
# value_counts() orders by count, largest first, so the top entry answers this.
print(ad['sub_icb_location_name'].value_counts())

In [None]:
# Find the highest and lowest number of records by date.
# The counts are ordered largest first, so the extremes sit at the two ends.
print(ad['appointment_date'].value_counts())

In [None]:
# Change the datatype for the appointment date column: parse the values with
# pandas and keep only the calendar date (the column then holds
# datetime.date objects).
ad['appointment_date'] = pd.to_datetime(ad['appointment_date']).dt.date

In [None]:
# Find the earliest and latest appointment dates in 'ad'.
print(ad['appointment_date'].min())
print(ad['appointment_date'].max())

In [None]:
# Count the number of records for each distinct actual_duration value.
print(ad['actual_duration'].value_counts())

In [None]:
# Descriptive statistics for the 'ad' dataframe (describe() with its defaults).
ad.describe()

# 2. Explore the appointment regional data.

In [None]:
# View and validate the 'ar' dataframe: print its dimensions and column
# dtypes, then display the first five rows.
print(ar.shape)
print(ar.dtypes)
ar.head()

In [None]:
# Find out the number of ONS locations in 'ar': one entry per distinct
# icb_ons_code, so the printed shape gives the number of distinct codes.
ar_locations_ons = ar['icb_ons_code'].value_counts()

print(ar_locations_ons.shape)

In [None]:
# Calculate the total number of appointments across all 'ar' records.
ar['count_of_appointments'].sum()

In [None]:
# Smallest and largest count_of_appointments value held by a single 'ar' record.
print(ar['count_of_appointments'].min())
print(ar['count_of_appointments'].max())

In [None]:
# Order the regional records from the largest appointment count downwards
# and show the first five.
sorted_ar = ar.sort_values('count_of_appointments', ascending=False)
sorted_ar.head()

In [None]:
# Count the number of records for each appointment month.
# sort=False keeps the result from being ordered by count.
ar_month = (ar['appointment_month'].value_counts(sort=False))
ar_month

In [None]:
# Count the number of records for each appointment status, largest first.
ar_status = (ar['appointment_status'].value_counts())
ar_status

In [None]:
# ar_status has one entry per distinct status, so counting its entries gives
# the number of appointment statuses.
print('Number of appointment statuses:', (ar_status.count()))

In [None]:
# Count the records for each appointment status within each appointment
# month, then show the first 20 rows of the result.
ar_monthly = ar.groupby('appointment_month')[['appointment_status']].agg('value_counts')
ar_monthly[0:20]

In [None]:
# Total appointments per appointment month, followed by a 'total' row that
# holds the sum over all months.
ar_monthly_apps = ar.groupby('appointment_month')[['count_of_appointments']].sum()
ar_monthly_apps.loc['total'] = ar_monthly_apps.sum()
ar_monthly_apps

In [None]:
# Count the number of records for each hcp_type value, largest first.
ar_hcp = (ar['hcp_type'].value_counts())
ar_hcp

In [None]:
# Count the number of records for each appointment mode, largest first.
ar_mode = (ar['appointment_mode'].value_counts())
ar_mode

In [None]:
# Count the number of records for each time_between_book_and_appointment
# value, largest first.
ar_book = (ar['time_between_book_and_appointment'].value_counts())
ar_book

In [None]:
# Descriptive statistics for the 'ar' dataframe (describe() with its defaults).
ar.describe()

# 3. Explore the national categories data.

In [None]:
# View and validate the 'nc' dataframe: print its dimensions and column
# dtypes, then display the first five rows.
print(nc.shape)
print(nc.dtypes)
nc.head()

In [None]:
# Find out the number of ONS locations in 'nc': one entry per distinct
# icb_ons_code, so the printed shape gives the number of distinct codes.
nc_locations_ons = nc['icb_ons_code'].value_counts()
print(nc_locations_ons.shape)

In [None]:
# Find and print out the number of locations based on sub location name.
# value_counts() has one entry per distinct name, so the printed shape gives
# the number of distinct location names.
nc_locations_sub = nc['sub_icb_location_name'].value_counts()

print("Count of locations: ")
print(nc_locations_sub.shape)

In [None]:
# Calculate the total number of appointments across all 'nc' records.
nc['count_of_appointments'].sum()

In [None]:
# Find out which location has the highest number of appointments.
# Total the appointments per location name, order them largest first and
# show the first 106 rows.
nc_group_sub_location_name = (
    nc.groupby('sub_icb_location_name')[['count_of_appointments']]
    .sum()
    .sort_values(by='count_of_appointments', ascending=False)
)
nc_group_sub_location_name.head(106)

In [None]:
# Find out which five locations have the highest number of records.
# value_counts() orders by count, largest first, so the first five entries
# are the five locations with the most records.
print(nc['sub_icb_location_name'].value_counts().head(5))

In [None]:
# Find the highest and lowest number of records by date.
# The counts are ordered largest first, so the extremes sit at the two ends.
print(nc['appointment_date'].value_counts())

In [None]:
# Change the datatype for the appointment date column: parse the values with
# pandas and keep only the calendar date (the column then holds
# datetime.date objects).
nc['appointment_date'] = pd.to_datetime(nc['appointment_date']).dt.date

In [None]:
# Find the earliest and latest appointment dates in 'nc'.
print(nc['appointment_date'].min())
print(nc['appointment_date'].max())

In [None]:
# Count the number of records for each appointment month.
# sort=False keeps the result from being ordered by count.
nc_month = (nc['appointment_month'].value_counts(sort=False))
nc_month

In [None]:
# Total appointments per appointment month, followed by a 'total' row that
# holds the sum over all months.
nc_monthly = nc.groupby('appointment_month')[['count_of_appointments']].sum()
nc_monthly.loc['total'] = nc_monthly.sum()
nc_monthly

In [None]:
# Count the number of records for each service setting.
# sort=False keeps the result from being ordered by count.
nc_service = (nc['service_setting'].value_counts(sort=False))
nc_service

In [None]:
# nc_service has one entry per distinct service setting, so counting its
# entries gives the number of service settings.
print('Number of service settings:', (nc_service.count()))

In [None]:
# Count the number of records for each context type.
# sort=False keeps the result from being ordered by count.
nc_context = (nc['context_type'].value_counts(sort=False))
nc_context

In [None]:
# Count types: nc_context has one entry per distinct context type, so
# counting its entries gives the number of context types.
print('Number of context types:', (nc_context.count()))

In [None]:
# Count the records for each national category (unsorted), then print how
# many distinct national categories there are.
nc_category = (nc['national_category'].value_counts(sort=False))
print('Number of national categories:', (nc_category.count()))

In [None]:
# Count the number of records for each national category, ordered by count
# (largest first). This rebinds nc_category.
nc_category = (nc['national_category'].value_counts(sort=True))
nc_category

In [None]:
# Total appointments per national category. The frame is displayed largest
# first; the stored nc_category_apps itself is left in group order.
nc_category_apps = nc.groupby('national_category')[['count_of_appointments']].sum()
nc_category_apps.sort_values(by='count_of_appointments', ascending=False)

In [None]:
# Descriptive statistics for the 'nc' dataframe (describe() with its defaults).
nc.describe()

In [None]:
# Check for missing data: for each frame, keep the rows that have a missing
# value in at least one column.
ad_na = ad.loc[ad.isna().any(axis='columns')]
ar_na = ar.loc[ar.isna().any(axis='columns')]
nc_na = nc.loc[nc.isna().any(axis='columns')]

# A row count of zero in a printed shape means that frame has no incomplete rows.
for incomplete_rows in (ad_na, ar_na, nc_na):
    print(incomplete_rows.shape)

# Activity 3

In [None]:
# Find the earliest and latest records for ad, already done in activity 2.
# Here the two dates are reported in a single sentence.
print('The ad DataFrame covers appointments from', 
      (ad['appointment_date'].min()), 
      'to', (ad['appointment_date'].max()))

In [None]:
# Find the earliest and latest records for nc, already done in activity 2.
# Here the two dates are reported in a single sentence.
print('The nc DataFrame covers appointments from', 
      (nc['appointment_date'].min()), 
      'to', (nc['appointment_date'].max()))

In [None]:
# Keep only the 'nc' rows whose location name is exactly
# 'NHS North West London ICB - W2U3Z'.
nc_subset = nc[nc['sub_icb_location_name'] == 'NHS North West London ICB - W2U3Z']
nc_subset

In [None]:
# Keep the subset rows dated from 1 January 2022 to 1 June 2022; between()
# includes both end dates by default.
in_window = nc_subset['appointment_date'].between(
    datetime.date(2022, 1, 1), datetime.date(2022, 6, 1)
)
nc_sub_filtered = nc_subset[in_window]
nc_sub_filtered

In [None]:
# Non-null value counts per column for each service setting, with a 'total'
# row appended. The display is ordered by the service_setting index labels.
nc_final = nc_sub_filtered.groupby('service_setting').count()
nc_final.loc['total'] = nc_final.sum()
nc_final.sort_values(by='service_setting')

In [None]:
# Non-null value counts per column for each context type, with a 'total'
# row appended. The display is ordered by the context_type index labels.
nc_final2 = nc_sub_filtered.groupby('context_type').count()
nc_final2.loc['total'] = nc_final2.sum()
nc_final2.sort_values(by='context_type')

In [None]:
# Non-null value counts per column for each national category, with a 'total'
# row appended. The display is ordered by the national_category index labels.
nc_final3 = nc_sub_filtered.groupby('national_category').count()
nc_final3.loc['total'] = nc_final3.sum()
nc_final3.sort_values(by='national_category')

In [None]:
# Number of appointments per day == sum of count_of_appointments by appointment_date.
# Uses groupby(); the result is left in group-key (date) order, no sort_values() call.

nc_daily_apps = nc.groupby('appointment_date')[['count_of_appointments']].agg('sum')
nc_daily_apps


In [None]:
# Check the current column dtypes of 'nc'.
print(nc.dtypes)

In [None]:
# Change the datatype for the appointment date column to pandas datetime,
# then print the dtypes to confirm the change.
nc['appointment_date'] = pd.to_datetime(nc['appointment_date'])

print(nc.dtypes)

In [None]:
# Calculate the number of appointments per month (grouped by calendar year
# and month of appointment_date).
# Only count_of_appointments is summed: summing every column would also try
# to add up the text columns and the datetime appointment_date column.
nc_monthly_apps = (
    nc.groupby([nc['appointment_date'].dt.year, nc['appointment_date'].dt.month])
    [['count_of_appointments']]
    .sum()
)
# Append a 'total' row holding the sum over all months.
nc_monthly_apps.loc['total'] = nc_monthly_apps.sum()
nc_monthly_apps

In [None]:
# Calculate the number of records per month (grouped by calendar year and
# month of appointment_date).
# NOTE: this rebinds nc_monthly_apps, replacing whatever it held before.
# count gives the non-null count for every column; a 'total' row is appended
# and only the first column is displayed.
nc_monthly_apps = nc.groupby([nc['appointment_date'].dt.year, nc['appointment_date'].dt.month]).agg('count')
nc_monthly_apps.loc['total'] = nc_monthly_apps.sum()
nc_monthly_apps.iloc[:,[0]]