In [None]:
# Install the pinned plotting stack with the same interpreter that runs this
# kernel (sys.executable), so the packages land in the kernel's environment.
import sys
!{sys.executable} -m pip install "hvplot==0.5.2" "holoviews==1.12.7" "bokeh==1.4.0" "panel==0.7.0"

In [None]:
import numpy as np
import pandas as pd
import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

Reading in the data from all the datafiles and printing it to view the formatting of each file.

In [None]:
# Daily customer counts: the first CSV column becomes the index and is then
# converted to datetimes.
data_DailyCustomers = pd.read_csv('DailyCustomers.csv', index_col=0)
data_DailyCustomers.index = pd.to_datetime(data_DailyCustomers.index)

# Per-store attribute tables, each indexed by its first CSV column.
data_Marketing = pd.read_csv('StoreMarketing.csv', index_col=0)
data_Overheads = pd.read_csv('StoreOverheads.csv', index_col=0)
data_Size = pd.read_csv('StoreSize.csv', index_col=0)
data_Staff = pd.read_csv('StoreStaff.csv', index_col=0)

# Print the first rows of every table to check how each file is laid out.
for frame in (data_DailyCustomers, data_Marketing, data_Overheads, data_Size, data_Staff):
  print(frame.head())

Sorting the stores by total annual customers to determine large, medium, and small stores, which are later used in various segmented plots. Large, medium, and small refer to the annual number of customers for each store; i.e., a store with many annual customers is a 'large' store and a store with few annual customers is a 'small' store.

In [None]:
# Bucket every store (one column of data_DailyCustomers) by its total customer
# count: above 300000 is large, above 100000 is medium, anything else is small.
Large_Stores = []
Medium_Stores = []
Small_Stores = []
categories = ['Large Stores', 'Medium Stores', 'Small Stores']

for store in data_DailyCustomers.columns:
  annual_total = data_DailyCustomers[store].sum()
  if annual_total > 300000:
    bucket = Large_Stores
  elif annual_total > 100000:
    bucket = Medium_Stores
  else:
    bucket = Small_Stores
  bucket.append(store)

# categories_selected[k] holds the store names belonging to categories[k];
# the lists are copies, independent of the three named lists above.
categories_selected = [list(Large_Stores), list(Medium_Stores), list(Small_Stores)]

for label, members in zip(categories, categories_selected):
  print(label + ': ' + str(members))

Bar charts for all stores. This is a simple plot that will visualise the slight differences between the stores in each category based on each store's total annual customers.

In [None]:
# Reorder the columns of data_DailyCustomers by descending total customers.
column_order = data_DailyCustomers.sum().sort_values(ascending=False).index
data_DailyCustomers = data_DailyCustomers.reindex(column_order, axis=1)

# One bar chart per category, one bar per store.
# NOTE(review): selecting with [selected] returns the columns in the order of
# the `selected` list, so the bars follow the category lists rather than the
# sorted column order above - confirm whether sorted bars were intended.
for title, selected in zip(categories, categories_selected):
  totals = data_DailyCustomers[selected].sum()
  x_pos = np.arange(len(totals))
  plt.figure(figsize=(8, 8))
  plt.bar(x_pos, totals, align='center')
  plt.xticks(x_pos, totals.index, rotation=45)
  plt.xlabel('Stores', fontsize=18)
  plt.ylabel('Annual Customers', fontsize=18)
  plt.title(title, fontsize=20)
  plt.show()

Line plots with 14-day rolling average and trendlines for all stores. These plots will highlight the average customer frequency over 2-week periods for each store. The trendlines are a decent indicator of how successful each store will continue to be.

In [None]:
pd.plotting.register_matplotlib_converters()

# Window length (in rows, i.e. days) of the rolling mean; also used in the title
# so the figure label cannot drift from the value actually applied.
period = 14
rolling_average = data_DailyCustomers.rolling(window=period).mean()

# Positional x values for the linear fits; identical for every store.
x = np.arange(len(data_DailyCustomers.index))

for i, selected in enumerate(categories_selected):
  plt.figure(figsize=(14, 14))
  # Layer 1: raw daily values as thin lines.
  plt.plot(data_DailyCustomers[selected], linewidth=0.4)
  # Restart the colour cycle so each store keeps the same colour in every layer.
  plt.gca().set_prop_cycle(None)
  # Layer 2: degree-1 least-squares trendline per store.
  for name in selected:
    z = np.polyfit(x, data_DailyCustomers[name], 1)
    trend = np.poly1d(z)
    plt.plot(data_DailyCustomers.index, trend(x), linestyle='--')
  plt.gca().set_prop_cycle(None)
  # Layer 3: rolling mean as thicker lines.
  plt.plot(rolling_average[selected], linewidth=1.5)
  plt.xlabel('Date', fontsize=18)
  plt.ylabel('Number of Customers', fontsize=18)
  plt.title(categories[i] + ' w/ ' + str(period) + '-day Rolling Averages & Trendlines', fontsize=20)
  # The legend entries are matched to the first len(selected) lines (layer 1).
  plt.legend(selected, loc='center left', bbox_to_anchor=(1, 0.5))
  plt.show()

Pie chart for all stores. A neat way to visualise the percentage of customers that shop at each store. All of the 'small stores' have been compiled into one slice of the pie to prevent cluttering the chart.

In [None]:
# Total customers per store. The totals are computed on the side instead of
# adding a 'Small_Stores' column to data_DailyCustomers, so this cell can be
# re-run and later cells still see only real store columns. A leftover
# 'Small_Stores' column from an earlier run of the old cell is ignored.
store_totals = data_DailyCustomers.drop(columns='Small_Stores', errors='ignore').sum()

# Stores above 100000 customers get their own slice; the rest are pooled.
own_slice = store_totals > 100000
selected = list(store_totals.index[own_slice])
slice_values = list(store_totals[own_slice])

selected.append('Small_Stores')
slice_values.append(store_totals[~own_slice].sum())

# Only the pooled slice (the last one) is pulled out of the pie.
explodeList = [0] * (len(selected) - 1) + [0.05]

plt.figure(figsize=(14, 14))
plt.pie(slice_values, labels=selected, autopct='%1.1f%%', startangle=90, explode=explodeList)
plt.title('Percentage of Total Customers Handled at each Store', fontsize=20)
plt.show()

Scatter plots for the large and medium stores. These will show if there is any correlation regarding the number of daily customers between 2 stores.

In [None]:
# Every unordered pair of large stores, in the order (0,1), (0,2), ..., (n-2,n-1).
pairs = [(Large_Stores[i], Large_Stores[j])
         for i in range(len(Large_Stores))
         for j in range(i + 1, len(Large_Stores))]

# Smallest square grid that holds all pairs, so the cell keeps working when the
# number of large stores changes (6 pairs -> 3x3).
grid = max(1, int(np.ceil(np.sqrt(len(pairs)))))

fig = plt.figure(figsize=(10, 10))
fig.suptitle('Large Store Correlations', fontsize=14, position=(0.5, 1.0))
for counter, (name_i, name_j) in enumerate(pairs, start=1):
  sub = fig.add_subplot(grid, grid, counter)
  sub.set_title(name_i + ' vs ' + name_j, fontsize=10)
  # One point per row of data_DailyCustomers: x is name_i, y is name_j.
  sub.scatter(data_DailyCustomers[name_i], data_DailyCustomers[name_j], s=1)
plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Every unordered pair of medium stores, in the order (0,1), (0,2), ..., (n-2,n-1).
pairs = [(Medium_Stores[i], Medium_Stores[j])
         for i in range(len(Medium_Stores))
         for j in range(i + 1, len(Medium_Stores))]

# Smallest square grid that holds all pairs, so the cell keeps working when the
# number of medium stores changes (36 pairs -> 6x6).
grid = max(1, int(np.ceil(np.sqrt(len(pairs)))))

fig = plt.figure(figsize=(12, 12))
fig.suptitle('Medium-Customer Store Correlations', fontsize=14, position=(0.5, 1.0))
for counter, (name_i, name_j) in enumerate(pairs, start=1):
  sub = fig.add_subplot(grid, grid, counter)
  sub.set_title(name_i + ' vs ' + name_j, fontsize=10)
  # One point per row of data_DailyCustomers: x is name_i, y is name_j.
  sub.scatter(data_DailyCustomers[name_i], data_DailyCustomers[name_j], s=1)
plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.tight_layout()
plt.show()

Interactive scatter subplots of the large stores. Each subplot compares the daily customer data points of 2 of the large stores against each other to determine if there is a correlation. These plots visualise the same information as the previous large-store scatter plots, with the difference being that they are interactive.

In [None]:
# Store pairs to compare, listed as (x column, y column) in display order.
store_pairs = [
    ('RAH', 'QSN'),
    ('RAH', 'SGA'),
    ('RAH', 'SMM'),
    ('QSN', 'SGA'),
    ('QSN', 'SMM'),
    ('SGA', 'SMM'),
]

# One interactive scatter panel per pair; all panels share size and axis limits.
panels = [
    data_DailyCustomers.hvplot.scatter(
        frame_height=300, frame_width=300,
        x=x_store, y=y_store, title=x_store + ' vs ' + y_store,
        xlim=(600, 1500), ylim=(200, 1400), size=10
    )
    for x_store, y_store in store_pairs
]

# Chain the panels with `+` left to right, exactly as a + b + c + ... would.
plot = panels[0]
for panel in panels[1:]:
    plot = plot + panel
hvplot.show(plot)

Heatmap of the large and medium stores. This plot shows the Pearson correlation coefficient (a value between -1 and 1, not a percentage) between the daily customers of each pair of large and medium stores.

In [None]:
pd.plotting.register_matplotlib_converters()

# Pairwise correlation (DataFrame.corr default) of daily customers between all
# large and medium stores.
large_and_medium = Large_Stores + Medium_Stores
corr = data_DailyCustomers[large_and_medium].corr()

# Diverging palette centred on 0, with the colour scale fixed to [-1, 1].
palette = sns.diverging_palette(220, 20, n=200)

plt.figure(figsize=(10, 10))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=palette,
    square=True,
    annot=True, annot_kws={"size": 8},
)
# Slant the x tick labels and right-align them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

Box plots of large and medium stores. These will show approximate min and max x values to later be used in Histograms.

In [None]:
# One 1-D array of daily customers per large store. A sequence of vectors makes
# boxplot draw exactly one box per store, without depending on how a 2-D
# DataFrame would be interpreted (rows vs columns).
large_store_samples = [data_DailyCustomers[name].values for name in Large_Stores]

plt.figure(figsize=(8, 8))
plt.boxplot(large_store_samples, labels=Large_Stores)
plt.xlabel('Stores', fontsize=18)
plt.ylabel('Daily Customers', fontsize=18)
plt.title('Large Stores Box Plot', fontsize=20)
plt.show()

Large stores:
*   x min - 275
*   x max - 1600

In [None]:
# One 1-D array of daily customers per medium store. A sequence of vectors makes
# boxplot draw exactly one box per store, without depending on how a 2-D
# DataFrame would be interpreted (rows vs columns).
medium_store_samples = [data_DailyCustomers[name].values for name in Medium_Stores]

plt.figure(figsize=(8, 8))
plt.boxplot(medium_store_samples, labels=Medium_Stores)
plt.xlabel('Stores', fontsize=18)
plt.ylabel('Daily Customers', fontsize=18)
plt.title('Medium Stores Box Plot', fontsize=20)
plt.show()

Medium stores:
*   x min - 100
*   x max - 700

Histograms for the large and medium stores. These will show how many times each store had specific numbers of customers.

In [None]:
# Histogram range read off the large-store box plot, and the target bin width.
x_min = 275
x_max = 1600
bin_width = 60
n_bins = int((bin_width + x_max - x_min) / bin_width)
print(str(n_bins) + ' bins')
# n_bins bins need n_bins + 1 edges. With only n_bins edges the histogram had
# one bin fewer than reported and silently dropped values above the last edge.
# The existing edges are unchanged; one closing edge is added at
# x_max + bin_width.
bins = [(x_min + x * (bin_width + x_max - x_min) / n_bins) for x in range(n_bins + 1)]

fig = plt.figure(figsize=(12, 12))
fig.suptitle('Large Stores', fontsize=20, position=(0.5, 1.0))
for counter, name in enumerate(Large_Stores, start=1):
  sub = fig.add_subplot(2, 2, counter)
  sub.hist(data_DailyCustomers[name], bins, edgecolor='w')
  sub.set_title(name, fontsize=10)
  # Same axis limits on every subplot so the stores can be compared directly.
  sub.set_xlim(xmin=x_min, xmax=x_max)
  sub.set_ylim(ymin=0, ymax=140)
plt.show()

In [None]:
# Histogram range read off the medium-store box plot, and the target bin width.
x_min = 100
x_max = 700
bin_width = 35
n_bins = int((bin_width + x_max - x_min) / bin_width)
print(str(n_bins) + ' bins')
# n_bins bins need n_bins + 1 edges. With only n_bins edges the histogram had
# one bin fewer than reported and silently dropped values above the last edge.
# The existing edges are unchanged; one closing edge is added at
# x_max + bin_width.
bins = [(x_min + x * (bin_width + x_max - x_min) / n_bins) for x in range(n_bins + 1)]

fig = plt.figure(figsize=(12, 12))
fig.suptitle('Medium Stores', fontsize=20, position=(0.5, 1.0))
for counter, name in enumerate(Medium_Stores, start=1):
  sub = fig.add_subplot(3, 3, counter)
  sub.hist(data_DailyCustomers[name], bins, edgecolor='w')
  sub.set_title(name, fontsize=10)
  # Same axis limits on every subplot so the stores can be compared directly.
  sub.set_xlim(xmin=x_min, xmax=x_max)
  sub.set_ylim(ymin=0, ymax=140)
plt.subplots_adjust(wspace=0.4, hspace=0.4)
plt.show()

Autocorrelation for the large and medium stores. These plots show whether each store's daily customer counts are correlated with its own earlier values at various lags. Additionally, these plots may illustrate seasonality amongst the stores.

In [None]:
# One autocorrelation figure per large store, titled with the store name.
for store in Large_Stores:
    daily_counts = data_DailyCustomers[store]
    pd.plotting.autocorrelation_plot(daily_counts)
    #plt.xlim([0, 60]) # uncomment this line to zoom in
    plt.title(store)
    plt.show()

In [None]:
# One autocorrelation figure per medium store, zoomed in to lags 0-60.
for name in Medium_Stores:
    pd.plotting.autocorrelation_plot(data_DailyCustomers[name])
    # Set the x limits after plotting so the zoom is the last word on the axis;
    # limits set beforehand can be replaced when the plot is drawn.
    plt.xlim([0, 60]) # comment this line out to see every lag
    plt.title(name)
    plt.show()

Radar subplots for the large and medium stores. The data from all datafiles are compiled into a dataframe to be visualised together. The goal of these plots is to shed light on correlations that may exist between the metrics: daily customers, store marketing, store overheads, store size, and store staff.

In [None]:
# Reload the raw tables so the summary is built from unmodified data.
data_DailyCustomers = pd.read_csv('DailyCustomers.csv', index_col=0)
data_Marketing = pd.read_csv('StoreMarketing.csv', index_col=0)
data_Overheads = pd.read_csv('StoreOverheads.csv', index_col=0)
data_Size = pd.read_csv('StoreSize.csv', index_col=0)
data_Staff = pd.read_csv('StoreStaff.csv', index_col=0)

# One row per store, one column per metric.
# NOTE(review): the attribute tables are attached by position (.values), which
# assumes their rows are in the same store order as the DailyCustomers
# columns - confirm against the CSV files.
data_summary = pd.DataFrame(index=data_DailyCustomers.columns)
data_summary['Customers'] = data_DailyCustomers.sum().values
data_summary['Marketing'] = data_Marketing.values
data_summary['Size'] = data_Size.values
data_summary['Staff'] = data_Staff.values
data_summary['Overheads'] = data_Overheads.values

# Scale every metric by its maximum among the large stores.
data_normalised = data_summary / data_summary.loc[Large_Stores].max()

n_attributes = len(data_normalised.columns)
# n_attributes evenly spaced angles plus a repeat of the first (at 2*pi) so the
# outline closes on itself.
angles = [n / float(n_attributes) * 2 * np.pi for n in range(n_attributes + 1)]
plt.figure(figsize=(8, 8))
counter = 1
for name in Large_Stores:
  values = data_normalised.loc[[name]].values.flatten().tolist()
  values += values[:1]  # repeat the first value to close the polygon
  sub = plt.subplot(2, 2, counter, polar=True)
  sub.plot(angles, values)
  sub.fill(angles, values, alpha=0.1)
  sub.set_ylim(ymax=1.05)
  sub.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
  # One tick per metric: the closing angle duplicates the first position and
  # has no label of its own, so it is left out to keep ticks and labels equal
  # in number.
  sub.set_xticks(angles[:-1])
  sub.set_xticklabels(data_normalised.columns, fontsize=12)
  sub.set_title(name, fontsize=16, y=1.1)
  counter += 1
plt.tight_layout()
plt.show()

In [None]:
# Scale every metric by its maximum among the medium stores.
data_normalised = data_summary / data_summary.loc[Medium_Stores].max()

n_attributes = len(data_normalised.columns)
# n_attributes evenly spaced angles plus a repeat of the first (at 2*pi) so the
# outline closes on itself.
angles = [n / float(n_attributes) * 2 * np.pi for n in range(n_attributes + 1)]
plt.figure(figsize=(8, 8))
counter = 1
for name in Medium_Stores:
  values = data_normalised.loc[[name]].values.flatten().tolist()
  values += values[:1]  # repeat the first value to close the polygon
  sub = plt.subplot(3, 3, counter, polar=True)
  sub.plot(angles, values)
  sub.fill(angles, values, alpha=0.1)
  sub.set_ylim(ymax=1.05)
  sub.set_yticks([0.2, 0.4, 0.6, 0.8])
  # One tick per metric: the closing angle duplicates the first position and
  # has no label of its own, so it is left out to keep ticks and labels equal
  # in number.
  sub.set_xticks(angles[:-1])
  sub.set_xticklabels(data_normalised.columns, fontsize=12)
  sub.set_title(name, fontsize=16, y=1.1)
  counter += 1
plt.tight_layout()
plt.show()

Radar subplots for the small stores.

In [None]:
# Scale every metric by its maximum among the small stores.
data_normalised = data_summary / data_summary.loc[Small_Stores].max()

n_attributes = len(data_normalised.columns)
# n_attributes evenly spaced angles plus a repeat of the first (at 2*pi) so the
# outline closes on itself.
angles = [n / float(n_attributes) * 2 * np.pi for n in range(n_attributes + 1)]
plt.figure(figsize=(20, 20))
counter = 1
for name in Small_Stores:
  values = data_normalised.loc[[name]].values.flatten().tolist()
  values += values[:1]  # repeat the first value to close the polygon
  sub = plt.subplot(7, 4, counter, polar=True)
  sub.plot(angles, values)
  sub.fill(angles, values, alpha=0.1)
  sub.set_ylim(ymax=1.05)
  sub.set_yticks([0.2, 0.4, 0.6, 0.8])
  # One tick per metric: the closing angle duplicates the first position and
  # has no label of its own, so it is left out to keep ticks and labels equal
  # in number.
  sub.set_xticks(angles[:-1])
  sub.set_xticklabels(data_normalised.columns, fontsize=12)
  sub.set_title(name, fontsize=16, y=1.1)
  counter += 1
plt.tight_layout()
plt.show()

Pair-plot of the metrics using the dataframe created in the previous segment. These plots will more precisely show how strong the correlations are between the metrics.

In [None]:
# Grid of pairwise plots across every column of data_summary; `s` sets the
# marker size passed to the off-diagonal plots.
marker_options = {'s': 20}
sns.pairplot(data_summary, height=1.75, plot_kws=marker_options)
plt.show()

Heatmap of the metrics using the same dataframe. This plot will show Pearson Coefficients providing numeric values indicating the strength (or weakness) of the correlations between the metrics.

In [None]:
# Pairwise correlation (DataFrame.corr default) between the summary metrics.
corr = data_summary.corr()

# Diverging palette centred on 0, with the colour scale fixed to [-1, 1].
palette = sns.diverging_palette(220, 20, n=200)

plt.figure(figsize=(12, 12))
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=palette,
    square=True,
    annot=True, annot_kws={"size": 8},
)
# Slant the x tick labels and right-align them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

Bubble plot for the large and medium stores. This plot displays the correlation between the size of the stores and the number of customers shopping there. The size of the bubble is determined by the amount each store spends on marketing annually. The labels for each bubble have been written vertically to prevent them from overlapping each other.

In [None]:
large_and_medium = Large_Stores + Medium_Stores

# Work on a filtered copy so data_summary keeps every store and gains no extra
# column; cells that read data_summary can then be re-run in any order.
bubble_data = data_summary.loc[large_and_medium].copy()
# Bubble area is one tenth of the marketing value.
bubble_data['BubbleSize'] = bubble_data['Marketing'] * 0.1

plt.figure(figsize=(12, 12))
plt.scatter(bubble_data['Size'], bubble_data['Customers'], s=bubble_data['BubbleSize'], alpha=0.5)
plt.xticks([500, 1000, 2000, 3000, 4000, 5000, 6000])
plt.yticks([100000, 150000, 200000, 250000, 300000, 350000, 400000])
# Only the lower limits are pinned; the upper limits stay as autoscaled.
plt.xlim(500)
plt.ylim(100000)
plt.title('Store Size vs Annual Customers (vs Annual Marketing)', fontsize=20)
plt.xlabel('Store Size (meters squared)', fontsize=18)
plt.ylabel('Annual Customers', fontsize=18)
for i, name in enumerate(bubble_data.index):
  # .iloc makes the positional lookup explicit; plain [i] on a Series with a
  # non-integer index is deprecated positional fallback.
  marketing = bubble_data['Marketing'].iloc[i]
  label = name + ' (£' + np.array2string(np.around(marketing, decimals=2)) + ')'
  plt.annotate(label,
               (bubble_data['Size'].iloc[i], bubble_data['Customers'].iloc[i]),
               rotation=270) # remove rotation for horizontal labels
plt.show()

Interactive Bubble plot for all of the stores. This again shows the correlation between store size and number of customers with bubble size determined by marketing. The vertical toolbar to the right of the plot allows interaction with the plot.

In [None]:
# Rebuild the per-store summary for all stores: one row per store, one column
# per metric, attached by position in the order listed below.
data_summary = pd.DataFrame(index=data_DailyCustomers.columns)
data_summary['Customers'] = data_DailyCustomers.sum().values
metric_tables = (
    ('Marketing', data_Marketing),
    ('Size', data_Size),
    ('Staff', data_Staff),
    ('Overheads', data_Overheads),
)
for metric, table in metric_tables:
  data_summary[metric] = table.values

# Bubble size is one tenth of the marketing value.
data_summary['BubbleSize'] = data_summary['Marketing'] * 0.1

scatter_options = dict(
    frame_height=600, frame_width=600,
    title='Store Size vs Annual Customers (vs Annual Marketing (£))',
    xlabel='Store Size (meters squared)', ylabel='Annual Customers',
    alpha=0.5, padding=0.1, hover_cols='all',
    x='Size', y='Customers', size='BubbleSize',
)
plot = data_summary.hvplot.scatter(**scatter_options)
hvplot.show(plot)

Interactive Bubble Plot showing the correlation between store size and number of customers but with bubble size determined by the number of staff members at each store.

In [None]:
# Bubble size is now driven by the staff value, scaled by 40. This overwrites
# the BubbleSize column of data_summary in place.
data_summary['BubbleSize'] = data_summary['Staff'] * 40

scatter_options = dict(
    frame_height=600, frame_width=600,
    title='Store Size vs Annual Customers (vs Staff Members)',
    xlabel='Store Size (meters squared)', ylabel='Annual Customers',
    alpha=0.5, padding=0.1, hover_cols='all',
    x='Size', y='Customers', size='BubbleSize',
)
plot = data_summary.hvplot.scatter(**scatter_options)
hvplot.show(plot)

Interactive Bubble Plot showing the correlation between store size and number of customers but with bubble size determined by the annual amount spent on overheads by each store.

In [None]:
# Bubble size is now driven by the overheads value, scaled by 0.01. This
# overwrites the BubbleSize column of data_summary in place.
data_summary['BubbleSize'] = data_summary['Overheads'] * 0.01

scatter_options = dict(
    frame_height=600, frame_width=600,
    title='Store Size vs Annual Customers (vs Annual Overhead (£))',
    xlabel='Store Size (meters squared)', ylabel='Annual Customers',
    alpha=0.5, padding=0.1, hover_cols='all',
    x='Size', y='Customers', size='BubbleSize',
)
plot = data_summary.hvplot.scatter(**scatter_options)
hvplot.show(plot)