In [79]:
# https://preppindata.blogspot.com/2021/05/2021-week-20-controlling-complaints.html

import pandas as pd
import numpy as np

### Input the data file

In [80]:
# Load the week-20 complaints input. The relative path uses forward slashes so
# it resolves on Windows, macOS and Linux alike (a backslash separator is only
# understood on Windows).
df = pd.read_csv('data/PD 2021 Wk 20 Input.csv')
df

Unnamed: 0,Date,Week,Complaints,Department
0,19/04/2021,16,42,Ticketing
1,20/04/2021,16,32,Ticketing
2,21/04/2021,16,51,Ticketing
3,22/04/2021,16,48,Ticketing
4,23/04/2021,16,34,Ticketing
...,...,...,...,...
100,19/05/2021,20,14,Airport Experience
101,20/05/2021,20,19,Airport Experience
102,21/05/2021,20,23,Airport Experience
103,22/05/2021,20,20,Airport Experience


### Function for transformation

In [81]:
def df_std_n(n, data=None):
    """Return the complaint records that fall outside n-standard-deviation control limits.

    For each ``Week`` the mean and standard deviation of ``Complaints`` are
    computed, control limits are placed ``n`` standard deviations either side
    of the mean, and every record whose complaint count is strictly above the
    upper limit or strictly below the lower limit is returned.

    The weekly statistics are computed inside the function, so it does not
    depend on (or modify) any helper frame created in another cell.

    Parameters
    ----------
    n : int or float
        Number of standard deviations used for the control limits. It is also
        embedded in the 'Variation (nSD)' and 'Outlier? (nSD)' column names.
    data : pandas.DataFrame, optional
        Records with at least the columns 'Date', 'Week', 'Complaints' and
        'Department'. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The outlying records only, keeping the row labels of the merged frame,
        with the columns in the required output order.
    """
    source = df if data is None else data

    # Weekly mean / standard deviation of the complaint counts.
    weekly = (
        source.groupby('Week')['Complaints']
        .agg(['mean', 'std'])
        .reset_index()
        .rename(columns={'mean': 'Mean', 'std': 'Standard Deviation'})
    )
    weekly['The Upper Control Limit'] = weekly['Mean'] + n * weekly['Standard Deviation']
    weekly['The Lower Control Limit'] = weekly['Mean'] - n * weekly['Standard Deviation']
    weekly['Variation'] = weekly['The Upper Control Limit'] - weekly['The Lower Control Limit']

    # Attach the weekly limits to every record and flag the ones outside them.
    merged = pd.merge(source, weekly, on='Week')
    above = merged['Complaints'] > merged['The Upper Control Limit']
    below = merged['Complaints'] < merged['The Lower Control Limit']
    merged['Outlier?'] = np.where(above | below, 'Outside', 'Inside')

    column_order = [
        'Variation', 'Outlier?',
        'The Lower Control Limit', 'The Upper Control Limit',
        'Standard Deviation', 'Mean',
        'Date', 'Week', 'Complaints', 'Department',
    ]
    variation_label = 'Variation (' + str(n) + 'SD)'
    outlier_label = 'Outlier? (' + str(n) + 'SD)'

    # rename() returns a new frame, so the filtered slice is never modified in
    # place (which would raise SettingWithCopyWarning).
    return (
        merged.loc[merged['Outlier?'] == 'Outside', column_order]
        .rename(columns={'Variation': variation_label, 'Outlier?': outlier_label})
    )

### Produce a separate output worksheet (or csv) for 1, 2 or 3 standard deviations and remove the irrelevant fields for that output.

In [82]:
# Quick check: preview the records flagged as outliers when the control limits
# are set 2 standard deviations either side of the weekly mean.
df_std_n(2)

Unnamed: 0,Variation (2SD),Outlier? (2SD),The Lower Control Limit,The Upper Control Limit,Standard Deviation,Mean,Date,Week,Complaints,Department
5,51.832698,Outside,3.797937,55.630635,12.958174,29.714286,24/04/2021,16,57,Ticketing
31,66.484513,Outside,4.567267,71.05178,16.621128,37.809524,29/04/2021,17,84,Onboard Experience
43,37.785359,Outside,34.440654,72.226013,9.44634,53.333333,04/05/2021,18,76,Ticketing
74,150.644583,Outside,-2.846101,147.798482,37.661146,72.47619,14/05/2021,19,230,Onboard Experience
89,48.939224,Outside,4.43515,53.374374,12.234806,28.904762,22/05/2021,20,68,Ticketing


In [83]:
# Write one CSV of outliers per control-limit width: 1, 2 and 3 standard deviations.
for sd_count in (1, 2, 3):
    df_std_n(sd_count).to_csv('output/2021-week20-output{}.csv'.format(sd_count))

------
# Step-by-step breakdown of the `df_std_n` function (worked through for 1 standard deviation)


### Create the mean and standard deviation for each Week

In [84]:
# Per-week summary of the complaint counts: one row per Week holding the mean
# and standard deviation, with the columns given their output names.
df_cal = (
    df.groupby('Week')['Complaints']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns={'mean': 'Mean', 'std': 'Standard Deviation'})
)

### Create the following calculations for each of 1, 2 and 3 standard deviations:
- The Upper Control Limit (mean+(n*standard deviation))
- The Lower Control Limit (mean-(n*standard deviation))
- Variation (Upper Control Limit - Lower Control Limit)

In [85]:
# Control limits one standard deviation either side of the weekly mean, plus
# the width of the band between them. The callables are evaluated in order, so
# 'Variation' can read the two limits defined just before it.
df_cal = df_cal.assign(**{
    'The Upper Control Limit': lambda d: d['Mean'] + d['Standard Deviation'],
    'The Lower Control Limit': lambda d: d['Mean'] - d['Standard Deviation'],
    'Variation': lambda d: d['The Upper Control Limit'] - d['The Lower Control Limit'],
})

df_cal

Unnamed: 0,Week,Mean,Standard Deviation,The Upper Control Limit,The Lower Control Limit,Variation
0,16,29.714286,12.958174,42.67246,16.756111,25.916349
1,17,37.809524,16.621128,54.430652,21.188396,33.242257
2,18,53.333333,9.44634,62.779673,43.886994,18.892679
3,19,72.47619,37.661146,110.137336,34.815045,75.322292
4,20,28.904762,12.234806,41.139568,16.669956,24.469612


### Join the original data set back on to these results 

In [86]:
# Attach each week's statistics and control limits to every daily record.
df_output = df.merge(df_cal, on='Week')
df_output

Unnamed: 0,Date,Week,Complaints,Department,Mean,Standard Deviation,The Upper Control Limit,The Lower Control Limit,Variation
0,19/04/2021,16,42,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349
1,20/04/2021,16,32,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349
2,21/04/2021,16,51,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349
3,22/04/2021,16,48,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349
4,23/04/2021,16,34,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349
...,...,...,...,...,...,...,...,...,...
100,19/05/2021,20,14,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612
101,20/05/2021,20,19,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612
102,21/05/2021,20,23,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612
103,22/05/2021,20,20,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612


### Assess whether each of the complaint values for each Department, Week and Date is within or outside of the control limits

In [87]:
# Label a record 'Outside' when its complaint count is strictly above the upper
# limit or strictly below the lower limit; every other record is 'Inside'.
df_output['Outlier?'] = np.where(
    df_output['Complaints'].gt(df_output['The Upper Control Limit'])
    | df_output['Complaints'].lt(df_output['The Lower Control Limit']),
    'Outside',
    'Inside',
)
df_output

Unnamed: 0,Date,Week,Complaints,Department,Mean,Standard Deviation,The Upper Control Limit,The Lower Control Limit,Variation,Outlier?
0,19/04/2021,16,42,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349,Inside
1,20/04/2021,16,32,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349,Inside
2,21/04/2021,16,51,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349,Outside
3,22/04/2021,16,48,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349,Outside
4,23/04/2021,16,34,Ticketing,29.714286,12.958174,42.672460,16.756111,25.916349,Inside
...,...,...,...,...,...,...,...,...,...,...
100,19/05/2021,20,14,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612,Outside
101,20/05/2021,20,19,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612,Inside
102,21/05/2021,20,23,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612,Inside
103,22/05/2021,20,20,Airport Experience,28.904762,12.234806,41.139568,16.669956,24.469612,Inside


### Output only Outliers

In [88]:
# Keep only the records flagged as lying outside the control limits.
df_output = df_output.loc[df_output['Outlier?'].eq('Outside')]
df_output

Unnamed: 0,Date,Week,Complaints,Department,Mean,Standard Deviation,The Upper Control Limit,The Lower Control Limit,Variation,Outlier?
2,21/04/2021,16,51,Ticketing,29.714286,12.958174,42.67246,16.756111,25.916349,Outside
3,22/04/2021,16,48,Ticketing,29.714286,12.958174,42.67246,16.756111,25.916349,Outside
5,24/04/2021,16,57,Ticketing,29.714286,12.958174,42.67246,16.756111,25.916349,Outside
15,20/04/2021,16,12,Airport Experience,29.714286,12.958174,42.67246,16.756111,25.916349,Outside
16,21/04/2021,16,10,Airport Experience,29.714286,12.958174,42.67246,16.756111,25.916349,Outside
18,23/04/2021,16,14,Airport Experience,29.714286,12.958174,42.67246,16.756111,25.916349,Outside
21,26/04/2021,17,14,Ticketing,37.809524,16.621128,54.430652,21.188396,33.242257,Outside
23,28/04/2021,17,57,Ticketing,37.809524,16.621128,54.430652,21.188396,33.242257,Outside
25,30/04/2021,17,56,Ticketing,37.809524,16.621128,54.430652,21.188396,33.242257,Outside
27,02/05/2021,17,59,Ticketing,37.809524,16.621128,54.430652,21.188396,33.242257,Outside


In [89]:
# Put the columns in the required output order, then build the headers that
# tag the variation/outlier columns with the number of standard deviations.
df_output = df_output[[
    'Variation', 'Outlier?',
    'The Lower Control Limit', 'The Upper Control Limit',
    'Standard Deviation', 'Mean',
    'Date', 'Week', 'Complaints', 'Department',
]]
Variation_label = 'Variation ({}SD)'.format(1)
Outlier_label = 'Outlier? ({}SD)'.format(1)

# The renamed frame is not assigned back; it is only shown as the cell output.
df_output.rename(columns={'Variation': Variation_label, 'Outlier?': Outlier_label})


Unnamed: 0,Variation (1SD),Outlier? (1SD),The Lower Control Limit,The Upper Control Limit,Standard Deviation,Mean,Date,Week,Complaints,Department
2,25.916349,Outside,16.756111,42.67246,12.958174,29.714286,21/04/2021,16,51,Ticketing
3,25.916349,Outside,16.756111,42.67246,12.958174,29.714286,22/04/2021,16,48,Ticketing
5,25.916349,Outside,16.756111,42.67246,12.958174,29.714286,24/04/2021,16,57,Ticketing
15,25.916349,Outside,16.756111,42.67246,12.958174,29.714286,20/04/2021,16,12,Airport Experience
16,25.916349,Outside,16.756111,42.67246,12.958174,29.714286,21/04/2021,16,10,Airport Experience
18,25.916349,Outside,16.756111,42.67246,12.958174,29.714286,23/04/2021,16,14,Airport Experience
21,33.242257,Outside,21.188396,54.430652,16.621128,37.809524,26/04/2021,17,14,Ticketing
23,33.242257,Outside,21.188396,54.430652,16.621128,37.809524,28/04/2021,17,57,Ticketing
25,33.242257,Outside,21.188396,54.430652,16.621128,37.809524,30/04/2021,17,56,Ticketing
27,33.242257,Outside,21.188396,54.430652,16.621128,37.809524,02/05/2021,17,59,Ticketing
