### Prepping Data Challenge:  Painting Bikes (week 39)

### Requirements
- Input the Data
- Create a Datetime field
- Parse the Bike Type and Batch Status for each batch
- Parse the Actual & Target values for each parameter. 
- Identify what time each of the different process stage's took place. Each process stage is provided with a start time, and there is no overlap between stages. Assume that the final process stage ends when the last update occurs.
- Output the data in a single table.

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Input the data
df = pd.read_csv("wk39-input.csv")

In [3]:
df.head(10)

Unnamed: 0,Batch No.,Data Type,Data Parameter,Data Value,Time,Date
0,7000,Result Data,Bike Type,Mountain,8:05:55,23/09/2021
1,7000,Result Data,Batch Status,1,8:07:57,23/09/2021
2,7000,Process Data,Name of Process Stage,Warming Up,8:10:30,23/09/2021
3,7000,Process Data,Target Temperature,91,8:11:23,23/09/2021
4,7000,Process Data,Actual Temperature,59.2,8:18:11,23/09/2021
5,7000,Process Data,Actual Temperature,72.9,8:28:08,23/09/2021
6,7000,Process Data,Actual Temperature,104,8:32:25,23/09/2021
7,7000,Process Data,Actual Temperature,137.55,8:49:45,23/09/2021
8,7000,Process Data,Actual Temperature,113,8:51:37,23/09/2021
9,7000,Process Data,Name of Process Stage,Shutter Open,8:55:25,23/09/2021


In [4]:
#Create a Datetime field
df['Datetime'] = df['Date']+', '+df['Time']

In [5]:
#Parse the Bike Type and Batch Status for each batch
df['Bike Type'] = np.where(df['Data Parameter'] == 'Bike Type', df['Data Value'], np.nan)
df['Bike Type'] = df.groupby("Batch No.")['Bike Type'].ffill()
df['Batch Status'] = np.where(df['Data Parameter'] == 'Batch Status', df['Data Value'], np.nan)
df['Batch Status'] = df.groupby("Batch No.")['Batch Status'].ffill().bfill()

In [6]:
#Parse the Actual & Target values for each parameter.
df['Name of Process Step'] = np.where(df['Data Parameter'] == 'Name of Process Stage', df['Data Value'], np.nan)
df['Name of Process Step'] = df['Name of Process Step'].ffill().bfill()

In [7]:
df = df[(df['Data Parameter'] != 'Bike Type') & (df['Data Parameter'] != 'Batch Status') & (df['Data Parameter'] !='Name of Process Stage')]

In [8]:
df["A/T"] = df['Data Parameter'].str.extract('([Target,Actual]*)')

In [9]:
df['Data Parameter'] = df['Data Parameter'].str.extract('[Target,Actual]*\s(.*)')

In [10]:
df.head(10)

Unnamed: 0,Batch No.,Data Type,Data Parameter,Data Value,Time,Date,Datetime,Bike Type,Batch Status,Name of Process Step,A/T
3,7000,Process Data,Temperature,91.0,8:11:23,23/09/2021,"23/09/2021, 8:11:23",Mountain,1,Warming Up,Target
4,7000,Process Data,Temperature,59.2,8:18:11,23/09/2021,"23/09/2021, 8:18:11",Mountain,1,Warming Up,Actual
5,7000,Process Data,Temperature,72.9,8:28:08,23/09/2021,"23/09/2021, 8:28:08",Mountain,1,Warming Up,Actual
6,7000,Process Data,Temperature,104.0,8:32:25,23/09/2021,"23/09/2021, 8:32:25",Mountain,1,Warming Up,Actual
7,7000,Process Data,Temperature,137.55,8:49:45,23/09/2021,"23/09/2021, 8:49:45",Mountain,1,Warming Up,Actual
8,7000,Process Data,Temperature,113.0,8:51:37,23/09/2021,"23/09/2021, 8:51:37",Mountain,1,Warming Up,Actual
10,7000,Process Data,Current,7.104,8:57:00,23/09/2021,"23/09/2021, 8:57:00",Mountain,1,Shutter Open,Target
11,7000,Process Data,Current,6.396,9:06:48,23/09/2021,"23/09/2021, 9:06:48",Mountain,1,Shutter Open,Actual
12,7000,Process Data,Current,9.875,9:15:29,23/09/2021,"23/09/2021, 9:15:29",Mountain,1,Shutter Open,Actual
13,7000,Process Data,Current,7.44,9:15:58,23/09/2021,"23/09/2021, 9:15:58",Mountain,1,Shutter Open,Actual


In [11]:
df['Data Value'] = df['Data Value'].astype(float)

In [12]:
#Identify what time each of the different process stage's took place. 
#Each process stage is provided with a start time, and there is no overlap between stages. 
#Assume that the final process stage ends when the last update occurs.
df['A/T value'] = df.groupby(["Batch No.",'Data Type','Data Parameter',"A/T",'Datetime'])['Data Value'].transform('sum')

In [13]:
df['Target'] = np.where(df["A/T"] == 'Target', df['A/T value'], np.nan)
df['Actual'] = np.where(df["A/T"] == 'Actual', df['A/T value'], np.nan)

In [14]:
output = df[["Batch No.",'Name of Process Step','Bike Type','Batch Status','Datetime','Data Parameter','Target','Actual']]

In [15]:
output.head(10)

Unnamed: 0,Batch No.,Name of Process Step,Bike Type,Batch Status,Datetime,Data Parameter,Target,Actual
3,7000,Warming Up,Mountain,1,"23/09/2021, 8:11:23",Temperature,91.0,
4,7000,Warming Up,Mountain,1,"23/09/2021, 8:18:11",Temperature,,59.2
5,7000,Warming Up,Mountain,1,"23/09/2021, 8:28:08",Temperature,,72.9
6,7000,Warming Up,Mountain,1,"23/09/2021, 8:32:25",Temperature,,104.0
7,7000,Warming Up,Mountain,1,"23/09/2021, 8:49:45",Temperature,,137.55
8,7000,Warming Up,Mountain,1,"23/09/2021, 8:51:37",Temperature,,113.0
10,7000,Shutter Open,Mountain,1,"23/09/2021, 8:57:00",Current,7.104,
11,7000,Shutter Open,Mountain,1,"23/09/2021, 9:06:48",Current,,6.396
12,7000,Shutter Open,Mountain,1,"23/09/2021, 9:15:29",Current,,9.875
13,7000,Shutter Open,Mountain,1,"23/09/2021, 9:15:58",Current,,7.44


In [16]:
#output the data
output.to_csv('wk39-output.csv', index=False)