### Prepping Data Challenge: Departmental December - Finance (week 48)

### Requirements
- Input the data
- Extract each data table within the Excel workbook
- Extract the branch name from the table structure
- Create a row per measure and year
- Remove the word 'Year' from the year values
- Create a True Value (i.e. scale each value by its measure's unit so it has the correct number of zeros: ×1,000 for (k), ×1,000,000 for (m))
- Remove the unit suffix from the measure name (i.e. the (k) or (m), where the name includes one)
- Remove unneeded columns
- Output the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input the data
# Read the workbook's default sheet, drop the rows and columns that are
# entirely empty, and renumber the remaining rows from 0.
with pd.ExcelFile(r"\Dataprep\2021\PD 2021 Wk 48 Input.xlsx") as workbook:
    df = (
        pd.read_excel(workbook)
        .dropna(axis=0, how='all')
        .dropna(axis=1, how='all')
        .reset_index(drop=True)
    )

In [3]:
# Display the sheet after the fully-empty rows and columns have been dropped.
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Lewisham,Year 2020,Year 2021
1,Sales (m),3.2,3.5
2,Profit (k),470,480
3,Number of Staff,15,17
4,Wimbledon,Year 2020,Year 2021
5,Sales (m),4.3,4.1
6,Profit (k),530,520
7,Number of Staff,15,16
8,York,Year 2020,Year 2021
9,Sales (k),2700,3400


In [4]:
#Extract each data table within the Excel workbook
#Extract the branch name from the table structure

def extract_branch_tables(sheet):
    """Split the stacked worksheet into one DataFrame per branch.

    Each table starts with a header row whose first cell is the branch name
    and whose second cell is a 'Year ...' label; its measure rows follow until
    the next header row or the end of the sheet.

    Parameters
    ----------
    sheet : pandas.DataFrame
        The worksheet with fully-empty rows and columns already removed.

    Returns
    -------
    list of pandas.DataFrame
        One frame per branch, in sheet order, with the measure-name column
        renamed to 'Clean Measure Names', the year columns named after the
        header labels, and a trailing 'Branch' column.

    Raises
    ------
    ValueError
        If the sheet contains no header row.
    """
    # Measure rows carry numbers in the second column, so a value that starts
    # with 'Year' identifies a header row.
    second_col = sheet.iloc[:, 1].astype(str)
    header_rows = [pos for pos, label in enumerate(second_col) if label.startswith('Year')]
    if not header_rows:
        raise ValueError("No 'Year ...' header rows found in the sheet")

    # A table runs from just after its header to the next header (or sheet end).
    table_ends = header_rows[1:] + [len(sheet)]
    tables = []
    for start, stop in zip(header_rows, table_ends):
        header = sheet.iloc[start]
        branch = header.iloc[0]
        # rename() returns a new frame, so adding 'Branch' does not touch `sheet`.
        table = sheet.iloc[start + 1:stop].rename(columns=header)
        table['Branch'] = branch
        tables.append(table.rename(columns={branch: 'Clean Measure Names'}))
    return tables


# Later cells refer to the three branch tables by name; the unpacking fails
# loudly if the workbook does not contain exactly three tables.
df1, df2, df3 = extract_branch_tables(df)

In [5]:
# Inspect the first extracted branch table (Lewisham in this workbook).
df1

Unnamed: 0,Clean Measure Names,Year 2020,Year 2021,Branch
1,Sales (m),3.2,3.5,Lewisham
2,Profit (k),470.0,480.0,Lewisham
3,Number of Staff,15.0,17.0,Lewisham


In [6]:
# Inspect the second extracted branch table (Wimbledon in this workbook).
df2

Unnamed: 0,Clean Measure Names,Year 2020,Year 2021,Branch
5,Sales (m),4.3,4.1,Wimbledon
6,Profit (k),530.0,520.0,Wimbledon
7,Number of Staff,15.0,16.0,Wimbledon


In [7]:
# Inspect the third extracted branch table (York in this workbook).
df3

Unnamed: 0,Clean Measure Names,Year 2020,Year 2021,Branch
9,Sales (k),2700.0,3400.0,York
10,Profit (k),275.0,340.0,York
11,Number of Staff,10.0,12.0,York
12,Staff Cost (m),0.4,0.45,York


In [8]:
#Create a row per measure and year
#Remove the word 'Year' from the year values
# Stack the branch tables, unpivot the year columns into rows, then strip the
# 'Year ' prefix from the resulting year labels.
df_new = (
    pd.concat([df1, df2, df3], axis=0)
    .reset_index(drop=True)
    .melt(
        id_vars=['Branch', 'Clean Measure Names'],
        var_name='Recorded Year',
        value_name='True Value',
    )
    .assign(**{'Recorded Year': lambda long: long['Recorded Year'].str.replace('Year ', '')})
)

In [9]:
#Create a True Value (i.e. the correct number of zeros for the measure)
# The unit suffix in the measure name gives the scale: (k) multiplies by 1000,
# (m) by 1000000; names without either suffix are left unscaled.
measure_names = df_new['Clean Measure Names']
unit_multiplier = np.select(
    [
        measure_names.str.contains(r'\(k\)', na=False),
        measure_names.str.contains(r'\(m\)', na=False),
    ],
    [1000, 1000000],
    default=1,
)
# Round before casting: a float product such as 4.1 * 1000000 evaluates to
# 4099999.9999999995, which a bare astype(int) would truncate to 4099999.
df_new['True Value'] = (
    (df_new['True Value'].astype(float) * unit_multiplier).round().astype(int)
)

In [10]:
#Remove the suffix of the measure (i.e. the (k) or (m) if the measure name has the units)
# The pattern matches one whitespace character followed by a single letter in
# parentheses; every match in the name is removed.
unit_suffix_pattern = r"\s\([a-zA-Z]\)"
measure_names = df_new['Clean Measure Names']
df_new['Clean Measure Names'] = measure_names.str.replace(unit_suffix_pattern, "", regex=True)

In [11]:
# Display the reshaped table with scaled values and cleaned measure names.
df_new

Unnamed: 0,Branch,Clean Measure Names,Recorded Year,True Value
0,Lewisham,Sales,2020,3200000
1,Lewisham,Profit,2020,470000
2,Lewisham,Number of Staff,2020,15
3,Wimbledon,Sales,2020,4300000
4,Wimbledon,Profit,2020,530000
5,Wimbledon,Number of Staff,2020,15
6,York,Sales,2020,2700000
7,York,Profit,2020,275000
8,York,Number of Staff,2020,10
9,York,Staff Cost,2020,400000


In [12]:
# Output the data
# Write the final table to CSV; index=False leaves the row index out of the file.
output_path = 'wk48-output.csv'
df_new.to_csv(output_path, index=False)