### Preppin' Data Challenge: Departmental December - IT (2021 Week 51)

### Requirements
- Input the Data
- Split out the store name from the OrderID
- Turn the Return State field into a binary Returned field
- Create a Sales field
- Create 3 dimension tables for Store, Customer and Product
     - When assigning IDs, these should be created using the dimension and minimum order date fields so that the IDs do not change when later orders are placed
     - For the Customer dimension table, we want to include additional fields detailing their total number of orders and the % of products they have returned
- Replace the dimensions with their IDs in the original dataset to create the fact table
- Output the fact and dimension tables

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input the data
# Order Date is parsed day-first. OrderID and Unit Price are renamed with an "_in"
# suffix so that the cleaned columns derived from them later can take the plain names.
df = (
    pd.read_csv(
        r"\Dataprep\2021\2021W51 Input.csv",
        parse_dates=['Order Date'],
        dayfirst=True,
    )
    .rename(columns={'OrderID' : 'OrderID_in', 'Unit Price' : 'Unit Price_in'})
)

In [3]:
# preview the first rows of the freshly loaded input
df.head()

Unnamed: 0,OrderID_in,Customer,Order Date,Return State,Category,Sub-Category,Product Name,Unit Price_in,Quantity
0,Edinburgh-1347,Mr Schneider,2021-01-24,,Furniture,Bookcases,Bookcase,£60.07,7
1,Newcastle-1924,Mrs Balk,2021-01-15,,Furniture,Bookcases,Bookcase,£60.07,7
2,London-2097,Mrs Foster,2021-01-07,,Furniture,Bookcases,Bookcase,£60.07,10
3,London-1704,Mr Badders,2021-01-14,,Furniture,Chairs,Armless Task Chair,£37.75,11
4,Bristol-1911,Mr Norvell,2021-01-23,Return Processed,Furniture,Chairs,Armless Task Chair,£37.75,4


In [4]:
# split out the store name from the OrderID
# The raw ID looks like "<store>-<number>": group 0 is the non-digit store prefix,
# group 1 the digits after the hyphen (kept as text).
order_id_parts = df['OrderID_in'].str.extract(r'(\D+)-(\d+)', expand=True)
df['Store'] = order_id_parts[0]
df['OrderID'] = order_id_parts[1]

In [5]:
# turn the Return State field into a binary Returned field
# any non-missing Return State counts as returned (1); missing means not returned (0)
df['Returned'] = df['Return State'].notna().astype(int)

In [6]:
# create a Sales field
# strip everything except digits, '.' and '-' (e.g. the currency symbol) before converting
unit_price_text = df['Unit Price_in'].str.replace(r'[^\d.\-]', '', regex=True)
df['Unit Price'] = unit_price_text.astype(float)
# sales value of the order line = unit price x quantity
df['Sales'] = df['Unit Price'].mul(df['Quantity'])

In [7]:
# create the Store dimension table
# One row per store with its earliest order date. StoreID is a 1-based sequence
# following first-order date, with ties broken by case-insensitive store name.
dfs = (
    df.groupby('Store')['Order Date']
      .min()
      .rename('First Order')
      .reset_index()
      .sort_values(by=['First Order', 'Store'],
                   key=lambda col: col.str.lower() if col.dtype == 'O' else col)
      .assign(StoreID=lambda stores: range(1, len(stores) + 1))
)

In [8]:
# preview the first rows of the Store dimension
dfs.head()

Unnamed: 0,Store,First Order,StoreID
0,Birmingham,2021-01-01,1
4,Manchester,2021-01-01,2
5,Newcastle,2021-01-01,3
1,Bristol,2021-01-02,4
2,Edinburgh,2021-01-02,5


In [9]:
# create the Customer dimension table
# One row per customer. CustomerID follows first-order date (ties broken by
# case-insensitive name), per the requirement that IDs stay fixed when later orders arrive.
dfc = (
    df.groupby('Customer')
      .agg(
          Returned=('Returned', 'sum'),
          Order_Lines=('OrderID', 'count'),
          # Distinct orders are counted on the full "Store-Number" ID: once the store
          # prefix is split off, the bare number may repeat across stores.
          Number_of_Orders=('OrderID_in', 'nunique'),
          First_Order=('Order Date', 'min'),
      )
      .reset_index()
      .sort_values(by=['First_Order', 'Customer'],
                   key=lambda x: x.str.lower() if x.dtype == 'O' else x)
)
# named aggregation needs identifier-style names; swap underscores for spaces afterwards
dfc.columns = [c.replace('_', ' ') for c in dfc.columns]
# fraction of the customer's order lines that were returned, rounded to 2 decimals
dfc['Return %'] = (dfc['Returned'] / dfc['Order Lines']).round(2)
dfc['CustomerID'] = range(1, len(dfc) + 1)

In [10]:
# preview the first rows of the Customer dimension
dfc.head()

Unnamed: 0,Customer,Returned,Order Lines,Number of Orders,First Order,Return %,CustomerID
11,Mr Barnes,4,11,4,2021-01-01,0.36,1
238,Mr Wiediger,0,2,1,2021-01-01,0.0,2
346,Mrs Philippe,3,9,4,2021-01-01,0.33,3
42,Mr Campbell,0,11,4,2021-01-02,0.0,4
173,Mr Norvell,6,15,4,2021-01-02,0.4,5


In [11]:
# create the Product dimension table
# One row per (Category, Sub-Category, Product Name) with its mean unit price and the
# date it was first sold. ProductID is a 1-based sequence following first-sold date,
# with ties broken by case-insensitive product name.
dfp = (
    df.groupby(['Category', 'Sub-Category', 'Product Name'])
      .agg(Unit_Price=('Unit Price', 'mean'), First_Sold=('Order Date', 'min'))
      .reset_index()
      .rename(columns=lambda name: name.replace('_', ' '))
      .sort_values(by=['First Sold', 'Product Name'],
                   key=lambda col: col.str.lower() if col.dtype == 'O' else col)
      .assign(ProductID=lambda products: range(1, len(products) + 1))
)

In [12]:
# preview the first rows of the Product dimension
dfp.head()

Unnamed: 0,Category,Sub-Category,Product Name,Unit Price,First Sold,ProductID
19,Office Supplies,Art,Post-its,3.33,2021-01-01,1
43,Office Supplies,Storage,Stacking Storage Drawers,43.26,2021-01-01,2
35,Office Supplies,Paper,Xerox 1979,6.45,2021-01-01,3
38,Office Supplies,Paper,Xerox 208,7.14,2021-01-01,4
7,Furniture,Furnishings,Desk Lamp Bulbs,10.0,2021-01-02,5


In [13]:
# replace the dimensions with their IDs in the original dataset to create the fact table
# - Any ID columns left over from a previous run of this cell are dropped first, so
#   re-running does not produce StoreID_x / StoreID_y style duplicates.
# - The product lookup joins on the full key the Product dimension was grouped by;
#   joining on Product Name alone would duplicate order lines whenever the same name
#   appears under more than one Category / Sub-Category.
# - validate='many_to_one' makes pandas raise if a dimension key is ever not unique,
#   instead of silently multiplying fact rows.
df = (
    df.drop(columns=['StoreID', 'CustomerID', 'ProductID'], errors='ignore')
      .merge(dfs[['StoreID', 'Store']], on='Store', how='left', validate='many_to_one')
      .merge(dfc[['CustomerID', 'Customer']], on='Customer', how='left', validate='many_to_one')
      .merge(dfp[['ProductID', 'Category', 'Sub-Category', 'Product Name']],
             on=['Category', 'Sub-Category', 'Product Name'], how='left', validate='many_to_one')
)

In [14]:
# preview the first ten rows now that StoreID, CustomerID and ProductID are attached
df.head(10)

Unnamed: 0,OrderID_in,Customer,Order Date,Return State,Category,Sub-Category,Product Name,Unit Price_in,Quantity,Store,OrderID,Returned,Unit Price,Sales,StoreID,CustomerID,ProductID
0,Edinburgh-1347,Mr Schneider,2021-01-24,,Furniture,Bookcases,Bookcase,£60.07,7,Edinburgh,1347,0,60.07,420.49,5,61,14
1,Newcastle-1924,Mrs Balk,2021-01-15,,Furniture,Bookcases,Bookcase,£60.07,7,Newcastle,1924,0,60.07,420.49,3,33,14
2,London-2097,Mrs Foster,2021-01-07,,Furniture,Bookcases,Bookcase,£60.07,10,London,2097,0,60.07,600.7,6,11,14
3,London-1704,Mr Badders,2021-01-14,,Furniture,Chairs,Armless Task Chair,£37.75,11,London,1704,0,37.75,415.25,6,22,31
4,Bristol-1911,Mr Norvell,2021-01-23,Return Processed,Furniture,Chairs,Armless Task Chair,£37.75,4,Bristol,1911,1,37.75,151.0,4,5,31
5,Edinburgh-1965,Mr Farhat,2021-01-16,,Furniture,Chairs,Armless Task Chair,£37.75,3,Edinburgh,1965,0,37.75,113.25,5,36,31
6,London-1259,Mr Bensley,2021-01-20,,Furniture,Chairs,Leather Highback Executive Chair,£96.82,5,London,1259,0,96.82,484.1,6,41,45
7,York-1105,Mr Sunley,2021-01-23,,Furniture,Chairs,Task/Swivel Chairs,£43.29,7,York,1105,0,43.29,303.03,7,60,29
8,London-1141,Mr Crebassa,2021-01-30,Return Processed,Furniture,Chairs,Task/Swivel Chairs,£43.29,7,London,1141,1,43.29,303.03,6,79,29
9,Edinburgh-1984,Mrs Etezadi,2021-01-13,,Furniture,Chairs,Task/Swivel Chairs,£43.29,3,Edinburgh,1984,0,43.29,129.87,5,21,29


In [15]:
#output the data
# Fact table: the descriptive dimension columns are replaced by their IDs, so only the
# keys and the order-level measures are written.
df[['StoreID', 'CustomerID', 'OrderID', 'Order Date', 'ProductID',
    'Returned', 'Quantity', 'Sales']].to_csv('wk51-output.csv', index=False)

# Dimension tables: one file each, ID column first.
dfs[['StoreID', 'Store', 'First Order']].to_csv('wk51-output-store.csv', index=False)
dfc[['CustomerID', 'Customer', 'Return %', 'Number of Orders',
     'First Order']].to_csv('wk51-output-customer.csv', index=False)
dfp[['ProductID', 'Category', 'Sub-Category', 'Product Name', 'Unit Price',
     'First Sold']].to_csv('wk51-output-product.csv', index=False)