### Prepping Data Challenge: Book Shop Data Modelling (week 46)

### Requirements
- Input the data
- Union all the Sales data together to form one row per item in a sale
  - This is the granularity of the data set throughout the whole challenge (56,350 rows)
- Join all other data sets in the workbook on to this data
  - Never let the number of rows change
    - You may need to disregard incomplete records or summarise useful data into a metric instead of including all the detail
- Remove any duplicate fields
- Output your resulting single table

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Input the data
#Union all the Sales data together to form one row per item in a sale

with pd.ExcelFile(r"\Dataprep\2021\Bookshop.xlsx") as xl:
    # Split the workbook's sheets into sales sheets and everything else
    lookup_sheets = [name for name in xl.sheet_names if 'Sales' not in name]
    sales_sheets = [name for name in xl.sheet_names if 'Sales' in name]

    # d: one DataFrame per non-sales sheet, keyed by sheet name
    d = {name: pd.read_excel(xl, name) for name in lookup_sheets}

    # df_s: all sales sheets stacked, each row tagged with the sheet it came from
    sales_frames = []
    for name in sales_sheets:
        sales_frames.append(pd.read_excel(xl, name).assign(sheet_name=name))
    df_s = pd.concat(sales_frames)

In [3]:
# This is the granularity of the data set throughout the whole challenge (56,350 rows)
# Display the unioned sales table (one row per item in a sale); pandas shows a
# truncated head/tail view. The row labels restart for each sheet because the
# sheets were concatenated without ignore_index.
df_s

Unnamed: 0,Sale Date,ISBN,Discount,ItemID,OrderID,sheet_name
0,2193-01-02,989-28-3705-007-2,,107020-91-8011,107020-1,Sales Q1
1,2193-01-02,989-28-79-11297-4,,107020-9-3293,107020-10,Sales Q1
2,2193-01-02,989-28-79-11297-4,,107020-91-4622,107020-11,Sales Q1
3,2193-01-02,989-28-79-18127-7,,107020-38-4663,107020-12,Sales Q1
4,2193-01-02,989-28-79-82197-5,0.15,107020-1-1485,107020-13,Sales Q1
...,...,...,...,...,...,...
13088,2193-12-31,989-28-229-5891-8,,107383-57-1559,107383-49786,Sales Q4
13089,2193-12-31,989-28-3705-222-9,,107383-73-7579,107383-49786,Sales Q4
13090,2193-12-31,989-28-3705-007-2,,107383-54-3838,107383-49787,Sales Q4
13091,2193-12-31,989-28-229-9769-6,,107383-75-7720,107383-49787,Sales Q4


In [4]:

# Check that every lookup table has a unique join key, so that left-joining it
# onto the sales rows cannot multiply rows.
join_keys = {
    'Edition.ISBN': d['Edition']['ISBN'],
    'Book.BookID': d['Book']['BookID'],
    'Author.AuthID': d['Author']['AuthID'],
    'Publisher.PubID': d['Publisher']['PubID'],
    # Info has no single key column: the key is BookID1 followed by BookID2 as text
    'Info.BookID1+BookID2': d['Info']['BookID1'] + d['Info']['BookID2'].astype(str),
    'Series.SeriesID': d['Series']['SeriesID'],
}

# count() and nunique() both ignore missing values, so this compares the number
# of non-null keys with the number of distinct non-null keys.
key_is_unique = pd.Series(
    {name: key.count() == key.nunique() for name, key in join_keys.items()}
)

# Fail loudly, naming the offending tables, instead of silently discarding
# all but the last comparison.
assert key_is_unique.all(), (
    f"Duplicate join keys found in: {key_is_unique[~key_is_unique].index.tolist()}"
)

# Overall result: True only when every key above is unique
key_is_unique.all()

np.True_

In [5]:
#Join all other data sets in the workbook on to this data
# Tables that can hold several rows per join key (Award, Checkouts, Ratings)
# are summarised to one row per key first, so the left joins below cannot
# add rows to the sales data.

# Number of Award rows per Title
awards_per_title = (
    d['Award']
    .groupby('Title')['Year Won']
    .size()
    .reset_index()
    .rename(columns={'Year Won' : 'Number of Awards Won (avg only)'})
)

# Distinct checkout months and total checkouts per BookID
checkouts_per_book = (
    d['Checkouts']
    .groupby('BookID')
    .agg(Number_of_Months_Checked_Out=('CheckoutMonth', 'nunique'),
         Total_Checkouts=('Number of Checkouts', 'sum'))
    .reset_index()
)

# Info carries its key in two parts; rebuild BookID as BookID1 + BookID2 (as text).
# The stripped comment goes into a new 'Staff_Comment' column; the unstripped
# 'Staff Comment' column is dropped after the joins.
info_with_key = d['Info'].assign(
    BookID=d['Info']['BookID1'] + d['Info']['BookID2'].astype(str),
    Staff_Comment=d['Info']['Staff Comment'].str.strip(),
)

# Mean rating, distinct reviewers and review count per BookID
ratings_per_book = (
    d['Ratings']
    .groupby('BookID')
    .agg(Average_Rating=('Rating', 'mean'),
         Number_of_Reviewers=('ReviewerID', 'nunique'),
         Number_of_Reviews=('ReviewID', 'count'))
    .reset_index()
)

# Left joins only, in dependency order: each table is joined after the table
# that supplies its key column.
df = (
    df_s
    .merge(d['Edition'], on='ISBN', how='left')
    .merge(d['Book'], on='BookID', how='left')
    .merge(d['Author'], on='AuthID', how='left')
    .merge(d['Publisher'], on='PubID', how='left')
    .merge(awards_per_title, on='Title', how='left')
    .merge(checkouts_per_book, on='BookID', how='left')
    .merge(info_with_key, on='BookID', how='left')
    .merge(d['Series'], on='SeriesID', how='left')
    .merge(ratings_per_book, on='BookID', how='left')
    # Drop helper columns and fields that duplicate information kept elsewhere
    .drop(columns=['sheet_name', 'BookID1', 'BookID2', 'Staff Comment'])
    .rename(columns={'BookID' : 'Book ID'})
)

# Requirement: the number of rows must never change from the unioned sales data.
assert len(df) == len(df_s), (
    f"Joins changed the row count: {len(df_s)} sales rows became {len(df)}"
)

In [6]:
# Tidy the field names: swap every underscore in a column label for a space
df.columns = df.columns.map(lambda label: label.replace('_', ' '))

In [7]:
# Preview the first rows of the joined table
df.head()

Unnamed: 0,Sale Date,ISBN,Discount,ItemID,OrderID,Book ID,Format,PubID,Publication Date,Pages,...,Genre,SeriesID,Volume Number,Staff Comment,Series Name,Planned Volumes,Book Tour Events,Average Rating,Number of Reviewers,Number of Reviews
0,2193-01-02,989-28-3705-007-2,,107020-91-8011,107020-1,HP265,Board book,CHP,2188-06-03,16,...,Childrens,,,Wilberforce isn't sure about bedtime in a new ...,,,,4.543651,1764,1764
1,2193-01-02,989-28-79-11297-4,,107020-9-3293,107020-10,TP887,Trade paperback,ESP,2192-08-25,1296,...,Fiction,,,,,,,3.195033,1369,1369
2,2193-01-02,989-28-79-11297-4,,107020-91-4622,107020-11,TP887,Trade paperback,ESP,2192-08-25,1296,...,Fiction,,,,,,,3.195033,1369,1369
3,2193-01-02,989-28-79-18127-7,,107020-38-4663,107020-12,AY135,Hardcover,ESP,2179-04-24,704,...,Fiction,,,,,,,3.860825,970,970
4,2193-01-02,989-28-79-82197-5,0.15,107020-1-1485,107020-13,TC188,Trade paperback,ESP,2186-12-05,469,...,Fiction,,,,,,,4.210452,708,708


In [8]:
#output the data
# Write the joined table to a CSV file (relative path, so it lands in the
# current working directory); index=False leaves the row index out of the file.
df.to_csv('wk46-output.csv', index=False)