# README 1st
Run this notebook cell by cell. The first cell reads the input files. Each of the following sections implements a different way of combining the inputs, so run only the cell that matches what you want to do.

# Input

In [None]:
# Folder that contains the price list CSV files.
# '.' works on local computer when the price list is in the notebook folder
PRICE_LIST_DIR = '.' #@param {type:"raw"}

# Price list file name, looked up inside PRICE_LIST_DIR; keep exactly one line active.
# comment out price list you don't want to use
#PRICE_LIST_FILE = 'price_list_biu.csv' #@param {type:"raw"}
PRICE_LIST_FILE = 'price_list_lmu.csv' #@param {type:"raw"}

# Active input set: INVOICE_NAME and FIXES_FILE are file names inside INVOICE_DIR.
# NOTE(review): INVOICE_DIR is an absolute, machine-specific path; adjust it
# before running the notebook on another computer.
INVOICE_DIR = '/work/data/OpenIRIS/LMU-20200527'
INVOICE_NAME = 'Invoice24.xlsx' 
#FIXES_FILE = 'fixed_Invoice24__price_type_missing.xlsx'
FIXES_FILE = 'LMU_consumables_sales.xlsx'

# Alternative input set, kept for reference; uncomment one FIXES_FILE together
# with INVOICE_DIR / INVOICE_NAME to switch to it.
#INVOICE_DIR = '/work/data/OpenIRIS/BIU-20200430' 
#INVOICE_NAME = 'Invoice16.xlsx' 
#FIXES_FILE = 'fixed_Invoice16__overlapping_bookings.xlsx' 
#FIXES_FILE = 'fixed_Invoice16__group_or_wbs_missing.xlsx'

from datetime import datetime
import pandas as pd
from pathlib import Path
from utils import find_latest_invoice_version, save_invoice_with_timestamp

def _require(condition, message):
    """Raise ``ValueError(message)`` unless *condition* is truthy."""
    if not condition:
        raise ValueError(message)


# Turn the configured strings into Path objects and fail fast, naming the
# setting to correct, as soon as one of them does not exist on disk.
INVOICE_DIR = Path(INVOICE_DIR)
_require(INVOICE_DIR.is_dir(), 'Please check INVOICE_DIR.')
INVOICE_FILE = INVOICE_DIR / INVOICE_NAME
_require(INVOICE_FILE.exists(), 'Please check INVOICE_NAME.')

# From here on INVOICE_FILE is whatever the helper returns for the configured
# invoice (judging by its name, the most recent saved version).
INVOICE_FILE = find_latest_invoice_version(INVOICE_FILE)

# The fixes workbook lives next to the invoice.
FIXES_FILE = INVOICE_DIR / FIXES_FILE
_require(FIXES_FILE.exists(), 'Please check FIXES_FILE.')

# The price list has its own folder setting.
PRICE_LIST_DIR = Path(PRICE_LIST_DIR)
_require(PRICE_LIST_DIR.is_dir(), 'Please check PRICE_LIST_DIR.')
PRICE_LIST_FILE = PRICE_LIST_DIR / PRICE_LIST_FILE
_require(PRICE_LIST_FILE.exists(), 'Please check PRICE_LIST_FILE.')

# Probe the top of the workbook: the header row plus one data row is enough
# to tell whether the file still carries the IRIS invoice summary.
header = pd.read_excel(INVOICE_FILE, nrows=1)

# A 'Created by' column marks the summary block from IRIS; when it is there
# the first two rows are skipped so that the real table header is used.
has_iris_summary = 'Created by' in header.columns
print('input file has IRIS summary, skip it' if has_iris_summary
      else 'no IRIS summary, read entire file')
df = pd.read_excel(INVOICE_FILE, skiprows=[0, 1] if has_iris_summary else None)

df2 = pd.read_excel(FIXES_FILE)

# A fixes file that was saved with its index has it in 'Unnamed: 0';
# restore it so that the rows line up with the original invoice rows.
if 'Unnamed: 0' in df2.columns:
    df2 = df2.set_index('Unnamed: 0')
df2.head(3)


In [None]:
# Display the fixes table (df2) for visual inspection before combining it with the invoice.
df2

# Modifications only (no added rows)

In [None]:
# Overwrite cells of a copy of the invoice with the values from the fixes
# table; DataFrame.update works in place and matches rows by index label:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html?highlight=update#pandas.DataFrame.update
df3 = df.copy()
df3.update(df2)

# Write the result next to the invoice as <invoice>__update__<fixes>.<ext>
output_name = f"{INVOICE_FILE.stem}__update__{FIXES_FILE.stem}{INVOICE_FILE.suffix}"
OUTPUT = INVOICE_DIR / output_name
df3.to_excel(OUTPUT, index=False)
print(OUTPUT)

# Additionally store the result as a timestamped version of the invoice
save_invoice_with_timestamp(df3, INVOICE_FILE)


# Append split bookings
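- df2 contains the parts of split bookings; every part carries the original IRIS charge ID, so a split booking appears as several rows sharing the same ID.
- Rows whose ID is not repeated (i.e. bookings that are not split) are ignored.
- df2 may contain fewer columns than df. Missing columns are taken from df.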


In [None]:
# Build df3: the original invoice in which every split booking is replaced by
# its parts. A booking counts as split when its ID occurs more than once in df2.
df3 = df.copy()

# find rows in df2 with repeated ID (all occurrences, not only the repeats)
ids = df2["ID"]
splits = df2[ids.isin(ids[ids.duplicated()])].copy().reset_index()
# read these columns as datetime
for col in ('Booking start', 'Booking end'):
    splits[col] = pd.to_datetime(splits[col], format='%Y-%m-%d %H:%M')


split_IDs = splits["ID"].values
print("IDs of split lines:")
print(split_IDs)
print()

# Map every split ID to the index label of its (first) row in the original
# data. A missing ID is reported explicitly instead of surfacing as an
# anonymous IndexError.
imap = {}
for s in split_IDs:
    if s in imap:
        # repeated ID: already resolved
        continue
    matches = df3[df3['ID'] == s].index
    if len(matches) == 0:
        raise ValueError("Split ID %s from the fixes file is not present in the invoice." % s)
    imap[s] = int(matches[0])
print("mapping from split line ID to index in original data (df):")
print(imap)
print()

# One original index label per split row, i.e. repeated once per part
idx = [imap[s] for s in split_IDs]
print("index in original data corresponding to splits (with repeats):")
print(idx)
print()

# all columns of split rows from original data, one row per part.
# reset_index() gives splitbase the same 0..n-1 index as splits so that
# update() below pairs the rows by position; it also keeps the original row
# label in an 'index' column, which therefore ends up in the output as well.
splitbase = df3.loc[idx].copy().reset_index()
print("splits (rows, cols):")
print(splits.shape)
print("splitbase (rows, cols):")
print(splitbase.shape)
print()

# if there are columns in df2 that are not present in df, add them (e.g. tmp_rebooked);
# update() only fills columns that already exist in splitbase
for c in df2.columns.values:
    if isinstance(c, str) and c.startswith('tmp_') and c not in df3.columns:
        print('adding column ' + c)
        splitbase[c] = None

# overwrite original with split data
splitbase.update(splits)

# remove unsplit lines from original
df3 = df3[~df3['ID'].isin(split_IDs)]

# append all splits to the original dataframe
# (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement)
df3 = pd.concat([df3, splitbase], ignore_index=True)



# save the modifications
OUTPUT = INVOICE_DIR / (INVOICE_FILE.stem + "__append_splits__" + FIXES_FILE.stem + INVOICE_FILE.suffix)
df3.to_excel(OUTPUT,index=False)
print(OUTPUT)

# save the modifications to a timestamped version of invoice
save_invoice_with_timestamp(df3,INVOICE_FILE)


In [None]:
# Display the rows that were appended to the invoice (original columns overwritten with the split data).
splitbase



In [None]:
# Display the split rows as selected from the fixes table (df2).
splits

# Add product sales

In [None]:
import pandas as pd
import re
from utils import check_totals


# Prepare the product sales table (df2) so that it can be appended to the
# invoice: IRIS column names, cleaned cost center codes, only dated rows that
# are not stock corrections, and a price available for every product.
#
# NOTE: df2 is rebound to the transformed table, so this cell cannot be run a
# second time without re-running the input cell first (the dropped columns
# would be missing).
df3 = df.copy()

prices = pd.read_csv(PRICE_LIST_FILE, quotechar="'")

print('Price types in use:')
print(df2['Price type'].unique())

# rename columns to match IRIS report
df2 = df2.rename(columns={'Item': 'Resource', 'Date': 'Creation date',
                          'User': 'User name', 'WBS': 'Cost center code'})
# edit WBS: strip the literal 'WBS' marker from the code
df2['Cost center code'] = df2['Cost center code'].str.replace('WBS', '', regex=False)

# drop rows with no date
df2 = df2[~df2['Creation date'].isnull()]

# drop rows with stock adjustment
df2 = df2[df2['Stock error fix'] != True]

# these helper columns have no counterpart in the invoice
df2 = df2.drop(columns=['Stock error fix', 'GroupWBS'])

df2['Creation date'] = pd.to_datetime(df2['Creation date'], format='%Y-%m-%d %H:%M')

# Check that prices exist for all products. Every combination of the
# resources and price types occurring in df2 is checked, including
# combinations that never appear together in one row.
# Only a missing price is reported as such; any other problem (e.g. a
# misspelled column in the price list) surfaces with its own error.
for r in df2['Resource'].unique():
    for pt in df2['Price type'].unique():
        matching = prices[(prices['Instrument'] == r) &
                          (prices['Price type'] == pt)]
        for p in ['Prime-time']:
            if p not in matching.columns or matching.empty:
                raise ValueError("Price missing: '%s' / %s / %s" % (r,pt,p))

def get_price(row):
    """Look up the 'Prime-time' price for one sales row.

    The price list ``prices`` is searched for the entry whose 'Instrument'
    equals the row's 'Resource' and whose 'Price type' equals the row's
    'Price type'; the first match is returned.

    Raises:
        IndexError: if the price list has no matching entry.
    """
    same_instrument = prices['Instrument'] == row['Resource']
    same_price_type = prices['Price type'] == row['Price type']
    candidates = prices[same_instrument & same_price_type]
    return candidates['Prime-time'].values[0]

def calculate_charge(row):
    """Return the charge for one sales row: 'Quantity' times 'Price', rounded to 2 decimals."""
    amount = row['Quantity'] * row['Price']
    return round(amount, 2)

# Price every sales row from the price list, then derive its charge
df2['Price'] = df2.apply(get_price, axis=1)
df2['Charge'] = df2.apply(calculate_charge, axis=1)



# Append the sales rows to the invoice and store it as a new timestamped version.
# (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The sales rows keep their own index labels, as before.)
df3 = pd.concat([df3, df2])
output = save_invoice_with_timestamp(df3,INVOICE_FILE)

# remove timestamp to get original file name
stem = re.sub(r'__[0-9]{8}-[0-9]{6}','', output.stem)

# read original header
header = pd.read_excel(output.parent / (stem + '__header.xlsx'))

# fix header total
# NOTE(review): the totals are computed from df, i.e. the invoice *before* the
# sales rows were appended, although the label says 'after_sales'. Presumably
# df3 is what should be passed here; confirm against check_totals in utils.
totals_wbs = check_totals(df,'after_sales',INVOICE_DIR, stem)
header['Total'] = str(totals_wbs) + " EUR"

# save header with timestamp
header.to_excel(output.parent / (output.stem + "__header.xlsx"), index=False)

df3.tail()

In [None]:
# Display the price list that was used to price the product sales.
prices