# Check and fix Open IRIS invoice

## Usage on Windows
- Install Anaconda (in Software Center on UH computers).
- Download this notebook and the invoice from Open IRIS in a folder.
- Start Menu -> Anaconda -> Anaconda shell.
- In the shell: "cd" to the folder with the notebook, e.g. "cd Documents\billing_check".
- In the shell: "jupyter notebook". This chould open the notebook in a browser.
- Change the invoice file name in the first notebook cell.
- Cell -> Run All.
- Wait for .xslx files to appear in the folder.

## Input
- set invoice file
- set holidays

In [1]:
from datetime import date,datetime,timedelta
import numpy as np
import os
import pandas as pd

# invoice from IRIS
filename = 'Invoice24.xlsx'

# holidays during billing period (have to list manually)
holidays = [\
    date(2019,12,6), \
    date(2019,12,24), \
    date(2019,12,25), \
    date(2019,12,26), \
#    date(2019,12,31), \
    date(2020,1,1), \
    date(2020,1,6) \
           ]

basename,ext = os.path.splitext(filename)

header = pd.read_excel(filename, nrows=1)
df = pd.read_excel(filename, skiprows=[0,1])

# exclude these groups
exclude_groups = ['Kimmo Tanhuanpää', 'LMU-staff','Group Raimi research inc']
df = df[~df['Group'].isin(exclude_groups)]

# essential columns shown in testing
summary = ['ID','User name','Resource','Booking start','Booking end','Price','Discount','Quantity','Price item','Charge','Group','Cost center name','Cost center code']

# store original report with essential columns
df[summary].to_excel("tmp_" + basename + "_summary" + ext, index=True) 


## Set price list
- This could be read from .csv as well.

In [2]:
PRIME_TIME = 'Prime-time'
OFF_HOURS = 'Off-hours'
NIGHT_TIME = 'Night time'

prices = pd.DataFrame ([\
    ('Leica SP8 STED',22,17,13), \
    ('Leica SP8 upright',20,16,12), \
    ('Leica TCS SP5 MP SMD FLIM',22,17,13), \
    ('Leica TCS SP5 II HCS-A',22,17,13), \
    ('Leica DM5000B',3,2,2), \
    ('Zeiss LSM700',18,14,10), \
    ('Zeiss Z.1 LightSheet Lasers',18,14,10), \
    ('Zeiss Z.1 LightSheet Data management',2,2,1), \
    ('3I Marianas base with lasers',18,14,10), \
    ('3I Marianas no lasers',9,7,5), \
    ('Aurox Clarity',9,7,5), \
    ('GE Deltavision Ultra',9,7,5), \
    ('MolecularDevices Nano',9,6,6), \
    ('Cell-IQ',2,2,2), \
    ('Cell-IQ fluorescence',2,2,2), \
    ('3D Workstation',5,3,3), \
    ('LS Workstation',5,3,3), \
    ('HELMI 3D VM',5,3,3), \
    ('HCA Workstation',3,1,1), \
    ('2D Workstation',3,1,1), \
    ('ONI Nanoimager S',18,14,14), \
    ('Sheep (TESTING AND DEVELOPMENT PURPOSE',0,0,0) \
                     ], \
    columns = ('Instrument',PRIME_TIME,OFF_HOURS,NIGHT_TIME)
    )

# Check that prices exist for all instruments
for r in df['Resource'].unique():
    for p in [PRIME_TIME,OFF_HOURS,NIGHT_TIME]:
        try:
            price = prices[prices['Instrument'] == r][p].values[0]
        except:
            raise ValueError("Price missing: %s %s" % (r,p))


## Functions and constants

In [3]:
# read these columns as datetime
df['Booking start'] =  pd.to_datetime(df['Booking start'], format='%Y-%m-%d %H:%M')
df['Booking end'] =  pd.to_datetime(df['Booking end'], format='%Y-%m-%d %H:%M')

# billing period
period_start = df['Booking start'].min().date()
period_end = df['Booking end'].max().date()

# weekends during billing period
weekends = []
test = period_start
while test < period_end:
    if test.weekday() >= 5:
        weekends.append(test)
    test = test + timedelta(days=1)
weekends

weekends_and_holidays = sorted(weekends + holidays)

instruments_with_night_discounts = prices[prices[OFF_HOURS] != prices[NIGHT_TIME]]['Instrument'].values
instruments_with_no_discounts = prices[(prices[PRIME_TIME] == prices[OFF_HOURS]) & \
                                             (prices[OFF_HOURS] == prices[NIGHT_TIME])]['Instrument'].values
#print(instruments_with_night_discounts)
#print(instruments_with_no_discounts)


def next_workday(dt):
    #print(weekends_and_holidays)
    test = dt
    while True:
        test = test + timedelta(days=1)
        #print(test.date())
        if test.date() not in weekends_and_holidays:
            return test

# get price type for a booking that has already been split
def get_price_item(row):
    if row['Resource'] in instruments_with_night_discounts:
        if row['Booking start'].hour < 8 or row['Booking start'].hour >= 22:
            return NIGHT_TIME
        if includes_holiday_or_weekend(row) or row['Booking start'].hour == 8 or row['Booking start'].hour >= 17:
            return OFF_HOURS
        else:
            return PRIME_TIME
    else:
        if includes_holiday_or_weekend(row) or row['Booking start'].hour < 9 or row['Booking start'].hour >= 17:
            return OFF_HOURS
        else:
            return PRIME_TIME
        
def get_price(row):
    return prices[prices['Instrument'] == row['Resource']][get_price_item(row)].values[0]

def get_discount_factor(row):
    discount = row['Discount']
    discount = discount.replace('%','')
    if(discount == 'nan'):
        return 1
    else:
        return 1 - float(discount)/100

df['Discount'] = df['Discount'].astype(str)
df['tmp_discount_factor'] = 1
df['tmp_discount_factor'] = df.apply(get_discount_factor, axis=1)

# splits between prime and off hours (no night discount)
def next_split_2(dt):
    # other to prime time
    if dt.hour >= 17:
        nwd = next_workday(dt)
        return datetime(nwd.year,nwd.month,nwd.day,9,0,0)
    if dt.hour < 9:
        if dt.date() not in weekends_and_holidays:
            return datetime(dt.year,dt.month,dt.day,9,0,0)
        else:
            nwd = next_workday(dt)
            return datetime(nwd.year,nwd.month,nwd.day,9,0,0) 
    # prime to other time
    if dt.hour < 17:
        if dt.date() not in weekends_and_holidays:
            return datetime(dt.year,dt.month,dt.day,17,0,0)
        else:
            nwd = next_workday(dt)
            return datetime(nwd.year,nwd.month,nwd.day,9,0,0) 
    
# splits between prime, off and night hours 
def next_split_3(dt):
    # night to other time
    if dt.hour >= 22:
        return datetime(dt.year,dt.month,dt.day,8,0,0) + timedelta(days=1)
    if dt.hour < 8:
        return datetime(dt.year,dt.month,dt.day,8,0,0)
    # other to prime time (or night time during holidays)
    if dt.hour < 9:
        if (dt.date() in weekends_and_holidays):
            return datetime(dt.year,dt.month,dt.day,22,0,0)
        else:
            return datetime(dt.year,dt.month,dt.day,9,0,0)
    # prime to other time (or night time during holidays)
    if dt.hour < 17:
        if (dt.date() in weekends_and_holidays):
            return datetime(dt.year,dt.month,dt.day,22,0,0)
        else:
            return datetime(dt.year,dt.month,dt.day,17,0,0)
    # other to night time
    if dt.hour < 22:
        return datetime(dt.year,dt.month,dt.day,22,0,0)
    
# does a row need prime/off split
def needs_split_2(row):
    if row['Resource'] in np.append(instruments_with_night_discounts,instruments_with_no_discounts):
        return False
    if next_split_2(row['Booking start']) < row['Booking end']:
        #print('start ' + str(row['Booking start']))
        #print('next ' + str(next_split_2(row['Booking start'])))
        #print('end ' + str(row['Booking end']))
        #print("yes")
        return True
    else:
        return False

# does a row need prime/off/night split
def needs_split_3(row):
    if not row['Resource'] in instruments_with_night_discounts:
        return False
    if next_split_3(row['Booking start']) < row['Booking end']:
        #print('start ' + str(row['Booking start']))
        #print('next ' + str(next_split_3(row['Booking start'])))
        #print('end ' + str(row['Booking end']))
        #print("yes")
        return True
    else:
        return False

def night_discount_applies(row):
    if row['Resource'] in instruments_with_night_discounts:
        return True
    else:
        return False

def off_hour_discount_applies(row):
    if row['Resource'] in instruments_with_no_discounts:
        return False
    else:
        return True
    
def is_night_1(row):
    if not night_discount_applies(row):
        return False
    #print(row.head())
    dt1 = row['Booking start']
    dt2 = row['Booking end']
    if dt2 > datetime(dt1.year, dt1.month, dt1.day, 22, 0, 0):
        return True
    else:
        return False

def is_night_2(row):
    if not night_discount_applies(row):
        return False
    #print(row.head())
    dt1 = row['Booking start']
    dt2 = row['Booking end']
    if (dt1.day < dt2.day) or (dt1.hour < 8):
        return True
    else:
        return False

def includes_holiday_or_weekend(row):
    d1 = row['Booking start'].date()
    d2 = row['Booking end'].date()

    for dt in weekends_and_holidays:
        if (d1 <= dt) and (d2 >= dt):
            return True
        
    return False

def split_3(rows):
    _summary = ['ID','Booking start','Booking end','Price']

    # make a copy, just to be sure
    rows = rows.copy()
    #print("rows:")
    #print(rows[_summary])
    
    rows['Booking start'] =  pd.to_datetime(rows['Booking start'], format='%Y-%m-%d %H:%M')
    rows['Booking end'] =  pd.to_datetime(rows['Booking end'], format='%Y-%m-%d %H:%M')

    tmp = rows.tail(1).copy()
    #print("tmp:")
    #print(tmp[_summary])
    tmp['Booking start'] =  pd.to_datetime(tmp['Booking start'], format='%Y-%m-%d %H:%M')
    tmp['Booking end'] =  pd.to_datetime(tmp['Booking end'], format='%Y-%m-%d %H:%M')
    i = tmp.index.values[0]
    #print(i)
    next_split = next_split_3(tmp.loc[i]['Booking start'])
    
    if tmp.loc[i]['Booking end'] <= next_split:
        #print("return split")
        return rows
    
    else:
        rows.at[i,'Booking end'] = next_split
        delta = rows.loc[i]['Booking end'] - rows.loc[i]['Booking start']
        rows.at[i,'Quantity'] = delta.seconds / 3600
        rows.at[i,'Price item'] = get_price_item(rows.loc[i])
        rows.at[i,'Price'] = get_price(rows.loc[i])
        discount = rows.loc[i]['tmp_discount_factor']
        #print(discount)
        rows.at[i,'Charge'] = rows.at[i,'Quantity'] * rows.at[i,'Price'] * discount
        
        tmp.at[i,'Booking start'] = next_split
        delta = tmp.loc[i]['Booking end'] - tmp.loc[i]['Booking start']
        tmp.at[i,'Quantity'] = delta.seconds / 3600
        tmp.at[i,'Price item'] = get_price_item(tmp.loc[i])
        tmp.at[i,'Price'] = get_price(tmp.loc[i])
        discount = tmp.loc[i]['tmp_discount_factor']
        #print(discount)
        tmp.at[i,'Charge'] = tmp.at[i,'Quantity'] * tmp.at[i,'Price'] * discount
        #print("tmp2:")
        #print(tmp[summary])
    
        rows = rows.append(tmp)
        rows.reset_index(inplace=True, drop=True)

        return split_3(rows)
    
def check_totals(dataframe,postfix):
    _df = dataframe
    #print(_df.sum(numeric_only=True, axis=0))
    #print(_df.head()['Price'])
    
    _df['Charge'] = pd.to_numeric(_df['Charge'], errors='raise')
    
    # ungrouped total
    print("ungrouped: " + str(_df.sum(numeric_only=True, axis=0)['Charge']))

    # totals by WBS
    tmp = _df.groupby(['Group','Cost center name'])['Charge'].sum().reset_index()
    tmp.loc['Column_Total']= tmp.sum(numeric_only=True, axis=0)
    tmp.to_excel("test_ " + basename + "_by_group_and_wbs_" + postfix + ext, index=False)
    total_wbs = tmp.loc['Column_Total']['Charge']
    print("grouped by WBS: " + str(total_wbs))

    # totals by instrument
    tmp = _df.groupby(['Resource'])['Charge'].sum().reset_index()
    tmp.loc['Column_Total']= tmp.sum(numeric_only=True, axis=0)
    tmp.to_excel("test_ " + basename + "_by_resource_" + postfix + ext, index=False)
    total_resource = tmp.loc['Column_Total']['Charge']
    print("grouped by resource: " + str(total_resource))

    if total_resource != total_wbs:
        print("Totals don't match.")

## Check totals before any fixes

In [4]:
check_totals(df,'before_fixes')

ungrouped: 45577.66000000005
grouped by WBS: 45577.66
grouped by resource: 45577.66000000001
Totals don't match.


## Test splitting

In [5]:
# test split
i = df[df['ID'] == 11849].index.values[0]
#i = df[df['ID'] == 12954].index.values[0]
dup = pd.DataFrame(df.loc[i]).T
#dup = pd.DataFrame(df[df['ID'] == 11849]).T
print(dup[summary])
print()
dup.index.values[0]
split_3(dup)[summary]

       ID   User name           Resource        Booking start  \
21  11849  Lan, Qiang  Leica SP8 upright  2019-11-19 17:00:00   

            Booking end Price Discount Quantity                  Price item  \
21  2019-11-20 00:15:00    16      nan     7.25  Off-hours price (per hour)   

   Charge          Group Cost center name Cost center code  
21    116  Marja Mikkola    Marja Mikkola         13074212  



Unnamed: 0,ID,User name,Resource,Booking start,Booking end,Price,Discount,Quantity,Price item,Charge,Group,Cost center name,Cost center code
0,11849,"Lan, Qiang",Leica SP8 upright,2019-11-19 17:00:00,2019-11-19 22:00:00,16,,5.0,Off-hours,80,Marja Mikkola,Marja Mikkola,13074212
1,11849,"Lan, Qiang",Leica SP8 upright,2019-11-19 22:00:00,2019-11-20 00:15:00,12,,2.25,Night time,27,Marja Mikkola,Marja Mikkola,13074212


## Add flag columns to indicate particular situations

In [6]:
# add columns that help to find particular situations
df['tmp_needs_split_2'] = df.apply(needs_split_2, axis=1)
df['tmp_needs_split_3'] = df.apply(needs_split_3, axis=1)
df['tmp_is_night_1'] = df.apply(is_night_1, axis=1)
df['tmp_is_night_2'] = df.apply(is_night_2, axis=1)
df['tmp_night_discount_applies'] = df.apply(night_discount_applies, axis=1)
df['tmp_off_hour_discount_applies'] = df.apply(off_hour_discount_applies, axis=1)
df['tmp_includes_holiday_or_weekend'] = df.apply(includes_holiday_or_weekend, axis=1)



## Search problematic cases and save results

In [7]:
def save_test_result(filename,dataframe):
    print(filename + " " + str(dataframe.shape))
    dataframe.to_excel(filename, index=True)

def run_tests(prefix):
    # missing a prime/off/night split
    test = df[df['tmp_needs_split_3']][summary]
    save_test_result(prefix + '_needs_split_3.xlsx', test)

    # missing a prime/off split
    test = df[df['tmp_needs_split_2']][summary]
    save_test_result(prefix + '_needs_split_2.xlsx', test)

    # booking includes both night discount periods
    test = df[df['tmp_is_night_1'] & df['tmp_is_night_2']][summary]
    save_test_result(prefix + '_night1_and_night2.xlsx', test)

    # both night discounts and a missing split
    test = df[df['tmp_is_night_1'] & df['tmp_is_night_2'] & df['tmp_needs_split_3']][summary]
    save_test_result(prefix + '_night1_and_night2_needs_split3.xlsx', test)

    # find regular price bookings during holidays
    test = df[df['tmp_includes_holiday_or_weekend'] & df['Price item'].str.startswith('Regular usage') & df['tmp_off_hour_discount_applies']][summary]
    save_test_result(prefix + '_regular_price_during_holidays.xlsx', test)

    # group or WBS missing
    test = df[(df['Group'] == '') | (df['Cost center name'] == '')][summary]
    save_test_result(prefix + '_group_or_wbs_missing.xlsx', test)

    # discounts
    test = df[df['Discount'] != 'nan'][summary]
    save_test_result(prefix + '_discounts.xlsx', test)

run_tests("test_before_fixes_")

test_before_fixes__needs_split_3.xlsx (54, 13)
test_before_fixes__needs_split_2.xlsx (0, 13)
test_before_fixes__night1_and_night2.xlsx (15, 13)
test_before_fixes__night1_and_night2_needs_split3.xlsx (11, 13)
test_before_fixes__regular_price_during_holidays.xlsx (14, 13)
test_before_fixes__group_or_wbs_missing.xlsx (0, 13)
test_before_fixes__discounts.xlsx (10, 13)


## Split bookings

In [8]:
# bookings with missing splits
index = df[df['tmp_needs_split_3']].index
#print(index)

df['tmp_remove_split_line'] = False

# dataframe for split bookings
splits = pd.DataFrame()
for i in index:
    #print(i)

    y1 = df.loc[i]['Booking start'].year
    m1 = df.loc[i]['Booking start'].month
    d1 = df.loc[i]['Booking start'].day

    #print(df.loc[i]['ID']) 
    # create a new DataFrame from the line to be split
    dup = pd.DataFrame(df.loc[i]).T
    
    # split the line
    split = split_3(dup)
    #print(split[summary])
    #print(split.shape)
    
    # mark original line for removal
    df.at[i,'tmp_remove_split_line'] = True
    
    # append the split line to the dataframe with all splits 
    splits = splits.append(split, ignore_index=True)
    
# append all splits to the original dataframe
df = df.append(splits, ignore_index=True)

# save splits including the original line 
df[df['tmp_needs_split_3'] == True] \
    .sort_values(['ID','Booking start','Quantity'], ascending=[True, True, False])[summary].to_excel("tmp_" + basename + "_fix1_split" + ext, index=True) 

# remove original line and save dataframe
df = df[df['tmp_remove_split_line'] == False]
df[summary].to_excel("tmp_" + basename + "_fix1_split_originals_removed" + ext, index=True) 


## Fix holidays

In [9]:
# fix holidays
idx = df[df['tmp_includes_holiday_or_weekend'] & df['Price item'].str.startswith('Regular usage') & df['tmp_off_hour_discount_applies']].index
for i in idx:
    #print(df.loc[i]['ID']) 
    # by eyeballing the list it seems there is no need for splitting, so it's enough to edit the price
    df.at[i,'Price item'] = 'Off-hours holiday fix'
    #print(df.loc[i][summary].T)
    #print(prices[prices['Instrument'] == df.loc[i]['Resource']])
    #print(prices[prices['Instrument'] == df.loc[i]['Resource']]['Off-hours'])
    df.at[i,'Price'] = prices[prices['Instrument'] == df.loc[i]['Resource']][OFF_HOURS].values[0]
    df.at[i,'Charge'] = df.at[i,'Quantity'] * df.at[i,'Price'] * df.loc[i]['tmp_discount_factor']
    
df.loc[idx][summary].to_excel("tmp_" + basename + "_fix2_holidays" + ext, index=True) 


## Fix night time discounts

In [10]:
# fix night time reservations that didn't need a split but include both discounts
idx = df[df['tmp_is_night_1'] & df['tmp_is_night_2']][summary].index
for i in idx:
    # if booking starts after 22, it's enough to fix price
    if(df.loc[i]['Booking start'].hour >= 22):
        print("fix price and charge for " + str(df.loc[i]['ID']))
        df.at[i,'Price item'] = 'Night price fix'
        df.at[i,'Price'] = prices[prices['Instrument'] == df.loc[i]['Resource']][NIGHT_TIME].values[0]
        df.at[i,'Charge'] = df.at[i,'Quantity'] * df.at[i,'Price'] * df.loc[i]['tmp_discount_factor']
    else:
        print("do nothing, this should have been split " + str(df.loc[i]['ID']))
        
df.loc[idx][summary].to_excel("tmp_" + basename + "_fix3_nights_1_and_2" + ext, index=True) 


fix price and charge for 12021
fix price and charge for 12105
fix price and charge for 13046
fix price and charge for 13622
do nothing, this should have been split 11849
fix price and charge for 11849
do nothing, this should have been split 12290
fix price and charge for 12290
do nothing, this should have been split 12290
do nothing, this should have been split 14254
fix price and charge for 14254
do nothing, this should have been split 12659
fix price and charge for 12659
do nothing, this should have been split 12659
do nothing, this should have been split 12952
fix price and charge for 12952
do nothing, this should have been split 12952
do nothing, this should have been split 12954
fix price and charge for 12954
do nothing, this should have been split 12954
fix price and charge for 12954
do nothing, this should have been split 12954
fix price and charge for 12954
do nothing, this should have been split 12954
do nothing, this should have been split 14108
fix price and charge for 14108

In [11]:
df['tmp_num_issue'] = ~df["Price"].apply(np.isreal)
df[df['tmp_num_issue']==True]

Unnamed: 0,ID,Creation date,Description,Provider,Booking start,Booking end,User email,User name,Group,Group head email(s),...,tmp_discount_factor,tmp_needs_split_2,tmp_needs_split_3,tmp_is_night_1,tmp_is_night_2,tmp_night_discount_applies,tmp_off_hour_discount_applies,tmp_includes_holiday_or_weekend,tmp_remove_split_line,tmp_num_issue


## Save test files after fixes

In [12]:
summary.append('tmp_discount_factor')
run_tests("test_after_fixes_")

test_after_fixes__needs_split_3.xlsx (123, 14)
test_after_fixes__needs_split_2.xlsx (0, 14)
test_after_fixes__night1_and_night2.xlsx (41, 14)
test_after_fixes__night1_and_night2_needs_split3.xlsx (37, 14)
test_after_fixes__regular_price_during_holidays.xlsx (0, 14)
test_after_fixes__group_or_wbs_missing.xlsx (0, 14)
test_after_fixes__discounts.xlsx (10, 14)


## Save results

In [13]:
# remove tmp_ columns
df = df[df.columns.drop(list(df.filter(regex='tmp_')))]

# save full fixed version
df.to_excel(basename + "_fixed" + ext, index=False)

## Check total after fixes

In [14]:
check_totals(df,'after_fixes')

ungrouped: 45035.91663333333
grouped by WBS: 45035.916633333334
grouped by resource: 45035.916633333334
