# Usage on Windows
- Install Anaconda (in Software Center on UH computers).
- Download this notebook and the invoice from Open IRIS in a folder.
- Start Menu -> Anaconda -> Anaconda shell.
- In the shell: "cd" to the folder with the notebook, e.g. "cd Documents\billing_check".
- In the shell: "jupyter notebook". This should open the notebook in a browser.
- Change the invoice file name in the first notebook cell.
- Cell -> Run All.
- Wait for .xlsx files to appear in the folder.



# Connect to Google Drive from Colaboratory

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Price list and invoice
- set invoice file
- set holidays

In [None]:
# '.' works on local computer when the price list is in the notebook folder
PRICE_LIST_DIR = '.' #@param {type:"raw"}

# comment out price list you don't want to use
#PRICE_LIST_FILE = 'price_list_biu.csv' #@param {type:"raw"}
PRICE_LIST_FILE = 'price_list_lmu.csv' #@param {type:"raw"}

# Invoice location: keep exactly one INVOICE_DIR / INVOICE_FILE pair uncommented.
#INVOICE_DIR = '/work/data/OpenIRIS/BIU' #@param {type:"raw"}
#INVOICE_FILE = 'Invoice16.xlsx' #@param {type:"raw"}
INVOICE_DIR = '/work/data/OpenIRIS/LMU-20200424' #@param {type:"raw"}
INVOICE_FILE = 'Invoice24__update__fixed_Invoice24__price_type_missing.xlsx' #@param {type:"raw"}
#INVOICE_FILE = 'Invoice24.xlsx' #@param {type:"raw"}


from pathlib import Path

# Turn the configured strings into Path objects and stop with a ValueError
# as soon as a configured directory or file does not exist.
PRICE_LIST_DIR = Path(PRICE_LIST_DIR)
if not PRICE_LIST_DIR.is_dir():
    raise ValueError('Please check PRICE_LIST_DIR.')
# From here on PRICE_LIST_FILE is the full path of the price list.
PRICE_LIST_FILE = PRICE_LIST_DIR / PRICE_LIST_FILE
if not PRICE_LIST_FILE.exists():
    raise ValueError('Please check PRICE_LIST_FILE.')

INVOICE_DIR = Path(INVOICE_DIR)
if not INVOICE_DIR.is_dir():
    raise ValueError('Please check INVOICE_DIR.')
# From here on INVOICE_FILE is the full path of the invoice workbook.
INVOICE_FILE = INVOICE_DIR / INVOICE_FILE
if not INVOICE_FILE.exists():
    raise ValueError('Please check INVOICE_FILE.')


In [None]:
from datetime import date,datetime,timedelta
import numpy as np
import pandas as pd

# Holidays that fall inside the billing period. They are not derived
# automatically, so this list has to be maintained by hand.
holidays = [
    date(2019, 12, 6),
    date(2019, 12, 24),
    date(2019, 12, 25),
    date(2019, 12, 26),
#    date(2019, 12, 31),
    date(2020, 1, 1),
    date(2020, 1, 6),
]

# Name parts of the invoice file, reused for naming the output files.
basename, ext = INVOICE_FILE.stem, INVOICE_FILE.suffix

# The first sheet row is read as the invoice header; columns that pandas
# labelled 'Unnamed...' are dropped from it.
header = pd.read_excel(INVOICE_FILE, nrows=1)
unnamed_columns = list(header.filter(regex='Unnamed'))
header = header[header.columns.drop(unnamed_columns)]

# Booking rows: the first two sheet rows are skipped.
df = pd.read_excel(INVOICE_FILE, skiprows=[0, 1])

# Column selections used for the exported check files.
summary = [
    'ID', 'User name', 'Resource', 'Booking start', 'Booking end',
    'Price', 'Discount', 'Quantity', 'Price item', 'Price type',
    'Charge', 'Group', 'Cost center name', 'Cost center code',
    'Comments (charge)', 'Price (detailed)',
]
summary_short = [
    'ID', 'User name', 'Resource', 'Booking start', 'Booking end',
    'Quantity', 'Charge', 'Discount', 'Comments (charge)',
    'Group', 'Cost center name', 'Cost center code',
]

# Drop bookings made on the IRIS test instruments.
test_instruments = ['Sheep (TESTING AND DEVELOPMENT PURPOSE', 'Super testers practice instrument']
df = df[~df['Resource'].isin(test_instruments)]

# Staff and test groups: the staff bookings are written to their own files
# first, then all of these groups are removed from the data.
biu_groups = ['BIU staff']
lmu_groups = ['Kimmo Tanhuanpää', 'LMU-staff']
test_groups = ['Group Raimi research inc']
staff_exports = (
    (biu_groups, "__BIU_bookings.xlsx"),
    (lmu_groups, "__LMU_bookings.xlsx"),
)
for staff_groups, file_suffix in staff_exports:
    df[df['Group'].isin(staff_groups)][summary].to_excel(INVOICE_DIR / ("tmp_" + basename + file_suffix), index=True)

exclude_groups = biu_groups + lmu_groups + test_groups
df = df[~df['Group'].isin(exclude_groups)]

# Keep a copy of the remaining report restricted to the summary columns.
df[summary].to_excel(INVOICE_DIR / ("tmp_" + basename + "__summary" + ext), index=True)


In [None]:
# Display the header row read from the invoice.
header

# Manual interventions
This cell saves rows that might need manual editing and test cases that are not expected
to be changed by the automated fixes below.

You can make the manual edits in the shorter .xlsx files produced here, and then merge the changes to the original IRIS report using the notebook "merge_reports.ipynb".

In [None]:
# Column name used to match bookings with price-list rows.
PRICE_TYPE = 'Price type'
# Price-list column names; get_price_item() returns one of these.
PRIME_TIME = 'Prime-time'
OFF_HOURS = 'Off-hours'
NIGHT_TIME = 'Night time'

def save_test_result(filename,dataframe):
    """Report and store one check result.

    Prints ``filename`` followed by the shape of ``dataframe`` and writes
    the frame, index included, as an Excel file named ``filename`` inside
    INVOICE_DIR.
    """
    print(" ".join((filename, str(dataframe.shape))))
    target = INVOICE_DIR / filename
    dataframe.to_excel(target, index=True)

# save rows with price type missing
test = df[df[PRICE_TYPE].isnull()]
save_test_result("test_" + basename + "__price_type_missing.xlsx", test[summary])

# group or WBS missing
test = df[(df['Group'].isnull()) | (df['Cost center code'].isnull())][summary_short]
save_test_result("test_" + basename + '__group_or_wbs_missing.xlsx', test)

# save rows with cancellations
# na=False: a row without a price item counts as "not a cancellation".
# Without it an empty cell puts NaN into the mask and the row selection fails.
test = df[df['Price item'].str.contains("Cancellation", na=False)]
save_test_result("test_" + basename + "__cancellations.xlsx", test[summary_short])

# save rows with discounts
test = df[df['Discount'].notnull()]
save_test_result("test_" + basename + "__discount.xlsx", test[summary_short])



# Read price list

In [None]:
# Price list: one row per instrument and price type, with one column per
# price item (PRIME_TIME, OFF_HOURS, NIGHT_TIME).
prices = pd.read_csv(PRICE_LIST_FILE, quotechar="'",)

print('Price types in use:')
print(df[PRICE_TYPE].unique())

# Check that prices exist for all instruments: every combination of a
# resource and a price type found in the invoice must have a price-list row
# with all three price items.
for r in df['Resource'].unique():
    for pt in df[PRICE_TYPE].unique():
        for p in [PRIME_TIME,OFF_HOURS,NIGHT_TIME]:
            try:
                prices[(prices['Instrument'] == r) & \
                       (prices[PRICE_TYPE] == pt)][p].values[0]
            except (KeyError, IndexError) as err:
                # KeyError: a column is missing; IndexError: no matching row.
                # Anything else (e.g. KeyboardInterrupt) is left to propagate.
                raise ValueError("Price missing: %s / %s / %s" % (r,pt,p)) from err


In [None]:
# Display the first rows of the price list.
prices.head()


# Functions and constants

In [None]:
# read these columns as datetime
df['Booking start'] =  pd.to_datetime(df['Booking start'], format='%Y-%m-%d %H:%M')
df['Booking end'] =  pd.to_datetime(df['Booking end'], format='%Y-%m-%d %H:%M')

# billing period: from the earliest booking start to the latest booking end
period_start = df['Booking start'].min().date()
period_end = df['Booking end'].max().date()

# weekends during billing period
# The last day is included (<=): a booking that ends on a Saturday or Sunday
# closing the period must still be recognised as a weekend booking.
weekends = []
day = period_start
while day <= period_end:
    if day.weekday() >= 5:  # 5 = Saturday, 6 = Sunday
        weekends.append(day)
    day = day + timedelta(days=1)

weekends_and_holidays = sorted(weekends + holidays)

# Instruments whose night price differs from the off-hours price, and
# instruments that have one single price for all three price items.
instruments_with_night_discounts = prices[prices[OFF_HOURS] != prices[NIGHT_TIME]]['Instrument'].values
instruments_with_no_discounts = prices[(prices[PRIME_TIME] == prices[OFF_HOURS]) & \
                                             (prices[OFF_HOURS] == prices[NIGHT_TIME])]['Instrument'].values
#print(instruments_with_night_discounts)
#print(instruments_with_no_discounts)


def next_workday(dt):
    """Return ``dt`` moved forward in whole days to the next working day.

    At least one day is always added; further days are added while the
    resulting date is listed in weekends_and_holidays. The time of day of
    ``dt`` is kept.
    """
    candidate = dt + timedelta(days=1)
    while candidate.date() in weekends_and_holidays:
        candidate = candidate + timedelta(days=1)
    return candidate

# get price type for a booking that has already been split
def get_price_item(row):
    """Return NIGHT_TIME, OFF_HOURS or PRIME_TIME for a booking row.

    Only the start hour and includes_holiday_or_weekend(row) are looked at,
    so the row is expected to lie within a single price period.
    Instruments with a night discount: night before 08:00 and from 22:00,
    off-hours from 08:00 to 09:00 and from 17:00. Other instruments:
    off-hours before 09:00 and from 17:00. Bookings touching a weekend or
    holiday are off-hours unless they are night time.
    """
    start_hour = row['Booking start'].hour
    if row['Resource'] in instruments_with_night_discounts:
        if start_hour < 8 or start_hour >= 22:
            return NIGHT_TIME
        is_off_hours = includes_holiday_or_weekend(row) or start_hour == 8 or start_hour >= 17
    else:
        is_off_hours = includes_holiday_or_weekend(row) or start_hour < 9 or start_hour >= 17
    return OFF_HOURS if is_off_hours else PRIME_TIME
        
def get_price(row):
    """Look up the price for a booking row in the price list.

    The price-list row is selected by instrument and price type, the
    column by get_price_item(row). The first match is returned; an
    IndexError is raised when there is none.
    """
    same_instrument = prices['Instrument'] == row['Resource']
    same_price_type = prices[PRICE_TYPE] == row[PRICE_TYPE]
    candidates = prices[same_instrument & same_price_type]
    return candidates[get_price_item(row)].values[0]

def get_discount_factor(row):
    """Return the factor a charge is multiplied with for the row's discount.

    ``row['Discount']`` is a percentage such as ``'20%'`` (factor 0.8).
    A missing discount gives factor 1. Missing means None, NaN, an empty
    string, or the text 'nan' produced by ``astype(str)``.

    The value is converted with ``str()`` first, so the function works both
    before and after the 'Discount' column has been cast to strings, and
    also for plain numbers. Surrounding whitespace is ignored.

    Raises ValueError when the remaining text is not a number.
    """
    discount = row['Discount']
    if discount is None:
        return 1
    text = str(discount).replace('%', '').strip()
    # str() of float NaN / pandas NA / NaT all denote "no discount".
    if text.lower() in ('', 'nan', 'none', '<na>', 'nat'):
        return 1
    return 1 - float(text)/100

# Discounts are handled as text from here on; a missing value becomes 'nan'.
df['Discount'] = df['Discount'].astype(str)
# Create the column first, then fill it row by row.
df['tmp_discount_factor'] = 1
df['tmp_discount_factor'] = df.apply(get_discount_factor, axis=1)

# splits between prime and off hours (no night discount)
def next_split_2(dt):
    """Return the next prime/off-hours boundary after ``dt``.

    From 17:00 onwards, and at any time on a weekend or holiday, the
    boundary is 09:00 of the next working day. On a working day it is
    09:00 of the same day before 09:00, otherwise 17:00 of the same day.
    """
    def nine_o_clock(day):
        return datetime(day.year, day.month, day.day, 9, 0, 0)

    hour = dt.hour
    # other to prime time
    if hour >= 17:
        return nine_o_clock(next_workday(dt))
    if hour < 17:
        if dt.date() in weekends_and_holidays:
            return nine_o_clock(next_workday(dt))
        if hour < 9:
            return nine_o_clock(dt)
        # prime to other time
        return datetime(dt.year, dt.month, dt.day, 17, 0, 0)
    
# splits between prime, off and night hours
def next_split_3(dt):
    """Return the next prime/off-hours/night boundary after ``dt``.

    Night runs from 22:00 to 08:00. On a working day the boundaries are
    08:00, 09:00, 17:00 and 22:00; on a weekend or holiday the only
    boundaries are 08:00 and 22:00.
    """
    def today_at(hour):
        return datetime(dt.year, dt.month, dt.day, hour, 0, 0)

    start_hour = dt.hour
    # night to other time
    if start_hour >= 22:
        return today_at(8) + timedelta(days=1)
    if start_hour < 8:
        return today_at(8)
    # daytime: next boundary depends on whether the day is a working day
    if start_hour < 17:
        if dt.date() in weekends_and_holidays:
            return today_at(22)
        return today_at(9) if start_hour < 9 else today_at(17)
    # other to night time
    if start_hour < 22:
        return today_at(22)
    
# does a row need prime/off split
def needs_split_2(row):
    """Tell whether a booking crosses a prime/off-hours boundary.

    Always False for instruments with night discounts and for instruments
    without any discount. Otherwise True when the boundary following the
    booking start lies before the booking end.
    """
    not_applicable = np.append(instruments_with_night_discounts, instruments_with_no_discounts)
    if row['Resource'] in not_applicable:
        return False
    return bool(next_split_2(row['Booking start']) < row['Booking end'])

# does a row need prime/off/night split
def needs_split_3(row):
    """Tell whether a booking crosses a prime/off-hours/night boundary.

    Only instruments with night discounts qualify. For those the result is
    True when the boundary following the booking start lies before the
    booking end.
    """
    if row['Resource'] not in instruments_with_night_discounts:
        return False
    return bool(next_split_3(row['Booking start']) < row['Booking end'])

def night_discount_applies(row):
    """Return True when the row's resource is an instrument with a night discount."""
    return bool(row['Resource'] in instruments_with_night_discounts)

def off_hour_discount_applies(row):
    """Return False only for instruments that have the same price at all times."""
    return not (row['Resource'] in instruments_with_no_discounts)
    
def is_night_1(row):
    """Return True when the booking extends past 22:00 of its start day.

    Always False for instruments without a night discount.
    """
    if not night_discount_applies(row):
        return False
    start, end = row['Booking start'], row['Booking end']
    evening_limit = datetime(start.year, start.month, start.day, 22, 0, 0)
    return bool(end > evening_limit)

def is_night_2(row):
    """Return True when the booking covers part of the early-morning night period.

    That is the case when the booking ends on a later calendar date than
    it starts, or when it starts before 08:00. Always False for
    instruments without a night discount.
    """
    if not night_discount_applies(row):
        return False
    dt1 = row['Booking start']
    dt2 = row['Booking end']
    # Compare full dates, not the day of the month: a booking from
    # 31 January to 1 February ends on a later day although 31 > 1.
    return bool(dt1.date() < dt2.date() or dt1.hour < 8)

def includes_holiday_or_weekend(row):
    """Return True when a weekend day or holiday lies between the booking's
    start date and end date, both included."""
    first_day = row['Booking start'].date()
    last_day = row['Booking end'].date()
    return any((first_day <= free_day) and (last_day >= free_day)
               for free_day in weekends_and_holidays)

def _reprice_split_piece(frame, idx):
    """Recompute Quantity, Price item, Price and Charge of row ``idx`` in place."""
    duration = frame.loc[idx]['Booking end'] - frame.loc[idx]['Booking start']
    # total_seconds() keeps whole days; .seconds would drop them.
    frame.at[idx,'Quantity'] = duration.total_seconds() / 3600
    frame.at[idx,'Price item'] = get_price_item(frame.loc[idx])
    frame.at[idx,'Price'] = get_price(frame.loc[idx])
    discount = frame.loc[idx]['tmp_discount_factor']
    frame.at[idx,'Charge'] = frame.at[idx,'Quantity'] * frame.at[idx,'Price'] * discount


def split_3(rows):
    """Split the last row of ``rows`` at every prime/off-hours/night boundary.

    ``rows`` is a DataFrame of booking rows; only its last row is examined.
    While that row extends past the boundary returned by next_split_3 for
    its start, it is cut at the boundary: the first part stays in place,
    the remainder is added as a new last row, and both parts get their
    quantity, price item, price and charge recalculated. The function then
    recurses on the result until the last row fits in one price period.

    Returns a new DataFrame; the argument is not modified. Whenever a split
    happens the index of the result is renumbered from 0.
    """
    # make a copy, just to be sure
    rows = rows.copy()

    rows['Booking start'] =  pd.to_datetime(rows['Booking start'], format='%Y-%m-%d %H:%M')
    rows['Booking end'] =  pd.to_datetime(rows['Booking end'], format='%Y-%m-%d %H:%M')

    # The remainder: starts as a copy of the last row.
    tmp = rows.tail(1).copy()
    tmp['Booking start'] =  pd.to_datetime(tmp['Booking start'], format='%Y-%m-%d %H:%M')
    tmp['Booking end'] =  pd.to_datetime(tmp['Booking end'], format='%Y-%m-%d %H:%M')
    i = tmp.index.values[0]
    next_split = next_split_3(tmp.loc[i]['Booking start'])

    if tmp.loc[i]['Booking end'] <= next_split:
        # The last row lies within one price period: nothing left to split.
        return rows

    # First part: ends at the boundary.
    rows.at[i,'Booking end'] = next_split
    _reprice_split_piece(rows, i)

    # Remainder: starts at the boundary.
    tmp.at[i,'Booking start'] = next_split
    _reprice_split_piece(tmp, i)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    rows = pd.concat([rows, tmp])
    rows.reset_index(inplace=True, drop=True)

    return split_3(rows)
    
def check_totals(dataframe,tag):
    """Print the charge total three ways and write two total sheets.

    The 'Charge' column of ``dataframe`` is converted to numbers in place
    (the frame is not copied), and a conversion failure raises. The sum is
    then printed ungrouped, grouped by group and cost center name, and
    grouped by resource. Both grouped tables, with a total row added, are
    written to INVOICE_DIR with ``tag`` in the file name.

    Returns the total grouped by group and cost center, rounded to two
    decimals, or the string "Total don't match" when it differs from the
    rounded total grouped by resource.
    """
    frame = dataframe
    frame['Charge'] = pd.to_numeric(frame['Charge'], errors='raise')

    def grouped_total(keys, file_label):
        # Sum charges per key, append a total row, save the sheet and
        # return the rounded total.
        table = frame.groupby(keys)['Charge'].sum().reset_index()
        table.loc['Column_Total']= table.sum(numeric_only=True, axis=0)
        table.to_excel(INVOICE_DIR / ("test_ " + basename + file_label + tag + ext), index=False)
        return round(table.loc['Column_Total']['Charge'],2)

    # ungrouped total
    plain_total = frame.sum(numeric_only=True, axis=0)['Charge']
    print("ungrouped: " + str(plain_total))

    # totals by WBS
    total_wbs = grouped_total(['Group','Cost center name'], "__totals_by_group_and_wbs_")
    print("grouped by WBS: " + str(total_wbs))

    # totals by instrument
    total_resource = grouped_total(['Resource'], "__totals_by_resource_")
    print("grouped by resource: " + str(total_resource))

    if total_resource == total_wbs:
        return total_wbs

    print("Totals don't match.")
    return "Total don't match"

# Check totals before any fixes

In [None]:
# Totals of the report as exported, before prices or charges are changed.
check_totals(df,'before_fixes')

# Apply price list
Overwrite IRIS prices with the .csv price list. Recalculate charges assuming that bookings are split (apply price item based on start time). 

In [None]:
# store the original price item, price, quantity and charge in new columns
df['tmp_price_item_iris'] = df['Price item']
df['tmp_price_iris'] = df['Price']
df['tmp_quantity_iris'] = df['Quantity']
df['tmp_charge_iris'] = df['Charge']

# Overwrite price item and price with the values derived from the price list.
df['Price item'] = df.apply(get_price_item, axis=1)
df['Price'] = df.apply(get_price, axis=1)

## Recalculating durations and charges may be useful if merging data with manually added rows.

def calculate_duration(row):
    """Return the booking length in hours, rounded to four decimals."""
    elapsed_seconds = (row['Booking end'] - row['Booking start']).total_seconds()
    return round(elapsed_seconds / 3600, 4)

# Replace the quantity with the duration computed from booking start and end.
df['Quantity'] = df.apply(calculate_duration, axis=1)

def calculate_charge(row):
    """Return quantity times price times discount factor, rounded to two decimals."""
    amount = row['Quantity'] * row['Price'] * row['tmp_discount_factor']
    return round(amount, 2)

# Recompute every charge from quantity, price and discount factor.
df['Charge'] = df.apply(calculate_charge, axis=1)


check_totals(df,'after_recalculate_charges')

# save training bookings
# na=False: rows whose original price item is empty are not trainings.
# Without it an empty cell puts NaN into the mask and the row selection fails.
cols = summary_short.copy()
cols.append('tmp_price_item_iris')
test = df[df['tmp_price_item_iris'].str.contains("Training", na=False)]
save_test_result("test_" + basename + "__trainings.xlsx", test[cols])

# save rows where duration changes
cols = summary_short.copy()
cols.append('tmp_quantity_iris')
test = df[df['Quantity'] != df['tmp_quantity_iris']]
save_test_result("test_" + basename + "__duration_changed.xlsx", test[cols])

# save rows where charge changes
cols = summary_short.copy()
cols.append('tmp_charge_iris')
test = df[df['Charge'] != df['tmp_charge_iris']]
save_test_result("test_" + basename + "__charge_changed.xlsx", test[cols])


# Test splitting

In [None]:
# Set to the 'ID' of a booking to try split_3 on it; -1 skips the test.
SPLIT_TEST_INDEX = -1
#SPLIT_TEST_INDEX = 11849

if SPLIT_TEST_INDEX != -1:
    # test split: look the booking up by the configured ID
    i = df[df['ID'] == SPLIT_TEST_INDEX].index.values[0]
    dup = pd.DataFrame(df.loc[i]).T
    print(dup[summary])
    print()
    # Printed explicitly: a bare expression inside the if block is not displayed.
    print(split_3(dup)[summary])

# Add flag columns to indicate particular situations

In [None]:
# Helper columns, one per row-level check, filled in the order listed.
flag_checks = (
    ('tmp_needs_split_2', needs_split_2),
    ('tmp_needs_split_3', needs_split_3),
    ('tmp_is_night_1', is_night_1),
    ('tmp_is_night_2', is_night_2),
    ('tmp_night_discount_applies', night_discount_applies),
    ('tmp_off_hour_discount_applies', off_hour_discount_applies),
    ('tmp_includes_holiday_or_weekend', includes_holiday_or_weekend),
)
for flag_column, flag_check in flag_checks:
    df[flag_column] = df.apply(flag_check, axis=1)



# Search problematic cases and save results

In [None]:
def run_tests(tag):
    """Save the rows of ``df`` matching each problem pattern to its own file.

    File names start with "test_", the invoice base name and ``tag``. The
    patterns rely on the tmp_ flag columns of ``df``.
    """
    prefix = "test_" + basename + "_" + tag + "_"

    def export(mask, file_suffix):
        # Store the summary columns of the rows selected by the mask.
        save_test_result(prefix + file_suffix, df[mask][summary])

    # missing a prime/off/night split
    export(df['tmp_needs_split_3'], '_needs_split_3.xlsx')

    # missing a prime/off split
    export(df['tmp_needs_split_2'], '_needs_split_2.xlsx')

    # booking includes both night discount periods
    export(df['tmp_is_night_1'] & df['tmp_is_night_2'], '_night1_and_night2.xlsx')

    # both night discounts and a missing split
    export(df['tmp_is_night_1'] & df['tmp_is_night_2'] & df['tmp_needs_split_3'],
           '_night1_and_night2_needs_split3.xlsx')

    # find regular price bookings during holidays
    # NOTE(review): this only matches price items starting with
    # 'Regular usage'; confirm such values are still present when this runs.
    export(df['tmp_includes_holiday_or_weekend'] & df['Price item'].str.startswith('Regular usage') & df['tmp_off_hour_discount_applies'],
           '_regular_price_during_holidays.xlsx')


# Save the problem cases as they are before any automatic fix.
run_tests("_before_fixes")

## Split bookings

In [None]:
# bookings with missing splits
index = df[df['tmp_needs_split_3']].index

df['tmp_remove_split_line'] = False

# Split every flagged booking. The pieces are collected in a list and
# concatenated once; DataFrame.append was removed in pandas 2.0.
split_frames = []
for i in index:
    # create a new DataFrame from the line to be split
    # (taken before the removal mark is set, so the pieces are kept)
    dup = pd.DataFrame(df.loc[i]).T

    # split the line
    split = split_3(dup)

    # mark original line for removal
    df.at[i,'tmp_remove_split_line'] = True

    split_frames.append(split)

# dataframe for split bookings, appended to the original dataframe
if split_frames:
    splits = pd.concat(split_frames, ignore_index=True)
    df = pd.concat([df, splits], ignore_index=True)
else:
    # Nothing to split: only renumber the index, as appending would have done.
    splits = pd.DataFrame()
    df = df.reset_index(drop=True)

# save splits including the original line
df[df['tmp_needs_split_3'] == True] \
    .sort_values(['ID','Booking start','Quantity'], ascending=[True, True, False])[summary] \
    .to_excel(INVOICE_DIR / ("tmp_" + basename + "__fix1_split" + ext), index=True)

# remove original line and save dataframe
df = df[df['tmp_remove_split_line'] == False]
df[summary].to_excel(INVOICE_DIR / ("tmp_" + basename + "__fix1_split_originals_removed" + ext), index=True)


## Fix holidays

In [None]:
# fix holidays: bookings that touch a weekend or holiday, still carry a
# 'Regular usage' price item and belong to an instrument with discounts
holiday_mask = df['tmp_includes_holiday_or_weekend'] & df['Price item'].str.startswith('Regular usage') & df['tmp_off_hour_discount_applies']
idx = df[holiday_mask].index
for i in idx:
    # by eyeballing the list it seems there is no need for splitting, so it's enough to edit the price
    booking = df.loc[i]
    df.at[i,'Price item'] = 'Off-hours holiday fix'
    price_rows = prices[(prices['Instrument'] == booking['Resource']) & \
                        (prices[PRICE_TYPE] == booking[PRICE_TYPE])]
    df.at[i,'Price'] = price_rows[OFF_HOURS].values[0]
    # charge = quantity * new price * discount factor
    df.at[i,'Charge'] = df.at[i,'Quantity'] * df.at[i,'Price'] * booking['tmp_discount_factor']

# save the rows that were changed
df.loc[idx][summary].to_excel(INVOICE_DIR / ("tmp_" + basename + "__fix2_holidays" + ext), index=True)


## Fix night time discounts

In [None]:
# fix night time reservations that didn't need a split but include both discounts
night_mask = df['tmp_is_night_1'] & df['tmp_is_night_2'] & ~df['tmp_needs_split_3']
idx = df[night_mask][summary].index
for i in idx:
    booking = df.loc[i]
    if not (booking['Booking start'].hour >= 22):
        # starts before 22:00: left untouched, only reported
        print("do nothing, this should have been split " + str(booking['ID']))
        continue

    # if booking starts after 22, it's enough to fix price
    print("fix price and charge for " + str(booking['ID']))
    df.at[i,'Price item'] = 'Night price fix'
    price_rows = prices[(prices['Instrument'] == booking['Resource']) & \
                        (prices[PRICE_TYPE] == booking[PRICE_TYPE])]
    df.at[i,'Price'] = price_rows[NIGHT_TIME].values[0]
    df.at[i,'Charge'] = df.at[i,'Quantity'] * df.at[i,'Price'] * booking['tmp_discount_factor']

# save the examined rows, changed or not
df.loc[idx][summary].to_excel(INVOICE_DIR / ("tmp_" + basename + "__fix3_nights_1_and_2" + ext), index=True)


In [None]:
# Flag rows whose 'Price' value is not a real number according to np.isreal,
# then display the flagged rows.
df['tmp_num_issue'] = ~df["Price"].apply(np.isreal)
df[df['tmp_num_issue']==True]

# Find overlapping bookings 

In [None]:
# Names of the helper columns used by the overlap search.
INTERVAL = 'tmp_booking_interval'            # booking period as a pd.Interval
OVERLAPS = 'tmp_overlapping_bookings'        # comma-separated IDs of overlapping bookings
HAS_OVERLAPS = 'tmp_has_overlapping_bookings'  # True for bookings found to overlap

# Start with "no overlaps" for every booking.
df[OVERLAPS] = ''
df[HAS_OVERLAPS] = False

def booking_interval(row):
    """Return the booking period of a row as a pd.Interval from start to end."""
    start, end = row['Booking start'], row['Booking end']
    return pd.Interval(start, end)

def overlaps_another_booking(row, testrow):
    """Return the overlap list of ``row``, extended by ``testrow`` if they clash.

    The ID of ``testrow`` is appended after a comma when both rows use the
    same resource, have different IDs and their booking intervals overlap.
    Otherwise the current overlap list of ``row`` is returned unchanged.
    """
    known = row[OVERLAPS]
    same_resource = row['Resource'] == testrow['Resource']
    other_booking = row['ID'] != testrow['ID']
    periods_clash = row[INTERVAL].overlaps(testrow[INTERVAL])
    if same_resource & other_booking & periods_clash:
        return known + ',' + str(testrow['ID'])
    return known

def find_overlaps(ia):
    """Return the intervals of ``ia`` that overlap their predecessor.

    ``ia`` is an IntervalArray expected to be sorted by start. Each
    interval is compared with the one directly before it, the last one
    included. When it starts before the previous one ends, both are added
    to the result as pd.Interval objects; an interval overlapping both
    neighbours appears twice.

    NOTE: only neighbouring intervals are compared, so an interval that
    overlaps an earlier, non-adjacent one is not reported.
    """
    overlaps = []

    tuples = ia.to_tuples()
    # Pair every interval with its predecessor; zip stops after the final
    # pair, so the last interval is checked too.
    for previous, current in zip(tuples[:-1], tuples[1:]):
        if current[0] < previous[1]:
            overlaps.append(pd.Interval(*previous))
            overlaps.append(pd.Interval(*current))

    return overlaps

# Booking period of every row as an interval.
df[INTERVAL] = df.apply(booking_interval, axis=1)

# Order bookings per resource by start and end, so that
# IntervalArray.is_non_overlapping_monotonic can be used below.
df = df.sort_values(['Resource','Booking start','Booking end'], ascending=[True, True, True])

# Interval arrays per resource; resources with overlaps get their
# overlapping bookings flagged.
intervals = {}
for r in df['Resource'].unique():
    dfr = df[df['Resource'] == r]
    ia = pd.arrays.IntervalArray(dfr[INTERVAL].values)
    intervals[r] = ia

    if ia.is_non_overlapping_monotonic:
        continue

    print(r + " has overlapping bookings")
    overlaps = find_overlaps(ia)

    # flag every booking of this resource whose interval was reported
    idx = dfr[dfr[INTERVAL].isin(overlaps)].index
    df.loc[idx, HAS_OVERLAPS] = True

# save bookings with overlaps
flagged = df[df[HAS_OVERLAPS] == True]
flagged[summary].to_excel(INVOICE_DIR / ("test_" + basename + "__overlapping_bookings.xlsx"), index=True)


### work on a copy containing only bookings with overlaps
dfo = df[df[HAS_OVERLAPS] == True].copy()

# loop over bookings that have overlaps
# NOTE: this extends the shared 'summary' list in place, so OVERLAPS is part
# of every later export and is added again if this cell is run again.
summary.append(OVERLAPS)
for i in dfo.index:
    c = dfo.loc[i]
    #print(c[INTERVAL])
    # find the overlapping bookings: this would take a long time to run on the whole dataframe
    # Each pass compares all rows of dfo with booking c and extends their
    # overlap lists.
    dfo[OVERLAPS] = dfo.apply(overlaps_another_booking, testrow=c, axis=1)
dfo[summary].to_excel(INVOICE_DIR / ("tmp_" + basename + "__dfo.xlsx"), index=True)


### continue on the complete dataframe
# copy overlaps to complete dataframe; pandas aligns the assignment on the
# index labels that dfo kept from df
# NOTE(review): rows that are not in dfo presumably end up with NaN instead of '' - confirm.
df[OVERLAPS] = dfo[OVERLAPS]
df[summary].to_excel(INVOICE_DIR / ("tmp_" + basename + "__dfo2.xlsx"), index=True)


# test only with lauri
#df = df[df['User email'].str.contains('lauri')]

#for i in df.index:
#idx = df[df[HAS_OVERLAPS].index
#for i in idx:
#    c = df.loc[i]
#    print(c[INTERVAL])
#    df[OVERLAPS] = df.apply(overlaps_another_booking, testrow=c, axis=1)

#summary.append(OVERLAPS)
#df[df[OVERLAPS] != ''][summary]
#df[df[OVERLAPS] != ''][summary].to_excel(os.path.join(INVOICE_DIR,"test_" + basename + "__overlapping_bookings2.xlsx"), index=True)


In [None]:
# Display the row most recently passed to split_3.
# NOTE(review): 'dup' is only assigned inside the split loop and the split
# test; if neither has assigned it, this cell fails with a NameError.
dup

## Save test files after fixes

In [None]:
# Re-evaluate the split flag on the current rows before saving the test files again.
df['tmp_needs_split_3'] = df.apply(needs_split_3, axis=1)

# Include the discount factor in the exported columns.
# NOTE: extends the shared 'summary' list in place; running this cell again adds it again.
summary.append('tmp_discount_factor')
run_tests("_after_fixes")

## Check total after fixes

In [None]:
# Totals after all fixes; the result is written into the header when saving.
totals_wbs = check_totals(df,'after_fixes')

## Save results

In [None]:
# drop the helper columns (every column with 'tmp_' in its name)
helper_columns = list(df.filter(regex='tmp_'))
df = df[df.columns.drop(helper_columns)]

# restore the order by original ID; pieces of a split booking follow their start time
df = df.sort_values(['ID','Booking start','Quantity'], ascending=[True, True, False])

# full fixed report as a workbook
fixed_report = INVOICE_DIR / (basename + "_fixed" + ext)
df.to_excel(fixed_report, index=False)

# the header total is replaced by the total computed after the fixes
header['Total'] = str(totals_wbs) + " EUR"

# one .csv file: the header first, the data appended below it
fixed_csv = INVOICE_DIR / (basename + "_fixed_header.csv")
header.to_csv(fixed_csv, index=False)
df.to_csv(fixed_csv, mode='a', index=False)


In [None]:
# Display the header with the updated total.
header
