# README 1st
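Run this notebook one cell at a time. The first cell reads the input files (and writes a timestamped backup of the invoice). Each of the following cells implements a different way of combining the inputs, so run only the cell that matches what you want to do. Note that each of these cells overwrites the original invoice file with its result.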

# Input

In [None]:
# Input selection: only one data set (directory, invoice file, fixes file) is
# used per run. Alternatives are kept commented out so that the active values
# are unambiguous; to switch, comment the active lines and uncomment another
# set.

# data set LMU-20200427 (inactive)
#INVOICE_DIR = '/work/data/OpenIRIS/LMU-20200427'
#INVOICE_FILE = 'Invoice24.xlsx'
#FIXES_FILE = 'fixed_Invoice24__price_type_missing.xlsx'

# data set BIU-20200430 (active)
INVOICE_DIR = '/work/data/OpenIRIS/BIU-20200430'
INVOICE_FILE = 'Invoice16.xlsx'
FIXES_FILE = 'fixed_Invoice16__overlapping_bookings.xlsx'
#FIXES_FILE = 'fixed_Invoice16__group_or_wbs_missing.xlsx'

from datetime import datetime
import pandas as pd
from pathlib import Path

def _require(condition, message):
    """Raise ValueError with *message* unless *condition* is true."""
    if not condition:
        raise ValueError(message)


# Turn the configured strings into Path objects and validate each one as soon
# as it has been resolved: the directory first, then the two files inside it.
INVOICE_DIR = Path(INVOICE_DIR)
_require(INVOICE_DIR.is_dir(), 'Please check INVOICE_DIR.')

INVOICE_FILE = INVOICE_DIR.joinpath(INVOICE_FILE)
_require(INVOICE_FILE.exists(), 'Please check INVOICE_FILE.')

FIXES_FILE = INVOICE_DIR.joinpath(FIXES_FILE)
_require(FIXES_FILE.exists(), 'Please check FIXES_FILE.')

# Peek at the top of the workbook (header row plus one data row) to find out
# whether the IRIS invoice summary sits in front of the actual table.
header = pd.read_excel(INVOICE_FILE, nrows=1)
has_iris_summary = 'Created by' in header.columns

if has_iris_summary:
    print('input file has IRIS summary, skip it')
    rows_to_skip = [0, 1]
else:
    print('no IRIS summary, read entire file')
    rows_to_skip = None  # pandas default: skip nothing

# Read the invoice table itself, leaving out the summary rows if present.
df = pd.read_excel(INVOICE_FILE, skiprows=rows_to_skip)

# Keep a timestamped copy of the data as it was read, next to the invoice
# file, before any later cell overwrites the original.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
print(timestamp)
backup_name = f"{INVOICE_FILE.stem}__{timestamp}.xlsx"
df.to_excel(INVOICE_DIR.joinpath(backup_name), index=False)

# Load the fixes and index them by the 'Unnamed: 0' column, which holds the
# original index of each row.
df2 = pd.read_excel(FIXES_FILE).set_index('Unnamed: 0')

# show the first rows as the cell output for a quick visual check
df2.head(3)


# Modifications only (no added rows)

In [None]:
# Overlay the fixes (df2) onto a copy of the invoice data, in place.
# Reference for DataFrame.update:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html?highlight=update#pandas.DataFrame.update
df3 = df.copy()
df3.update(df2)

# Write the result to a new file named after both inputs ...
update_name = f"{INVOICE_FILE.stem}__update__{FIXES_FILE.stem}{INVOICE_FILE.suffix}"
OUTPUT = INVOICE_DIR.joinpath(update_name)
df3.to_excel(OUTPUT, index=False)
print(OUTPUT)

# ... and replace the original invoice file with it as well.
df3.to_excel(INVOICE_FILE, index=False)


# Append split bookings
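- df2 (the fixes file) contains the parts of split bookings; every part carries the original IRIS charge ID, so a split booking appears as several rows with the same ID.
- Rows whose ID is not repeated (i.e. bookings that were not split) are ignored.
- df2 may contain fewer columns than df (the original invoice data). Missing columns are taken from df.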


In [None]:
# Work on a copy so that df keeps the data as read from the invoice file.
df3 = df.copy()

# Rows of df2 whose ID occurs more than once are the parts of split bookings.
# reset_index() gives them a fresh 0..n-1 index (the previous index is kept as
# a column) so that they line up row by row with splitbase further down.
ids = df2["ID"]
splits = df2[ids.duplicated(keep=False)].copy().reset_index()
# read these columns as datetime
splits['Booking start'] = pd.to_datetime(splits['Booking start'], format='%Y-%m-%d %H:%M')
splits['Booking end'] = pd.to_datetime(splits['Booking end'], format='%Y-%m-%d %H:%M')

split_IDs = splits["ID"].values
print("IDs of split lines:")
print(split_IDs)
print()

# Map every split ID to the index label of its first row in the original
# data. Each ID is looked up only once, and an ID that does not exist in the
# original data is reported explicitly instead of failing with a bare
# IndexError.
imap = {}
for s in split_IDs:
    if s in imap:
        continue
    matches = df3.index[df3['ID'] == s]
    if len(matches) == 0:
        raise ValueError(
            'ID ' + repr(s) + ' from FIXES_FILE was not found in INVOICE_FILE.'
        )
    imap[s] = int(matches[0])
print("mapping from split line ID to index in original data (df):")
print(imap)
print()

# one entry per split row, i.e. an original row is repeated once per part
idx = [imap[s] for s in split_IDs]
print("index in original data corresponding to splits (with repeats):")
print(idx)
print()

# All columns of the split rows, taken from the original data.
# drop=True: the old index must not become an 'index' column, because it
# would end up in the saved invoice file and make a later run of this cell
# fail ("cannot insert index, already exists").
splitbase = df3.loc[idx].copy().reset_index(drop=True)
print("splits (rows, cols):")
print(splits.shape)
print("splitbase (rows, cols):")
print(splitbase.shape)
print()

# if there are columns in df2 that are not present in df, add them (e.g. tmp_rebooked)
for c in df2.columns.values:
    if isinstance(c, str) and c.startswith('tmp_') and c not in df3.columns.values:
        print('adding column ' + c)
        splitbase[c] = None

# Overwrite the original values with the split data. update() aligns on the
# 0..n-1 index and only touches columns that exist in splitbase; missing
# (NaN) values in splits leave the original value in place.
splitbase.update(splits)

# remove unsplit lines from original
df3 = df3[~df3['ID'].isin(split_IDs)]

# Append all splits to the original dataframe. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df3 = pd.concat([df3, splitbase], ignore_index=True)

# save the modifications
OUTPUT = INVOICE_DIR / (INVOICE_FILE.stem + "__append_splits__" + FIXES_FILE.stem + INVOICE_FILE.suffix)
df3.to_excel(OUTPUT, index=False)
print(OUTPUT)

# overwrite original
df3.to_excel(INVOICE_FILE, index=False)



In [None]:
# Show splitbase (the rows taken from the original data for the split
# bookings, after being updated with the split values) as the cell output.
splitbase



In [None]:
# Show splits (the rows of the fixes file whose ID is repeated) as the cell
# output.
splits