In [None]:
import pandas as pd
from pathlib import Path

# Folder holding the invoice attachments that have already been sent.
ERITTELYT1 = Path('/work/data/Laskutuksen tilastointia/laskutus/laskutus 201125/erittelyt')
# Folder holding the invoice attachments the sent ones are compared with.
ERITTELYT2 = Path('/work/data/OpenIRIS/LMU-20210113/erittelyt')
# Folder holding the attachments (next billing cycle) the diff is prepended to.
ERITTELYT3 = Path('/work/data/OpenIRIS/LMU-20210113/erittelyt')

# Every input folder has to exist; stop at the first one that does not.
for _folder, _complaint in (
    (ERITTELYT1, 'Please check ERITTELYT1.'),
    (ERITTELYT2, 'Please check ERITTELYT2.'),
    (ERITTELYT3, 'Please check ERITTELYT3.'),
):
    if not _folder.is_dir():
        raise ValueError(_complaint)
del _folder, _complaint

# Output folders sit next to their input folder and carry a name suffix:
# the per-invoice differences go to '<ERITTELYT2>_diff' ...
ERITTELYT_DIFF = ERITTELYT2.with_name(ERITTELYT2.name + '_diff')
ERITTELYT_DIFF.mkdir(exist_ok=True)
# ... and the attachments with the differences prepended to '<ERITTELYT3>_diff_added'.
ERITTELYT_DIFF_ADDED = ERITTELYT3.with_name(ERITTELYT3.name + '_diff_added')
ERITTELYT_DIFF_ADDED.mkdir(exist_ok=True)

In [None]:
def create_diff(df1, df2):
    """Split the rows of two frames into those unique to each side.

    Rows are matched on the 'Resource/Product' and 'Booking start' columns
    with an outer merge. Columns of ``df1`` whose name also exists in ``df2``
    get a trailing '_' in the merged frame; columns of ``df2`` keep their name.

    Returns a tuple ``(left_only, right_only)``:

    * ``left_only``  - merged rows found only in ``df1`` (all merged columns,
      including the ``_merge`` indicator). ``df2`` is expected to contain every
      row of ``df1``, so anything here signals a problem.
    * ``right_only`` - merged rows found only in ``df2``, reduced to the
      columns of ``df2`` whose name does not end in '_'. These are the
      expected missing lines.
    """
    match_keys = ['Resource/Product', 'Booking start']
    merged = df1.merge(
        df2,
        how='outer',
        on=match_keys,
        suffixes=['_', ''],
        indicator=True,
    )
    origin = merged['_merge']

    # Rows matched on both sides are of no interest and are simply not selected.
    only_in_df1 = merged[origin == 'left_only']

    # For the rows unique to df2, keep just the un-suffixed df2 columns.
    wanted = [name for name in df2.columns if name[-1] != '_']
    only_in_df2 = merged[origin == 'right_only'][wanted]

    return (only_in_df1, only_in_df2)
    

In [None]:
def _charge_sum(frame):
    """Return the sum of frame's 'Charge' column, or 0.0 if it has no such column."""
    return frame['Charge'].sum() if 'Charge' in frame.columns else 0.0


# Per-file frames collected for the aggregate workbooks written after the loop.
origs = []   # booking lines of the already sent attachments (ERITTELYT1)
comps = []   # booking lines of the attachments compared against (ERITTELYT2)
diffs = []   # lines found only in the compared attachment
lefts = []   # lines found only in the already sent attachment
totals = []  # lines without a resource name (the charge sum lines)
# Charge of the lines that exist only in the already sent attachments.
left_only_charge = 0.0

# sorted() makes the processing order, and with it the row order of the
# aggregate workbooks, reproducible; Path.glob gives no ordering guarantee.
for f in sorted(ERITTELYT1.glob("*.xlsx")):
    print(str(f))

    # The value at row label 9 of column 'Unnamed: 3' in the first ten rows
    # is taken as the invoice's summary total.
    summary = pd.read_excel(f, nrows=10)
    summary_total = summary['Unnamed: 3'][9]

    # The line table is read after skipping the 13 leading rows.
    df1 = pd.read_excel(f, skiprows=13)

    # Start from an empty frame: without this, a missing counterpart in
    # ERITTELYT2 would be compared against the frame left over from the
    # previous iteration (an unrelated invoice).
    df2 = pd.DataFrame(columns=['Booking start','Resource/Product','User name'])
    try:
        df2 = pd.read_excel(ERITTELYT2 / f.name, skiprows=13)
    except FileNotFoundError:
        print("WARNING: file " + str(ERITTELYT2 / f.name) + " not found.")
    except Exception as exc:
        # Still best effort, but a read failure is no longer reported as
        # "not found" and KeyboardInterrupt/SystemExit are no longer swallowed.
        print("WARNING: file " + str(ERITTELYT2 / f.name) + " could not be read: " + repr(exc))

    # remove lines without resource name to get rid of the charge sum field
    total = df1[df1['Resource/Product'].isnull()]
    df1 = df1[~df1['Resource/Product'].isnull()]
    df2 = df2[~df2['Resource/Product'].isnull()]

    # Compare the summary total with the charge column sum; differences of up
    # to 0.5 are tolerated. '%.2f' keeps the decimals that '%d' would truncate,
    # so the two printed numbers can no longer look identical.
    charge_total = df1.Charge.sum()
    if abs(charge_total - summary_total) > 0.5:
        print('INFO: summary total %.2f does not match charge column sum %.2f' % (summary_total, charge_total))

    (left_only, right_only) = create_diff(df1,df2)
    if left_only.shape[0] > 0:
        print('WARNING: file ' + str(f) +  ' contains ' + str(left_only.shape[0]) + ' lines missing in df2.')
        print(left_only[['Booking start','Resource/Product','User name_']])

    # create_diff suffixes df1's columns with '_' whenever df2 has a column of
    # the same name, so the charge of the original lines is in 'Charge_' then
    # (the plain 'Charge' column is df2's and is empty for these rows). With
    # the placeholder frame there is no clash and the name stays 'Charge'.
    left_charge_col = 'Charge_' if 'Charge_' in left_only.columns else 'Charge'
    left_only_charge += left_only[left_charge_col].sum()

    # save diff as .xlsx
    if right_only.shape[0] > 0:
        print('INFO: Saving diff (' + str(right_only.shape[0]) + ' lines) in ' + str(ERITTELYT_DIFF / f.name))
        right_only.to_excel(ERITTELYT_DIFF / f.name, index=False)

    origs.append(df1)
    comps.append(df2)
    diffs.append(right_only)
    lefts.append(left_only)
    totals.append(total)

# pd.concat refuses an empty list with a generic message; say what is wrong.
if not origs:
    raise ValueError('No .xlsx files found in ' + str(ERITTELYT1))

total_orig = pd.concat(origs)
total_orig.to_excel(ERITTELYT_DIFF / 'total_original_files.xlsx', index=False)
total_comp = pd.concat(comps)
total_comp.to_excel(ERITTELYT_DIFF / 'total_compared_files.xlsx', index=False)
total_diff = pd.concat(diffs)
total_diff.to_excel(ERITTELYT_DIFF / 'total_only_in_compared_files.xlsx', index=False)
total_left = pd.concat(lefts)
total_left.to_excel(ERITTELYT_DIFF / 'total_only_in_original_files.xlsx', index=False)
total_totals = pd.concat(totals)
total_totals.to_excel(ERITTELYT_DIFF / 'total_charge_sum_lines_in_original_files.xlsx', index=False)

print()
print('Original total (invoiced):')
print(str(total_orig.shape[0]) + ' lines')
print(str(total_orig.Charge.sum()) + ' EUR')
print()
print('Original compared files:')
print(str(total_comp.shape[0]) + ' lines')
# The compared/diff frames have no 'Charge' column when every counterpart
# was missing, hence the guarded sum.
print(str(_charge_sum(total_comp)) + ' EUR')
print()
print('Missing from original:')
print(str(total_diff.shape[0]) + ' lines')
print(str(_charge_sum(total_diff)) + ' EUR')
print()
print('Only in original:')
print(str(total_left.shape[0]) + ' lines')
# Summed per file inside the loop from the original side's charge column.
print(str(left_only_charge) + ' EUR')
print()


In [None]:
# Load the workbook 'Invoice28_fixed_new_summary.xlsx' from the parent
# directory of ERITTELYT2.
ns = pd.read_excel(ERITTELYT2 / '..' / 'Invoice28_fixed_new_summary.xlsx')

In [None]:
# Compare all compared-file lines with the new summary:
# lo = lines only in total_comp, ro = lines only in ns.
(lo,ro) = create_diff(total_comp,ns)

In [None]:
# Show the lines that are only in the new summary.
ro

In [None]:
# Sum of the Charge column over the lines that are only in the new summary.
ro.Charge.sum()

In [None]:
# Sum of the Charge column over all lines of the compared files.
total_comp.Charge.sum()

In [None]:
# Charge of the compared files plus the charge of the lines found only in
# the new summary.
ro.Charge.sum() + total_comp.Charge.sum()

In [None]:
# Read the first ten rows of one already sent attachment and display them.
pl = pd.read_excel(ERITTELYT1 / 'Pekka Lappalainen 4705074.xlsx', nrows=10)
pl

In [None]:
# Value at row label 9 of column 'Unnamed: 3' - the same position the
# comparison loop reads as the summary total.
pl['Unnamed: 3'][9]

In [None]:
# Aggregate workbooks that the comparison step writes into ERITTELYT_DIFF.
# They are not per-invoice diffs and have no counterpart in ERITTELYT3, so
# globbing them only produced spurious "not found" warnings.
_AGGREGATE_FILES = {
    'total_original_files.xlsx',
    'total_compared_files.xlsx',
    'total_only_in_compared_files.xlsx',
    'total_only_in_original_files.xlsx',
    'total_charge_sum_lines_in_original_files.xlsx',
}

# For every per-invoice diff, write a workbook to ERITTELYT_DIFF_ADDED that
# holds the diff lines followed by the lines of the same-named attachment in
# ERITTELYT3. sorted() keeps the processing order reproducible.
for f in sorted(ERITTELYT_DIFF.glob("*.xlsx")):
    if f.name in _AGGREGATE_FILES:
        continue
    print(f)

    # diffs don't have headers, so don't skip rows
    df_diff = pd.read_excel(f)

    try:
        df2 = pd.read_excel(ERITTELYT3 / f.name, skiprows=13)
    except FileNotFoundError:
        print("WARNING: file " + str(ERITTELYT3 / f.name) + " not found.")
        continue
    except Exception as exc:
        # Still skip the file, but report the real reason instead of
        # "not found"; KeyboardInterrupt/SystemExit are no longer swallowed.
        print("WARNING: file " + str(ERITTELYT3 / f.name) + " could not be read: " + repr(exc))
        continue

    # NOTE(review): the 13 leading rows skipped above are not written to the
    # output, and a charge sum line in df2 (a row without resource name) is
    # carried over unchanged - confirm that this is intended.
    df_diff_added = pd.concat([df_diff, df2])
    df_diff_added.to_excel(ERITTELYT_DIFF_ADDED / f.name, index=False)