In [None]:
import pandas as pd
from pathlib import Path

# invoice attachments that were already sent
ERITTELYT1 = Path('/work/data/Laskutuksen tilastointia/laskutus/laskutus 201125/erittelyt')
# invoice attachments we want to compare the sent ones with
ERITTELYT2 = Path('/work/data/OpenIRIS/LMU-20210112/erittelyt')
# store difference here
ERITTELYT_DIFF = Path('/work/data/OpenIRIS/LMU-20210112/erittelyt_diff')

# Both input directories must already exist. Fail early, and name the constant
# that actually needs fixing so the message points at the right path.
if not ERITTELYT1.is_dir():
    raise ValueError('Please check ERITTELYT1.')

if not ERITTELYT2.is_dir():
    raise ValueError('Please check ERITTELYT2.')

# The output directory is created if missing; an existing one is reused as-is.
ERITTELYT_DIFF.mkdir(exist_ok=True)


In [None]:
def create_diff(df1, df2):
    """Split the rows of two dataframes by which side they appear on.

    The frames are outer-merged on 'Resource/Product' and 'Booking start'.
    Overlapping non-key columns coming from df1 get a trailing '_'; the ones
    coming from df2 keep their original names.

    Returns a tuple (left_only, right_only):
      * left_only  - merged rows whose key exists only in df1, with all merged
                     columns (including the '_merge' indicator). The caller
                     treats these as a problem because df2 is expected to
                     contain every line of df1.
      * right_only - merged rows whose key exists only in df2, restricted to
                     the columns of df2 whose name does not end in '_'.
    """
    key_columns = ['Resource/Product', 'Booking start']
    merged = df1.merge(
        df2,
        on=key_columns,
        how='outer',
        suffixes=['_', ''],
        indicator=True,
    )
    origin = merged['_merge']

    # rows present in df1 but absent from df2
    only_in_df1 = merged.loc[origin == 'left_only']

    # rows present in df2 but absent from df1; keep df2's own columns only
    wanted = [name for name in df2.columns if name[-1] != '_']
    only_in_df2 = merged.loc[origin == 'right_only'][wanted]

    return only_in_df1, only_in_df2
    


In [None]:
# Compare every already-sent attachment in ERITTELYT1 with the file of the same
# name in ERITTELYT2 and collect the rows that exist only in the latter.
diffs = []
for f in ERITTELYT1.glob("*.xlsx"):
    counterpart = ERITTELYT2 / f.name
    if not counterpart.is_file():
        # Without a counterpart there is nothing to compare against. Skip the
        # file instead of falling through with an undefined df2 (or the df2
        # left over from the previous iteration).
        print("WARNING: file " + str(counterpart) + " not found.")
        continue

    # both files are read with the same number of leading rows skipped
    df1 = pd.read_excel(f, skiprows=13)
    df2 = pd.read_excel(counterpart, skiprows=13)

    (left_only, right_only) = create_diff(df1, df2)
    if left_only.shape[0] > 0:
        print('WARNING: file ' + str(counterpart) + ' (df2) is missing ' + str(left_only.shape[0]) + ' lines.')
        print(left_only[['Booking start', 'Resource/Product', 'User name_']])

    # save diff as .xlsx
    if right_only.shape[0] > 0:
        right_only.to_excel(ERITTELYT_DIFF / f.name, index=False)

    diffs.append(right_only)

if diffs:
    total_diff = pd.concat(diffs)
    print(total_diff.shape)
    print(total_diff.Charge.sum())
else:
    # pd.concat raises ValueError on an empty list; report the situation instead.
    total_diff = pd.DataFrame()
    print("WARNING: no files were compared; total_diff is empty.")


In [None]:
# Peek at the rows found only in df2. `right_only` is bound inside the
# comparison loop, so this shows the result of the last create_diff call it made.
right_only.head()

In [None]:
# Peek at the rows found only in df1. `left_only` is bound inside the
# comparison loop, so this shows the result of the last create_diff call it made.
left_only.head()