In [30]:
import pandas as pd

import os
from pathlib import Path

def read_ebay_csv_files(directory_path):
    """Load every CSV file found directly inside a directory.

    Files are read in sorted file-name order so the key order of the
    returned dictionary (and therefore the order of anything that iterates
    over it) is reproducible across runs and platforms. Only regular files
    whose name ends in ``.csv`` are read; sub-directories are skipped even
    if their name happens to end in ``.csv``.

    The dictionary key is derived from the file name:

    * if the name contains ``"SU"``, only the part after the last space is
      kept;
    * everything from the first ``"."`` onwards is then dropped.

    If two files map to the same key, the one that sorts later overwrites
    the earlier one.

    Parameters
    ----------
    directory_path : str or os.PathLike
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Mapping of derived key -> ``pandas.DataFrame`` read with
        ``pd.read_csv`` defaults.

    Raises
    ------
    FileNotFoundError
        If ``directory_path`` does not exist.
    """
    ebay_uploads_dir = Path(directory_path)

    # Sort by file name: directory listing order is otherwise arbitrary.
    csv_paths = sorted(
        (
            entry
            for entry in ebay_uploads_dir.iterdir()
            if entry.is_file() and entry.name.endswith('.csv')
        ),
        key=lambda entry: entry.name,
    )

    dfs = {}
    for csv_path in csv_paths:
        key_source = csv_path.name
        if "SU" in key_source:
            # Keep only the last space-separated token of the file name.
            key_source = key_source.split(" ")[-1]
        dfs[key_source.split(".")[0]] = pd.read_csv(csv_path)

    return dfs

# Example usage: load the "old" and "new" upload directories. Each result is
# the dictionary returned by read_ebay_csv_files (file-derived key -> DataFrame).
dfs_old = read_ebay_csv_files('data/ebay_uploads_old')
dfs_new = read_ebay_csv_files('data/ebay_uploads_new')


In [31]:
# Compare every old upload against the new upload stored under the same key
# and print a short report per key.
for key, df_old in dfs_old.items():
    print(f"\nComparing {key}:")
    df_new = dfs_new.get(key)

    if df_new is None:
        print(f"No corresponding new dataframe found for {key}")
    elif df_old.equals(df_new):
        print("Dataframes are equivalent")
    else:
        same_shape = df_old.shape == df_new.shape
        same_columns = list(df_old.columns) == list(df_new.columns)

        # Compare shapes
        if not same_shape:
            print(f"Shapes differ: Old {df_old.shape}, New {df_new.shape}")

        # Compare columns
        if not same_columns:
            print("Columns differ:")
            print(f"Old columns: {list(df_old.columns)}")
            print(f"New columns: {list(df_new.columns)}")

        # Compare data (element-wise comparison only makes sense when both
        # frames have the same shape and column labels)
        if same_shape and same_columns:
            # NaN != NaN evaluates to True, so cells that are missing in BOTH
            # frames must be excluded or they are reported as differences.
            both_missing = df_old.isna() & df_new.isna()
            diff_mask = ((df_old != df_new) & ~both_missing).any(axis=1)
            if diff_mask.any():
                print(f"Data differs in {diff_mask.sum()} rows")
                print("First few differing rows:")
                print(pd.concat([df_old[diff_mask], df_new[diff_mask]]).head(10))
            else:
                print("Data is equivalent but dataframes are not considered equal (possibly due to metadata)")

    print("\n" + "="*50)

# The loop above only walks the old keys; report keys that exist only in the
# new uploads so they are not silently ignored.
new_only_keys = [new_key for new_key in dfs_new if new_key not in dfs_old]
if new_only_keys:
    print(f"\nKeys present only in the new uploads: {new_only_keys}")


Comparing PV:
Shapes differ: Old (22948, 5), New (15523, 5)


Comparing CPO:
Shapes differ: Old (28447, 5), New (19235, 5)


Comparing PG:
Shapes differ: Old (74, 5), New (2, 5)


Comparing AST:
Shapes differ: Old (17301, 5), New (11701, 5)


Comparing AMS:
Shapes differ: Old (999, 5), New (40, 5)


Comparing OR:
Shapes differ: Old (74, 5), New (2, 5)


Comparing SJR:
Shapes differ: Old (23087, 5), New (15648, 5)


Comparing RTG:
Shapes differ: Old (32359, 5), New (20279, 5)


Comparing DPW:
Shapes differ: Old (10393, 5), New (10925, 5)


Comparing MFD:
Shapes differ: Old (22982, 5), New (15549, 5)



In [32]:
# Report ItemIDs that appear in dfs_old['RTG'] but are absent from dfs_new['RTG']
if 'RTG' not in dfs_old or 'RTG' not in dfs_new:
    print("'RTG' key not found in one or both dataframes")
else:
    old_item_ids = set(dfs_old['RTG']['ItemID'])
    new_item_ids = set(dfs_new['RTG']['ItemID'])

    # Set difference: IDs seen in the old upload that the new upload lacks
    missing_item_ids = old_item_ids - new_item_ids

    if not missing_item_ids:
        print("All ItemIDs from dfs_old['RTG'] are present in dfs_new['RTG']")
    else:
        print(f"ItemIDs in dfs_old['RTG'] that are not in dfs_new['RTG']:")
        for item_id in missing_item_ids:
            print(item_id)
        print(f"\nTotal missing ItemIDs: {len(missing_item_ids)}")

ItemIDs in dfs_old['RTG'] that are not in dfs_new['RTG']:
175815294981
155320745995
165791072267
155320746006
155271921691
155354791969
165882003493
155322187814
175569141798
175570419752
155320746024
155320746026
155355021350
175569141811
155353251892
155271921718
175570419779
155354792013
155353251923
155320746072
175569141851
155320746077
175569141854
155320746079
155353055325
166645432415
165711708260
175569141861
155354792054
155270250615
155322187899
155320746108
166645432444
155353055358
165083021447
156115402895
175570419860
175570419864
155354726553
155355021466
155353252007
155353252012
165882003632
165882003633
155353055408
155353055409
165881774260
155322187958
175570419895
175570419898
155320746174
155353055424
175570419909
155320746188
153030885585
165882003668
165881774294
165792874712
165880103128
175569141978
175569141979
165882003676
175471427806
165880103136
165882003681
155353252077
155320746222
155354726640
165882003699
165880103155
155354792182
165880103158
155353

In [33]:
# Report ItemIDs that appear in dfs_new['RTG'] but were not in dfs_old['RTG']
if 'RTG' not in dfs_old or 'RTG' not in dfs_new:
    print("'RTG' key not found in one or both dataframes")
else:
    old_item_ids = set(dfs_old['RTG']['ItemID'])
    new_item_ids = set(dfs_new['RTG']['ItemID'])

    # Set difference: IDs introduced by the new upload
    new_item_ids_not_in_old = new_item_ids - old_item_ids

    if not new_item_ids_not_in_old:
        print("All ItemIDs from dfs_new['RTG'] are present in dfs_old['RTG']")
    else:
        print(f"ItemIDs in dfs_new['RTG'] that are not in dfs_old['RTG']:")
        for item_id in new_item_ids_not_in_old:
            print(item_id)
        print(f"\nTotal new ItemIDs: {len(new_item_ids_not_in_old)}")

ItemIDs in dfs_new['RTG'] that are not in dfs_old['RTG']:
156395860360
155535414825
165059544937
175720352239
154601985008
155547604372
175714186518
175720322687

Total new ItemIDs: 8


In [34]:
# Display the DataFrame stored under the "RTG" key of the old uploads.
dfs_old["RTG"]

Unnamed: 0,Action,ItemID,SiteID,Currency,Quantity
0,Revise,172786451944,UK,GBP,1
1,Revise,162597344956,UK,GBP,4
2,Revise,162597455841,UK,GBP,8
3,Revise,172786579222,UK,GBP,2
4,Revise,152630312138,UK,GBP,0
...,...,...,...,...,...
32354,Revise,175812517226,UK,GBP,10
32355,Revise,166222055433,UK,GBP,7
32356,Revise,166222055434,UK,GBP,10
32357,Revise,166269870683,UK,GBP,10


In [35]:
# Display the DataFrame stored under the "RTG" key of the new uploads.
dfs_new["RTG"]

Unnamed: 0,Action,ItemID,SiteID,Currency,Quantity
0,Revise,175223542598,UK,GBP,0
1,Revise,165416958979,UK,GBP,0
2,Revise,175227323562,UK,GBP,0
3,Revise,154752181806,UK,GBP,0
4,Revise,165737537348,UK,GBP,0
...,...,...,...,...,...
20274,Revise,154932323750,UK,GBP,0
20275,Revise,175227337615,UK,GBP,0
20276,Revise,175227337624,UK,GBP,0
20277,Revise,175227337625,UK,GBP,0


In [36]:
# Load the eBay table from CSV using pandas' default type inference.
# NOTE(review): the lookup output further down renders item_id as floats with
# trailing zeros (e.g. 255787900000.0) — confirm whether ID precision was lost
# in this file and whether item_id should be read as an integer/string instead.
ebay_df = pd.read_csv("data/tables/ebay.csv")

In [53]:
# Write the de-duplicated custom labels of all rows whose quantity_delta is
# not equal to 0 to test.csv (no index column).
(
    ebay_df.loc[ebay_df['quantity_delta'].ne(0), "custom_label"]
    .drop_duplicates()
    .to_csv("test.csv", index=False)
)


In [43]:
# Show every ebay_df row whose custom_label matches this one label exactly
ebay_df.loc[ebay_df['custom_label'].eq('CAL138L/R+CBP01610')]

Unnamed: 0,item_id,quantity_delta,quantity,custom_label,store
54059,255787900000.0,-20.0,0.0,CAL138L/R+CBP01610,MFD
54060,265952900000.0,-20.0,0.0,CAL138L/R+CBP01610,MFD
54061,255792900000.0,-20.0,0.0,CAL138L/R+CBP01610,MFD
54062,175458700000.0,-20.0,0.0,CAL138L/R+CBP01610,DPW
54063,314201800000.0,-20.0,0.0,CAL138L/R+CBP01610,DPW
54064,175458700000.0,-20.0,0.0,CAL138L/R+CBP01610,PV
54065,185628000000.0,-20.0,0.0,CAL138L/R+CBP01610,PV
54066,204128800000.0,-20.0,0.0,CAL138L/R+CBP01610,PV
54067,115574700000.0,-20.0,0.0,CAL138L/R+CBP01610,CPO
54068,144780700000.0,-20.0,0.0,CAL138L/R+CBP01610,CPO
