In [1]:
import pandas as pd
import glob

def load_salescate_data(data_folder):
    """Load every ``salebycategory*.xlsx`` workbook in a folder into one DataFrame.

    Parameters
    ----------
    data_folder : str or os.PathLike
        Folder that is searched (non-recursively) for files whose names start
        with ``salebycategory`` and end with ``.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching workbooks stacked in file-name order, with a
        fresh ``0..n-1`` index.

    Raises
    ------
    FileNotFoundError
        If ``data_folder`` contains no matching file (this is also the case
        when the folder itself does not exist).
    """
    # Build the pattern with an f-string so pathlib.Path folders work as well
    # as plain strings.
    pattern = f"{data_folder}/salebycategory*.xlsx"

    # glob returns matches in arbitrary, OS-dependent order. Sorting makes the
    # row order of the combined frame reproducible, which matters for any
    # order-sensitive step downstream (e.g. groupby(...).first()).
    salescate_files = sorted(glob.glob(pattern))

    if not salescate_files:
        raise FileNotFoundError(f"No 'salebycategory*.xlsx' files found in {data_folder}")

    # Read each workbook with pandas' defaults and stack them into one frame.
    salescate_dfs = [pd.read_excel(file) for file in salescate_files]
    return pd.concat(salescate_dfs, ignore_index=True)

# Candidate locations of the export folder; they are tried in this order.
data_folder1 = r'C:\Users\PC\Dropbox\Projects\data_xls\salebycategory'
data_folder2 = r'C:\Users\trieu.pham\Dropbox\Projects\data_xls\salebycategory'
data_folder3 = r'/Users/trieupham/Dropbox/Projects/data_xls/salebycategory'

# Use the first candidate that yields data. If none does, report it and
# re-raise the error produced by the last candidate.
_last_error = None
for _candidate in (data_folder1, data_folder2, data_folder3):
    try:
        df = load_salescate_data(_candidate)
    except FileNotFoundError as _exc:
        # The name bound by "as" is cleared when the except block ends, so
        # keep our own reference for the final re-raise.
        _last_error = _exc
    else:
        break
else:
    print("Data folder not found")
    raise _last_error
print(df.head())

         barcode                               itemName  divisionCode  \
0  2010101000005                         Ly đá vừa x ly            25   
1  2501031190411                         Ly đá lớn x ly            25   
2  2501031190510                  Trà tắc ly lớn x 1 ly            25   
3  8934588873553           TEA+ Trà Oolong 450ml x chai             7   
4  8934588063053  AQUAFINA Nước tinh khiết 500ml x chai             7   

  divisionName  categoryCode        categoryName  subCategoryCode  \
0  FF Beverage          2506  FF Beverage Onsite           250601   
1  FF Beverage          2506  FF Beverage Onsite           250601   
2  FF Beverage          2506  FF Beverage Onsite           250603   
3     Beverage           703                 Tea            70301   
4     Beverage           701       Bottled Water            70101   

     subCategoryName  itemQty  grossSales  itemDiscount  netSales  \
0         Cup Of Ice      666     1029000        969000    952845   
1       

In [2]:
# Print a summary of the combined frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18368 entries, 0 to 18367
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   barcode          18368 non-null  int64  
 1   itemName         18368 non-null  object 
 2   divisionCode     18368 non-null  int64  
 3   divisionName     18368 non-null  object 
 4   categoryCode     18368 non-null  int64  
 5   categoryName     18368 non-null  object 
 6   subCategoryCode  18368 non-null  int64  
 7   subCategoryName  18368 non-null  object 
 8   itemQty          18368 non-null  int64  
 9   grossSales       18368 non-null  int64  
 10  itemDiscount     18368 non-null  int64  
 11  netSales         18368 non-null  int64  
 12  totalCostPrice   18368 non-null  float64
dtypes: float64(1), int64(8), object(4)
memory usage: 1.8+ MB


In [3]:
# Show every row whose barcode is 8936114080084, to inspect how a single
# barcode is repeated across the concatenated data.

df[df['barcode'] == 8936114080084]


Unnamed: 0,barcode,itemName,divisionCode,divisionName,categoryCode,categoryName,subCategoryCode,subCategoryName,itemQty,grossSales,itemDiscount,netSales,totalCostPrice
2691,8936114080084,DBMINT Kẹo cao su bạc hà 40 viên x1 1 Hủ,9,Candy,903,Gum,90301,Gum Sub,5,171000,0,158332,106920.0
4903,8936114080084,Doublemint Kẹo Cao Su Bạc Hà 40 Viên x Hủ,9,Candy,903,Gum,90301,Gum Sub,2,66000,0,61112,42768.0
6956,8936114080084,Doublemint Kẹo Cao Su Bạc Hà 40 Viên x Hủ,9,Candy,903,Gum,90301,Gum Sub,2,66000,0,61112,42768.0
9083,8936114080084,Doublemint Kẹo Cao Su Bạc Hà 40 Viên x Hủ,9,Candy,903,Gum,90301,Gum Sub,1,33000,0,30556,21384.0
10266,8936114080084,DBMINT Kẹo cao su bạc hà 40 viên x1 1 Hủ,9,Candy,903,Gum,90301,Gum Sub,4,132000,0,122224,85536.0
12333,8936114080084,Doublemint Kẹo Cao Su Bạc Hà 40 Viên x Hủ,9,Candy,903,Gum,90301,Gum Sub,2,66000,0,61112,42768.0
14146,8936114080084,Doublemint Kẹo Cao Su Bạc Hà 40 Viên x Hủ,9,Candy,903,Gum,90301,Gum Sub,2,66000,0,61112,42768.0
15782,8936114080084,Doublemint Kẹo Cao Su Bạc Hà 40 Viên x Hủ,9,Candy,903,Gum,90301,Gum Sub,3,99000,0,91668,64152.0


In [4]:
# Count rows per barcode and keep only the barcodes that occur more than once.
# The result is a Series of counts indexed by barcode; itemName is not part of it.
df['barcode'].value_counts().loc[lambda counts: counts > 1]


barcode
2010101000005    10
8934680040136    10
8934673575348    10
8936079121761    10
8934755014314    10
                 ..
702020178867      2
2804065456660     2
8992222051286     2
8936110050180     2
8999999572716     2
Name: count, Length: 2736, dtype: int64

In [5]:
# Identifier and label columns that define one group.
# NOTE: because the label columns are part of the key, a barcode whose labels
# differ between rows is split into several groups, so this frame can hold
# more than one row per barcode.
grouping_keys = [
    'barcode', 'itemName',
    'divisionCode', 'divisionName',
    'categoryCode', 'categoryName',
    'subCategoryCode', 'subCategoryName',
]

# Numeric measures, each summed within a group.
summed_measures = ['itemQty', 'grossSales', 'itemDiscount', 'netSales', 'totalCostPrice']

df_grouped = (
    df.groupby(grouping_keys)
      .agg(dict.fromkeys(summed_measures, 'sum'))
      .reset_index()
)

df_grouped

Unnamed: 0,barcode,itemName,divisionCode,divisionName,categoryCode,categoryName,subCategoryCode,subCategoryName,itemQty,grossSales,itemDiscount,netSales,totalCostPrice
0,21202010,HH Cam Úc 1 trái,29,Fresh Grocery,2901,Fruit Cat,290101,Fruit Sub,7,280000,0,266665,192500.0
1,23133289,GPS Thạch hồng sâm Innerset 15g x gói,7,Beverage,708,Others Beverage,70801,Condition/Supplements,51,1479000,0,1369447,904092.3
2,40677767,Jagermeister 200ml x Chai,14,Wine&Liquor,1401,Liquor,140102,Cordials/Brandy/Cognac,1,293000,0,266364,189000.0
3,40677798,Jagermeister 20ml x Chai,14,Wine&Liquor,1401,Liquor,140102,Cordials/Brandy/Cognac,37,1961000,0,1782734,1195100.0
4,45116568,Lotte Kem mochi Yukimi Daifuku 90ml x hộp,5,Ice Cream,501,Packaged Ice Cream/Novelties,50104,Other Ice Cream,256,8455000,33000,7828768,5585408.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4706,18935012413328,THÙNG 24SABECO Lager Bia 330ml x Lon,6,Beer,602,Domestic Beer,60201,Domestic Beer Can,5,1600000,0,1454545,1154180.0
4707,18935217400093,THÙNG48THTM STT Không Đường 220ml x Gói,2,Milk,201,Whole Milk,20102,UHT Milk,1,430000,0,398148,372144.0
4708,28935049502337,THÙNG 24_COCA Zero Sleek 320ml x 1 Lon,7,Beverage,702,Carbonated Drinks - CSD,70202,No/Less Sugar CSD,1,194000,66000,179630,199560.0
4709,48934588063068,THÙNG 12AQUAFINA Nước tinh khiết 1.5L x chai,7,Beverage,701,Bottled Water,70101,Purified water,1,140000,0,129630,104280.0


In [6]:
# Rank the grouped rows by net sales, highest first, and keep the first 20.
_ranked_by_net_sales = df_grouped.sort_values(by='netSales', ascending=False)
top_20_barcodes = _ranked_by_net_sales.iloc[:20]
top_20_barcodes

Unnamed: 0,barcode,itemName,divisionCode,divisionName,categoryCode,categoryName,subCategoryCode,subCategoryName,itemQty,grossSales,itemDiscount,netSales,totalCostPrice
401,2501031190510,Trà tắc ly lớn x 1 ly,25,FF Beverage,2506,FF Beverage Onsite,250603,Cold Local Drinks,2311,43550000,2670000,40324710,23478604.5
4171,8938502525368,Bánh bao nhân thịt heo trứng muối xá xíu,26,FF Onsite,2604,Steamed Bun,260402,Savoury Buns,875,25375000,0,23495491,16157901.93
294,2010805000363,Hotdog 25 Signature x cái,26,FF Onsite,2602,Other Cuisine,260201,Other Cuisine Sub,897,25100000,16000,23240808,17090693.49
343,2250103000058,TP. Bánh giò nhân thịt 150g x 1 cái,26,FF Onsite,2604,Steamed Bun,260403,Others Steamed bun,1677,23478000,0,21738951,12959460.36
529,2701010000179,GCCT Cơm nghêu mẹ nấu x hộp,27,FF Offsite,2701,Rice Offsite,270101,Bento,589,22382000,0,20723965,13646000.0
491,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Other Cuisine,260202,Korean cuisine Sub,706,21232000,0,19659415,13707519.0
4169,8938502525344,BB 2 trứng cút 150g x 1 Cái,26,FF Onsite,2604,Steamed Bun,260402,Savoury Buns,1038,20332000,0,18826041,12822035.19
261,2010103000010,Milo ly lớn x 1ly,25,FF Beverage,2506,FF Beverage Onsite,250602,Fountain-Non-carbonated,810,19480000,770000,18037006,12020858.92
2537,8934822112332,TIGER Bia lon cao 330ml x Lon,6,Beer,601,Import Beer,60101,Import Beer Can,947,19143000,156000,17402721,12321757.92
4168,8938502525337,BB 1 trứng cút 120g x 1 Cái,26,FF Onsite,2604,Steamed Bun,260402,Savoury Buns,1046,18284000,34000,16929850,9693662.0


In [7]:
# Show the raw rows for barcode 2602020019048 to check whether its descriptive
# columns (e.g. categoryName) are identical on every row.
df[df['barcode'] == 2602020019048]

Unnamed: 0,barcode,itemName,divisionCode,divisionName,categoryCode,categoryName,subCategoryCode,subCategoryName,itemQty,grossSales,itemDiscount,netSales,totalCostPrice
5,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Other Cuisine,260202,Korean cuisine Sub,205,6150000,0,5694489,3948126.0
3585,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Other Cuisine,260202,Korean cuisine Sub,191,5782000,0,5353746,3741499.0
5507,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Cuisine Cat,260202,Korean cuisine Sub,174,5561600,6400,5149687,3408486.0
7556,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Other Cuisine,260202,Korean cuisine Sub,103,3090000,0,2861134,1966167.0
11020,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Other Cuisine,260202,Korean cuisine Sub,207,6210000,0,5750046,4051727.0
16413,2602020019048,Tteokbokki Signature (TC) x tô,26,FF Onsite,2602,Cuisine Cat,260202,Korean cuisine Sub,172,5504000,0,5096352,3369308.0


In [8]:
# Total every numeric measure per barcode only, so each barcode ends up on
# exactly one row regardless of how its descriptive columns are spelled.
df_summed = (
    df.groupby('barcode')[
        ['itemQty', 'grossSales', 'itemDiscount', 'netSales', 'totalCostPrice']
    ]
    .sum()
    .reset_index()
)

df_summed

Unnamed: 0,barcode,itemQty,grossSales,itemDiscount,netSales,totalCostPrice
0,21202010,7,280000,0,266665,192500.0
1,23133289,51,1479000,0,1369447,904092.3
2,40677767,1,293000,0,266364,189000.0
3,40677798,37,1961000,0,1782734,1195100.0
4,45116568,256,8455000,33000,7828768,5585408.0
...,...,...,...,...,...,...
3199,18935012413328,5,1600000,0,1454545,1154180.0
3200,18935217400093,1,430000,0,398148,372144.0
3201,28935049502337,1,194000,66000,179630,199560.0
3202,48934588063068,1,140000,0,129630,104280.0


In [9]:
# Pick one representative itemName per barcode: the first non-null value in
# df's row order. A barcode can carry more than one spelling of its name, so
# which spelling is chosen depends on the order of the rows in df.
df_itemName = df.groupby('barcode')['itemName'].first()
# Attach that name to the per-barcode totals. This adds (or overwrites) the
# 'itemName' column on df_summed in place.
df_summed['itemName'] = df_summed['barcode'].map(df_itemName)
df_summed

Unnamed: 0,barcode,itemQty,grossSales,itemDiscount,netSales,totalCostPrice,itemName
0,21202010,7,280000,0,266665,192500.0,HH Cam Úc 1 trái
1,23133289,51,1479000,0,1369447,904092.3,GPS Thạch hồng sâm Innerset 15g x gói
2,40677767,1,293000,0,266364,189000.0,Jagermeister 200ml x Chai
3,40677798,37,1961000,0,1782734,1195100.0,Jagermeister 20ml x Chai
4,45116568,256,8455000,33000,7828768,5585408.0,Lotte Kem mochi Yukimi Daifuku 90ml x hộp
...,...,...,...,...,...,...,...
3199,18935012413328,5,1600000,0,1454545,1154180.0,THÙNG 24SABECO Lager Bia 330ml x Lon
3200,18935217400093,1,430000,0,398148,372144.0,THÙNG48THTM STT Không Đường 220ml x Gói
3201,28935049502337,1,194000,66000,179630,199560.0,THÙNG 24_COCA Zero Sleek 320ml x 1 Lon
3202,48934588063068,1,140000,0,129630,104280.0,THÙNG 12AQUAFINA Nước tinh khiết 1.5L x chai


In [10]:

# Rank the per-barcode totals by net sales, highest first, and keep the first 20.
_ranked_by_net_sales = df_summed.sort_values(by='netSales', ascending=False)
top_20_barcodes = _ranked_by_net_sales.iloc[:20]
top_20_barcodes

Unnamed: 0,barcode,itemQty,grossSales,itemDiscount,netSales,totalCostPrice,itemName
267,2501031190510,2415,45630000,2670000,42250680,24536377.5,Trà tắc ly lớn x 1 ly
211,2010805000363,1257,35180000,16000,32574168,23989877.6,Hotdog 25 Signature x cái
322,2602020019048,1052,32297600,6400,29905454,20485313.0,Tteokbokki Signature (TC) x tô
196,2010103000010,1178,28680000,770000,26555470,17381323.16,Milo ly lớn x 1ly
2851,8938502525368,875,25375000,0,23495491,16157901.93,Bánh bao nhân thịt heo trứng muối xá xíu
78,260202000017,805,24150000,0,22361288,14575237.36,Tteokbokki Signature x 1 phần
234,2250103000058,1677,23478000,0,21738951,12959460.36,TP. Bánh giò nhân thịt 150g x 1 cái
352,2701010000179,614,23332000,0,21603590,14221000.0,GCCT Cơm nghêu mẹ nấu x hộp
2849,8938502525344,1038,20332000,0,18826041,12822035.19,BB 2 trứng cút 150g x 1 Cái
261,2501031190374,1676,19255000,857000,17828503,9138164.0,Nestea ly lớn (New) x 1 ly
