In [2]:
#import libraries
import pandas as pd
import os
import configparser

# Read the value from the config.ini file
config = configparser.ConfigParser()
config.read('config.ini')
output_file_dir = config.get('path', 'output_file_dir')
output_file_name_net_sales = config.get('path', 'output_file_name_net_sales')
cateFileDir = config.get('path', 'cateFileDir')
# Read the category file
try:
    df_cate = pd.read_excel(cateFileDir)
except FileNotFoundError as e:
    # Hard code the category file path
    df_cate = pd.read_excel(r'...')# Hard code the category file path

# Read the net sales file
net_sales_data = os.path.join(output_file_dir, output_file_name_net_sales) + '.csv'

# Read the merged file csv
# input_file should be the path of the file
try:
    input_file = net_sales_data
    df = pd.read_csv(input_file)
except FileNotFoundError as e:
    # Hard code the path of the file
    input_file = r"..."
    df = pd.read_csv(input_file)
df.dropna(inplace=True)


In [3]:
print(df.head(5))
print(df_cate.head(5))


         Date      Time           InvoiceID        Barcode  Total Include VAT
0  2023-01-01  00:40:59  VN0001010101230001  8935049500544             7000.0
1  2023-01-01  00:41:18  VN0001010101230002  8938512632025            12000.0
2  2023-01-01  00:41:55  VN0001010101230003  8936011773416            25000.0
3  2023-01-01  00:41:55  VN0001010101230003  8936079121761            13000.0
4  2023-01-01  00:41:55  VN0001010101230003  8850453017528            13000.0
         Barcode Division Name   Category Name          SubCategory Name   
0  9956991810612   LivingGoods    Living goods  Other Non-edible Grocery  \
1       96183519         Candy  Candy Category                Hard Candy   
2       96118528         Candy  Candy Category                Hard Candy   
3       96118511         Candy  Candy Category                Hard Candy   
4       96118504         Candy  Candy Category                Hard Candy   

                                           Item Name  
0                   

In [4]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Date,Time,InvoiceID,Barcode,Total Include VAT
0,2023-01-01,00:40:59,VN0001010101230001,8935049500544,7000.0
1,2023-01-01,00:41:18,VN0001010101230002,8938512632025,12000.0
2,2023-01-01,00:41:55,VN0001010101230003,8936011773416,25000.0
3,2023-01-01,00:41:55,VN0001010101230003,8936079121761,13000.0
4,2023-01-01,00:41:55,VN0001010101230003,8850453017528,13000.0
...,...,...,...,...,...
68978410,2023-07-05,18:58:59,VN0061010507230308,8935250402019,48000.0
68978411,2023-07-05,13:02:59,VN0061010507230185,8934803071085,13000.0
68978412,2023-07-05,09:15:59,VN0061010507230092,8935090910873,36000.0
68978413,2023-07-05,09:15:59,VN0061010507230092,8935005800015,12000.0


In [5]:
# Convert the data type of the column 'Barcode' to string
df_cate['Barcode'] = df_cate['Barcode'].astype(str)
df['Barcode'] = df['Barcode'].astype(str)

# Merge the two dataframes
df_merged = pd.merge(df, df_cate, on='Barcode', how='left')

In [6]:
df_merged

Unnamed: 0,Date,Time,InvoiceID,Barcode,Total Include VAT,Division Name,Category Name,SubCategory Name,Item Name
0,2023-01-01,00:40:59,VN0001010101230001,8935049500544,7000.0,Beverage,Bottled Water,Mineral water,DASANI Nước khoáng 510ml x1 Chai
1,2023-01-01,00:41:18,VN0001010101230002,8938512632025,12000.0,Beverage,Bottled Water,Purified water,SATORI Nước tinh khiết 500ml x 1 Chai
2,2023-01-01,00:41:55,VN0001010101230003,8936011773416,25000.0,Ice Cream,Packaged Ice Cream/Novelties,Other Ice Cream,#Celano Kem Bánh Cá Trân Châu Dừa Tắc 70g * 1 cây
3,2023-01-01,00:41:55,VN0001010101230003,8936079121761,13000.0,Snacks,Potato Chips,Potato Snacks,Lay's Wavy Vị Thăn Bò Nướng Texas 58gr x 1 gói
4,2023-01-01,00:41:55,VN0001010101230003,8850453017528,13000.0,Ice Cream,Packaged Ice Cream/Novelties,Stick,MILO Kem que Magma 55g x1 Cây
...,...,...,...,...,...,...,...,...,...
35898463,2023-07-05,18:58:59,VN0061010507230308,8935250402019,48000.0,LivingGoods,Paper/Plastic/Foil Products,Tissue,EMOS Khan An 100pcs x 1 Gói
35898464,2023-07-05,13:02:59,VN0061010507230185,8934803071085,13000.0,Snacks,Non-potato Chips,Salty Snacks,OISHI Snack Tôm cay nồng 70g x gói
35898465,2023-07-05,09:15:59,VN0061010507230092,8935090910873,36000.0,Snacks,Snacks for Drinks,Nuts/Seeds,DanD Hạnh Nhân Không Muối 50g x 1 gói
35898466,2023-07-05,09:15:59,VN0061010507230092,8935005800015,12000.0,Beverage,Bottled Water,Mineral water,LAVIE Nuoc Khoang 750ml x 1 Chai


In [8]:
df_filter = df_merged[['InvoiceID', 'Barcode', 'Total Include VAT', 'Division Name']]

df_filter

Unnamed: 0,InvoiceID,Barcode,Total Include VAT,Division Name
0,VN0001010101230001,8935049500544,7000.0,Beverage
1,VN0001010101230002,8938512632025,12000.0,Beverage
2,VN0001010101230003,8936011773416,25000.0,Ice Cream
3,VN0001010101230003,8936079121761,13000.0,Snacks
4,VN0001010101230003,8850453017528,13000.0,Ice Cream
...,...,...,...,...
35898463,VN0061010507230308,8935250402019,48000.0,LivingGoods
35898464,VN0061010507230185,8934803071085,13000.0,Snacks
35898465,VN0061010507230092,8935090910873,36000.0,Snacks
35898466,VN0061010507230092,8935005800015,12000.0,Beverage


In [21]:
group_by_invoice = df_filter.groupby('InvoiceID').agg({'Barcode': lambda x: ', '.join(x.astype(str)), 
                                      'Total Include VAT': 'sum', 
                                      'Division Name': lambda x: ', '.join(x.astype(str))}).reset_index()
group_by_invoice

KeyboardInterrupt: 

In [None]:
# group_by_invoice.to_csv('group_by_invoice.csv', index=False, encoding='utf-8-sig')

In [20]:
df_filter = df_merged[['Date', 'Time', 'InvoiceID']]



MemoryError: Unable to allocate 822. MiB for an array with shape (3, 35898468) and data type object

In [None]:


# Merge the two dataframes on 'InvoiceID' and keep only 'Date' and 'Time' from df_merged
test_df = pd.merge(df_filter, group_by_invoice, on='InvoiceID', how='left')

print(test_df)
