In [1]:
import pandas as pd
import glob

def load_salesstore_data(data_folder):
    # Get the path to all files containing 'salesstore' and ending with '.xlsx'
    salesstore_files = glob.glob(data_folder + "/salebystore*.xlsx")
    
    # If no files are found, raise a FileNotFoundError
    if not salesstore_files:
        raise FileNotFoundError(f"No 'salebycategory*.xlsx' files found in {data_folder}")

    # Load all the salesstore data into a list of pandas dataframes
    salesstore_dfs = [pd.read_excel(file) for file in salesstore_files]
    
    # Concatenate all the dataframes into one
    salesstore_df = pd.concat(salesstore_dfs, ignore_index=True)
    
    return salesstore_df

# Define the data folder
data_folder1 = r'C:\Users\PC\Dropbox\Projects\data_xls'
data_folder2 = r'C:\Users\trieu.pham\Dropbox\Projects\data_xls'
data_folder3 = r'/Users/trieupham/Dropbox/Projects/data_xls/salebystore'

try:
    df = load_salesstore_data(data_folder1)
except FileNotFoundError:
    try:
        df = load_salesstore_data(data_folder2)
    except FileNotFoundError:
        try:
            df = load_salesstore_data(data_folder3)
        except FileNotFoundError:
            print("Data folder not found")
            raise
print(df.head())

  storeCode                     storeName  regionCode regionName  billCount  \
0    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        246   
1    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        201   
2    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        220   
3    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        238   
4    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        240   

   customerCount  qty  grossSales  itemDiscount  billDiscount  netSales  \
0             86  601    11640000        221000             0  10773202   
1             62  525    10641000        274000             0   9851424   
2            109  596    11997000        315000             0  11100568   
3            123  711    14858000        276000             0  13751452   
4            121  762    15867000        384000             0  14699419   

   vatAmount  totalCostPrice  cancelBillCount  exceptItemActionCount   dat

In [2]:
#make sure there are not any duplicated rows
print(df.duplicated().sum())

#drop duplicated rows
df = df.drop_duplicates()

0


In [3]:
import pandas as pd

# Convert the 'dateKey' column to datetime format
df['dateKey'] = pd.to_datetime(df['dateKey'], format='%Y%m%d')

# Extract the weekday from the date
df['weekday'] = df['dateKey'].dt.day_name()

# Group by the weekday and calculate the mean of 'netSales'
average_sales = df.groupby('weekday')['netSales'].mean()

# Sort the weekdays descendingly by the average net sales
average_sales = average_sales.sort_values(ascending=False)

# Change sale format to currency, with two decimal places
average_sales = average_sales.map('{:,.2f}'.format)

print(average_sales)

weekday
Saturday     15,208,778.12
Sunday       14,312,689.73
Friday       12,855,466.38
Tuesday      12,336,598.62
Wednesday    12,269,630.31
Thursday     11,903,509.62
Monday       11,831,114.96
Name: netSales, dtype: object


In [4]:
# Extract the year and month from the 'dateKey' column
df['year_month'] = df['dateKey'].dt.to_period('M')

# Group by the 'year_month' column, and calculate the sum of 'netSales'
monthly_sales = df.groupby('year_month')['netSales'].sum()

# Count the number of unique days in each month
days_in_month = df.groupby('year_month')['dateKey'].nunique()

# Calculate the average daily sales per month
average_daily_sales = monthly_sales / days_in_month

# Sort the average daily sales in descending order
average_daily_sales = average_daily_sales.sort_values(ascending=False)

# Change the sales format to currency, with two decimal places
average_daily_sales = average_daily_sales.map('{:,.2f}'.format)

print(average_daily_sales)

year_month
2024-06    15,474,169.03
2024-05    13,664,188.94
2024-04    13,582,927.00
2024-03    12,429,899.94
2024-01    11,756,127.50
2024-02    10,810,962.24
Freq: M, dtype: object


In [5]:
df.head()

Unnamed: 0,storeCode,storeName,regionCode,regionName,billCount,customerCount,qty,grossSales,itemDiscount,billDiscount,netSales,vatAmount,totalCostPrice,cancelBillCount,exceptItemActionCount,dateKey,weekday,year_month
0,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,246,86,601,11640000,221000,0,10773202,866798,7342082.0,3,11,2024-01-31,Wednesday,2024-01
1,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,201,62,525,10641000,274000,0,9851424,789576,6760602.0,10,14,2024-01-30,Tuesday,2024-01
2,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,220,109,596,11997000,315000,0,11100568,896432,7534077.0,2,19,2024-01-29,Monday,2024-01
3,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,238,123,711,14858000,276000,0,13751452,1106548,9256518.0,2,18,2024-01-28,Sunday,2024-01
4,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,240,121,762,15867000,384000,0,14699419,1167581,9793124.0,1,17,2024-01-27,Saturday,2024-01


In [6]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# Convert 'year_month' to a numerical format
df['year_month_num'] = df['year_month'].dt.to_timestamp().dt.year * 12 + df['year_month'].dt.to_timestamp().dt.month

# Group by the 'year_month_num' column, and calculate the sum of 'netSales' and count of 'dateKey'
grouped_df = df.groupby('year_month_num').agg({'netSales': 'sum', 'dateKey': 'nunique'})

# Calculate the average daily sales per month
average_daily_sales = grouped_df['netSales'] / grouped_df['dateKey']

# Fit a linear regression model
model = LinearRegression()
model.fit(grouped_df.index.values.reshape(-1, 1), average_daily_sales)

# Predict the month and year when the average daily sales will reach the target
target = 16000000
predicted_month_num = int((target - model.intercept_) / model.coef_[0])
predicted_year, predicted_month = divmod(predicted_month_num, 12)

# Print the prediction, target format to currency, with two decimal places
target = '{:,.2f}'.format(target)
print(f'The average daily sales will reach {target} in {predicted_month}/{predicted_year}')

The average daily sales will reach 16,000,000.00 in 7/2024
