In [13]:
import pandas as pd
import glob

def load_salesstore_data(data_folder):
    """Load every ``salebystore*.xlsx`` workbook in a folder into one DataFrame.

    Args:
        data_folder: Directory to search (str or path-like). The folder name
            is taken literally: glob metacharacters it may contain
            (``[``, ``]``, ``*``, ``?``) are escaped and do not act as
            wildcards.

    Returns:
        pandas.DataFrame: The rows of all matching workbooks, concatenated in
        sorted file-path order with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no file in ``data_folder`` matches the pattern.
    """
    import os  # local import so the function does not depend on a module-level `import os`

    # Only the file-name part may act as a pattern, so the folder is escaped.
    # The matches are sorted because glob returns them in arbitrary order,
    # which would make the row order of the result vary between runs.
    pattern = os.path.join(glob.escape(os.fspath(data_folder)), "salebystore*.xlsx")
    salesstore_files = sorted(glob.glob(pattern))

    # Fail loudly instead of letting pd.concat complain about an empty list
    if not salesstore_files:
        raise FileNotFoundError(f"No files found in {data_folder} that match the pattern '/salebystore*.xlsx'")

    # Read each workbook, then stack them into one frame with a continuous index
    salesstore_dfs = [pd.read_excel(file) for file in salesstore_files]
    return pd.concat(salesstore_dfs, ignore_index=True)

# Candidate locations of the data folder, tried in the order 1, 2, 3
data_folder1 = r'C:\Users\PC\Dropbox\Projects\data_xls'
data_folder2 = r'C:\Users\trieu.pham\Dropbox\Projects\data_xls'
data_folder3 = r'/Users/trieupham/Dropbox/Projects/data_xls'

# A folder without matching files raises FileNotFoundError, in which case the
# next candidate is tried. The third folder is the last resort: if it also has
# no matching files, its FileNotFoundError propagates.
for candidate_folder in (data_folder1, data_folder2):
    try:
        df = load_salesstore_data(candidate_folder)
    except FileNotFoundError:
        continue
    break
else:
    df = load_salesstore_data(data_folder3)
print(df.head())

  storeCode                     storeName  regionCode regionName  billCount  \
0    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        245   
1    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        233   
2    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        215   
3    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        235   
4    VN0265  GS25 Hoang Anh Thanh Binh_FC         107     HCM_Q7        243   

   customerCount  qty  grossSales  itemDiscount  billDiscount  netSales  \
0             78  629    12164000        551000             0  11270550   
1             92  527     9453000        341000             0   8748688   
2            108  565     9660000        289000             0   8939741   
3            110  534    10229000        320000             0   9459571   
4            110  662    12128000        282000             0  11231082   

   vatAmount  totalCostPrice  cancelBillCount  exceptItemActionCount   dat

In [14]:
# Print a structural summary of the loaded frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   storeCode              157 non-null    object 
 1   storeName              157 non-null    object 
 2   regionCode             157 non-null    int64  
 3   regionName             157 non-null    object 
 4   billCount              157 non-null    int64  
 5   customerCount          157 non-null    int64  
 6   qty                    157 non-null    int64  
 7   grossSales             157 non-null    int64  
 8   itemDiscount           157 non-null    int64  
 9   billDiscount           157 non-null    int64  
 10  netSales               157 non-null    int64  
 11  vatAmount              157 non-null    int64  
 12  totalCostPrice         157 non-null    float64
 13  cancelBillCount        157 non-null    int64  
 14  exceptItemActionCount  157 non-null    int64  
 15  dateKe

In [15]:
# Drop rows that are exact duplicates of an earlier row (all columns are
# compared, the first occurrence is kept), then print the summary again so the
# entry count can be compared with the one before de-duplication.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   storeCode              157 non-null    object 
 1   storeName              157 non-null    object 
 2   regionCode             157 non-null    int64  
 3   regionName             157 non-null    object 
 4   billCount              157 non-null    int64  
 5   customerCount          157 non-null    int64  
 6   qty                    157 non-null    int64  
 7   grossSales             157 non-null    int64  
 8   itemDiscount           157 non-null    int64  
 9   billDiscount           157 non-null    int64  
 10  netSales               157 non-null    int64  
 11  vatAmount              157 non-null    int64  
 12  totalCostPrice         157 non-null    float64
 13  cancelBillCount        157 non-null    int64  
 14  exceptItemActionCount  157 non-null    int64  
 15  dateKe

In [16]:
import pandas as pd

# Parse the YYYYMMDD keys into real timestamps
df['dateKey'] = pd.to_datetime(df['dateKey'], format='%Y%m%d')

# Label every row with the English name of its weekday
df['weekday'] = df['dateKey'].dt.day_name()

# Mean net sales per weekday, best weekday first, rendered as strings with
# thousands separators and two decimals (e.g. "14,968,021.91")
average_sales = (
    df.groupby('weekday')['netSales']
    .mean()
    .sort_values(ascending=False)
    .map('{:,.2f}'.format)
)

print(average_sales)

weekday
Saturday     14,968,021.91
Sunday       13,806,161.05
Friday       12,299,519.95
Wednesday    12,005,959.43
Tuesday      11,996,631.26
Thursday     11,636,482.48
Monday       11,399,731.82
Name: netSales, dtype: object


In [17]:
# Tag every row with the calendar month (as a monthly Period) of its date
df['year_month'] = df['dateKey'].dt.to_period('M')

# One groupby object serves both per-month aggregates
per_month = df.groupby('year_month')

# Total net sales recorded in each month
monthly_sales = per_month['netSales'].sum()

# Number of distinct dates that actually have data in each month
days_in_month = per_month['dateKey'].nunique()

# Net sales per observed day, best month first, rendered as strings with
# thousands separators and two decimals
average_daily_sales = (
    (monthly_sales / days_in_month)
    .sort_values(ascending=False)
    .map('{:,.2f}'.format)
)

print(average_daily_sales)

year_month
2024-06    15,251,325.33
2024-05    13,664,188.94
2024-04    13,582,927.00
2024-03    12,429,899.94
2024-01    11,756,127.50
2024-02    10,810,962.24
Freq: M, dtype: object


In [18]:
# Preview the first rows; at this point the frame also carries the derived
# 'weekday' and 'year_month' columns.
df.head()

Unnamed: 0,storeCode,storeName,regionCode,regionName,billCount,customerCount,qty,grossSales,itemDiscount,billDiscount,netSales,vatAmount,totalCostPrice,cancelBillCount,exceptItemActionCount,dateKey,weekday,year_month
0,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,245,78,629,12164000,551000,0,11270550,893450,7807715.0,6,12,2024-02-29,Thursday,2024-02
1,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,233,92,527,9453000,341000,0,8748688,704312,5954266.0,1,18,2024-02-28,Wednesday,2024-02
2,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,215,108,565,9660000,289000,0,8939741,720259,6103071.0,1,13,2024-02-27,Tuesday,2024-02
3,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,235,110,534,10229000,320000,0,9459571,769429,6473921.0,3,9,2024-02-26,Monday,2024-02
4,VN0265,GS25 Hoang Anh Thanh Binh_FC,107,HCM_Q7,243,110,662,12128000,282000,0,11231082,896918,7624871.0,1,20,2024-02-25,Sunday,2024-02


In [24]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# Encode each month as one integer (year * 12 + month, month in 1..12) so it
# can be used as the regression's x-axis; consecutive months differ by 1.
month_start = df['year_month'].dt.to_timestamp()
df['year_month_num'] = month_start.dt.year * 12 + month_start.dt.month

# Per encoded month: total net sales and the number of distinct dates with data
grouped_df = df.groupby('year_month_num').agg({'netSales': 'sum', 'dateKey': 'nunique'})

# Average net sales per observed day in each month
average_daily_sales = grouped_df['netSales'] / grouped_df['dateKey']

# Fit average daily sales as a linear function of the encoded month
model = LinearRegression()
model.fit(grouped_df.index.values.reshape(-1, 1), average_daily_sales)

# Average daily sales level whose month of arrival is being estimated
target = 15000000
slope = model.coef_[0]

if slope <= 0:
    # A flat or falling trend line never climbs to the target; solving for x
    # would divide by zero or point to a month in the past.
    print(f'Average daily sales are not trending upward (slope {slope:,.2f} per month); '
          f'no month can be projected for reaching {target:,.2f}')
else:
    # Solve intercept + slope * x = target for x and truncate to a whole month.
    # NOTE(review): truncation reports the month in which the fitted line
    # crosses the target; use math.ceil instead if the first month that is
    # entirely at or above the target is wanted.
    predicted_month_num = int((target - model.intercept_) / slope)

    # Invert year * 12 + month. The month is 1-based, so shift by one before
    # the divmod; otherwise December decodes as month 0 of the following year.
    predicted_year, month_offset = divmod(predicted_month_num - 1, 12)
    predicted_month = month_offset + 1

    # Report the prediction with the target formatted to two decimal places
    print(f'The average daily sales will reach {target:,.2f} in {predicted_month}/{predicted_year}')

The average daily sales will reach 15,000,000.00 in 6/2024
