In [208]:
import pandas as pd
import hvplot.pandas
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [209]:
# Create an ExcelFile object
excel_file = pd.ExcelFile('US_Regional_Sales_Data.xls')

In [210]:
# Get the list of sheet names
sheet_names = excel_file.sheet_names

# Print the sheet names
print(sheet_names)

['Sales Orders Sheet', 'Customers Sheet', 'Store Locations Sheet', 'Products Sheet', 'Regions Sheet', 'Sales Team Sheet']


In [211]:
# Read Excel file into a Pandas DataFrame
Sales_df = pd.read_excel('US_Regional_Sales_Data.xls', sheet_name='Sales Orders Sheet')
Customers_df = pd.read_excel('US_Regional_Sales_Data.xls', sheet_name='Customers Sheet')
Stores_df = pd.read_excel('US_Regional_Sales_Data.xls', sheet_name='Store Locations Sheet')
Products_df = pd.read_excel('US_Regional_Sales_Data.xls', sheet_name='Products Sheet')
Regions_df = pd.read_excel('US_Regional_Sales_Data.xls', sheet_name='Regions Sheet')
Team_df = pd.read_excel('US_Regional_Sales_Data.xls', sheet_name='Sales Team Sheet')


In [212]:
Sales_df.columns

Index(['OrderNumber', 'Sales Channel', 'WarehouseCode', 'ProcuredDate',
       'OrderDate', 'ShipDate', 'DeliveryDate', 'CurrencyCode', '_SalesTeamID',
       '_CustomerID', '_StoreID', '_ProductID', 'Order Quantity',
       'Discount Applied', 'Unit Price', 'Unit Cost'],
      dtype='object')

In [213]:
Sales_df.shape

(7991, 16)

In [214]:
Sales_df.sample(10)

Unnamed: 0,OrderNumber,Sales Channel,WarehouseCode,ProcuredDate,OrderDate,ShipDate,DeliveryDate,CurrencyCode,_SalesTeamID,_CustomerID,_StoreID,_ProductID,Order Quantity,Discount Applied,Unit Price,Unit Cost
6827,SO - 0006928,Distributor,WARE-PUJ1005,2020-03-10,2020-08-17,2020-08-26,2020-08-31,USD,25,15,269,3,1,0.3,2680.0,1179.2
7512,SO - 0007613,Wholesale,WARE-XYS1001,2020-06-18,2020-11-03,2020-11-11,2020-11-16,USD,28,32,18,45,4,0.2,268.0,152.76
6120,SO - 0006221,In-Store,WARE-UHY1004,2020-03-10,2020-05-27,2020-06-21,2020-06-25,USD,3,50,220,17,1,0.05,2398.6,1127.342
1365,SO - 0001466,Online,WARE-XYS1001,2018-07-19,2018-11-10,2018-11-21,2018-11-26,USD,14,24,6,32,5,0.4,1172.5,715.225
3054,SO - 0003155,Distributor,WARE-MKL1006,2019-02-04,2019-05-24,2019-05-27,2019-05-28,USD,22,38,339,44,7,0.05,2385.2,1240.304
2186,SO - 0002287,In-Store,WARE-XYS1001,2018-10-27,2019-02-07,2019-03-06,2019-03-10,USD,4,24,30,22,2,0.1,187.6,93.8
7954,SO - 0008055,Online,WARE-UHY1004,2020-09-26,2020-12-27,2020-12-29,2021-01-08,USD,18,14,207,32,4,0.075,5554.3,3499.209
146,SO - 000247,Online,WARE-UHY1004,2018-04-10,2018-06-17,2018-07-08,2018-07-09,USD,17,34,217,16,5,0.05,4020.0,2653.2
6350,SO - 0006451,In-Store,WARE-UHY1004,2020-03-10,2020-06-22,2020-07-19,2020-07-22,USD,5,8,256,3,5,0.05,6076.9,4618.444
4747,SO - 0004848,Distributor,WARE-NMK1003,2019-08-23,2019-12-17,2019-12-20,2019-12-21,USD,25,31,97,11,5,0.075,5922.8,2428.348


In [215]:
Sales_df['Recency'] = max(Sales_df['OrderDate'])-Sales_df['OrderDate']

In [216]:
Sales_df.sample(10)


Unnamed: 0,OrderNumber,Sales Channel,WarehouseCode,ProcuredDate,OrderDate,ShipDate,DeliveryDate,CurrencyCode,_SalesTeamID,_CustomerID,_StoreID,_ProductID,Order Quantity,Discount Applied,Unit Price,Unit Cost,Recency
1706,SO - 0001807,Online,WARE-MKL1006,2018-10-27,2018-12-20,2019-01-04,2019-01-07,USD,15,24,333,41,8,0.075,5326.5,4207.935,741 days
627,SO - 000728,Online,WARE-NMK1003,2018-04-10,2018-08-12,2018-08-23,2018-08-29,USD,17,21,125,30,7,0.075,1876.0,1500.8,871 days
5643,SO - 0005744,In-Store,WARE-UHY1004,2019-12-01,2020-03-31,2020-04-20,2020-04-30,USD,7,44,235,45,2,0.1,3155.7,1388.508,274 days
3400,SO - 0003501,Online,WARE-MKL1006,2019-05-15,2019-07-04,2019-07-21,2019-07-28,USD,20,10,350,25,5,0.075,2760.4,1739.052,545 days
6270,SO - 0006371,Online,WARE-NBV1002,2020-03-10,2020-06-12,2020-07-02,2020-07-11,USD,17,27,86,3,6,0.2,2385.2,1025.636,201 days
5712,SO - 0005813,Distributor,WARE-XYS1001,2019-12-01,2020-04-07,2020-04-29,2020-05-07,USD,21,41,3,42,1,0.2,1159.1,788.188,267 days
7747,SO - 0007848,Distributor,WARE-UHY1004,2020-06-18,2020-11-30,2020-12-25,2020-12-30,USD,20,2,221,41,5,0.15,1909.5,1279.365,30 days
615,SO - 000716,Distributor,WARE-NMK1003,2018-04-10,2018-08-10,2018-09-02,2018-09-10,USD,23,16,156,1,4,0.075,3966.4,2816.144,873 days
7341,SO - 0007442,Distributor,WARE-UHY1004,2020-06-18,2020-10-16,2020-10-29,2020-11-07,USD,23,25,207,8,6,0.4,864.3,363.006,75 days
2090,SO - 0002191,Distributor,WARE-MKL1006,2018-10-27,2019-01-27,2019-02-13,2019-02-14,USD,21,37,365,28,2,0.05,6190.8,2538.228,703 days


In [217]:
Sales_df = Sales_df[['_CustomerID', 'Sales Channel','Order Quantity',
       'Discount Applied', 'Unit Price', 'Unit Cost','Recency']]

In [218]:
Sales_df['Recency'] = Sales_df['Recency'].astype('string').str.replace('days', '')
Sales_df['Total Sales'] = Sales_df['Unit Price'] - (Sales_df['Unit Price']*Sales_df['Discount Applied'] - Sales_df['Unit Cost'])


In [219]:
Sales_df.dtypes

_CustomerID                  int64
Sales Channel               object
Order Quantity               int64
Discount Applied           float64
Unit Price                 float64
Unit Cost                  float64
Recency             string[python]
Total Sales                float64
dtype: object

In [220]:
Sales_df.sample(10)

Unnamed: 0,_CustomerID,Sales Channel,Order Quantity,Discount Applied,Unit Price,Unit Cost,Recency,Total Sales
4834,48,Online,8,0.05,5326.5,4367.73,368,9427.905
1785,4,Online,5,0.075,1005.0,653.25,734,1582.875
7046,43,Online,2,0.2,6391.8,4218.588,108,9332.028
1490,7,In-Store,7,0.075,3946.3,1578.52,766,5228.8475
3322,26,Online,2,0.05,2237.8,1566.46,554,3692.37
3389,29,Online,1,0.075,1038.5,810.03,547,1770.6425
5698,39,In-Store,8,0.05,167.5,132.325,268,291.45
4937,13,In-Store,5,0.1,1735.3,1145.298,356,2707.068
2493,36,In-Store,2,0.05,1701.8,1140.206,652,2756.916
918,14,In-Store,5,0.075,1849.2,1331.424,838,3041.934


In [221]:
Sales_df.isnull().sum()

_CustomerID         0
Sales Channel       0
Order Quantity      0
Discount Applied    0
Unit Price          0
Unit Cost           0
Recency             0
Total Sales         0
dtype: int64

In [222]:
# Check  duplicates
duplicates = Sales_df.duplicated(keep='first')
duplicate_rows = Sales_df[duplicates]
duplicate_rows

Unnamed: 0,_CustomerID,Sales Channel,Order Quantity,Discount Applied,Unit Price,Unit Cost,Recency,Total Sales
