## Urban Sales Metrics of Myanmar HUB

This notebook presents plots derived from historical sales data collected from three distinct supermarkets. The original dataset is accessible at the following URL:
 https://www.kaggle.com/datasets/aungpyaeap/supermarket-sales

The plots presented here will be used as baseline to construct an interactive dash app.

In [505]:
#Libraries 
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve, validation_curve

In [506]:
# CSV File in Githubt (added ?raw=true at the end of the URL or it will not parse it correctly) 
df = pd.read_csv("https://github.com/Salvatore-Rocha/Supermarket-sales/blob/002314ff6501373a489db96a35c9bd205fdbff8b/supermarket_sales.csv?raw=true")

#View the first 5 rows of the Data frame
df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [507]:
#Get type of data for each column
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Invoice ID               1000 non-null   object 
 1   Branch                   1000 non-null   object 
 2   City                     1000 non-null   object 
 3   Customer type            1000 non-null   object 
 4   Gender                   1000 non-null   object 
 5   Product line             1000 non-null   object 
 6   Unit price               1000 non-null   float64
 7   Quantity                 1000 non-null   int64  
 8   Tax 5%                   1000 non-null   float64
 9   Total                    1000 non-null   float64
 10  Date                     1000 non-null   object 
 11  Time                     1000 non-null   object 
 12  Payment                  1000 non-null   object 
 13  cogs                     1000 non-null   float64
 14  gross margin percentage  

In [508]:
#Fix Date; object to Datetime
df['Date'] = pd.to_datetime(df['Date'])
df = df.round(2)

#Re-check type of data
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Invoice ID               1000 non-null   object        
 1   Branch                   1000 non-null   object        
 2   City                     1000 non-null   object        
 3   Customer type            1000 non-null   object        
 4   Gender                   1000 non-null   object        
 5   Product line             1000 non-null   object        
 6   Unit price               1000 non-null   float64       
 7   Quantity                 1000 non-null   int64         
 8   Tax 5%                   1000 non-null   float64       
 9   Total                    1000 non-null   float64       
 10  Date                     1000 non-null   datetime64[ns]
 11  Time                     1000 non-null   object        
 12  Payment                  1000 non-n

In [509]:
#Get unique values for relevant columns
for col in df.select_dtypes(include=['object']):
  if col not in ("Invoice ID","Time","Date"):
    print(col, df[col].unique())

Branch ['A' 'C' 'B']
City ['Yangon' 'Naypyitaw' 'Mandalay']
Customer type ['Member' 'Normal']
Gender ['Female' 'Male']
Product line ['Health and beauty' 'Electronic accessories' 'Home and lifestyle'
 'Sports and travel' 'Food and beverages' 'Fashion accessories']
Payment ['Ewallet' 'Cash' 'Credit card']


In [510]:
# Grouping by Date and City, summing numeric columns, calculating total per day, rounding numeric values
dfs  = df.groupby(["Date","City"]).sum(numeric_only=True).reset_index()
dfs['Total_day'] = dfs.groupby('Date')['Total'].transform(sum)
dfs = dfs.round(2)
dfs


The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.



Unnamed: 0,Date,City,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating,Total_day
0,2019-01-01,Mandalay,180.46,24,73.18,1536.70,1463.52,14.28,73.18,22.0,4745.19
1,2019-01-01,Naypyitaw,192.29,20,39.87,837.17,797.30,19.04,39.87,25.5,4745.19
2,2019-01-01,Yangon,287.20,37,112.92,2371.32,2258.40,23.80,112.92,31.5,4745.19
3,2019-01-02,Mandalay,141.46,31,55.36,1162.79,1107.43,19.04,55.36,24.8,1945.49
4,2019-01-02,Naypyitaw,131.49,10,22.65,475.65,453.00,9.52,22.65,12.3,1945.49
...,...,...,...,...,...,...,...,...,...,...,...
258,2019-03-29,Naypyitaw,111.18,14,46.94,985.70,938.76,9.52,46.94,10.7,4023.25
259,2019-03-29,Yangon,143.80,13,46.55,977.61,931.06,9.52,46.55,13.9,4023.25
260,2019-03-30,Mandalay,263.19,19,82.12,1724.46,1642.34,14.28,82.12,16.1,4487.06
261,2019-03-30,Naypyitaw,249.73,25,68.66,1441.98,1373.32,19.04,68.66,27.6,4487.06


In [511]:
#filtering by month
dfs[dfs["Date"].dt.month == 2]

Unnamed: 0,Date,City,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating,Total_day
91,2019-02-01,Naypyitaw,191.34,24,72.98,1532.60,1459.62,14.28,72.98,19.5,2444.54
92,2019-02-01,Yangon,161.43,16,43.43,911.94,868.51,14.28,43.43,21.3,2444.54
93,2019-02-02,Mandalay,192.24,31,76.64,1609.29,1532.65,19.04,76.64,31.7,4140.96
94,2019-02-02,Naypyitaw,374.76,23,71.99,1511.71,1439.72,28.56,71.99,43.3,4140.96
95,2019-02-02,Yangon,123.94,29,48.57,1019.96,971.39,19.04,48.57,25.4,4140.96
...,...,...,...,...,...,...,...,...,...,...,...
169,2019-02-27,Naypyitaw,280.03,16,72.82,1529.36,1456.54,14.28,72.82,25.1,5859.44
170,2019-02-27,Yangon,285.30,38,116.28,2441.70,2325.43,23.80,116.28,34.2,5859.44
171,2019-02-28,Mandalay,101.39,11,23.56,494.76,471.20,9.52,23.56,13.3,2097.02
172,2019-02-28,Naypyitaw,179.18,10,43.71,917.95,874.24,9.52,43.71,14.2,2097.02


In [512]:
# Enable renderers to save plots as svg files; othewise the html plots won't be displayed once uploaded on Github
# Disable renderers to have interactive inline plots in SVC but not visible once uploaded on Github
# pio.renderers.default = "svg"


#app = Dash(__name__)
#Simple example of barplot
fig = px.bar(dfs[dfs["Date"].dt.month == 2],
             x="Date",
             y="Total",
             color="City",
             text = "Total_day",
             barmode='group',
             text_auto=".2s"
             )
fig.show()

In [513]:
#Count "Gender" totals
df['Gender'].value_counts()

Gender
Female    501
Male      499
Name: count, dtype: int64

In [514]:
#Simple example of pie
gender_by_month = df[df["Date"].dt.month == 2]["Gender"].value_counts()
fig = px.pie(gender_by_month,
             values=gender_by_month.values,
             names=gender_by_month.index,
             title='Total by City, February',
             hole=.3)
fig.update_traces(textposition='inside',
                  textinfo='percent+value')
#textinfo: flaglist string. Any combination of "label", "text", "value", "percent" joined with a "+" OR "none".


fig.show()

In [515]:
#Pie of Payment types
payment_by_month = df[df["Date"].dt.month == 2]["Payment"].value_counts()
fig = px.pie(payment_by_month,
             values=payment_by_month.values,
             names=payment_by_month.index,
             title='Total by City, February',
             hole=.3)
fig.update_traces(textposition='inside',
                  textinfo='percent+value')
#textinfo: flaglist string. Any combination of "label", "text", "value", "percent" joined with a "+" OR "none".


fig.show()

In [516]:
#Processing/filtering data (this showcase the logic behind asking parameters like "Date" in the functions within callbacks)
customer_type_by_month = df[df["Date"].dt.month == 2]["Customer type"].value_counts()
df_ct = customer_type_by_month.reset_index()
df_ct.columns = ['Customer Type', 'Count']
#Adding a dummy column for the X-axis (Otherwise it will always be in separated columns) 
df_ct['Customer_t'] = "Customer Type"
df_ct

Unnamed: 0,Customer Type,Count,Customer_t
0,Member,163,Customer Type
1,Normal,140,Customer Type


In [517]:
#Example of a Bar plot with filtered data 
fig = px.bar(df_ct,
             x="Customer_t",
             y="Count",
             color="Customer Type",
             barmode='stack',
             )
fig.update_layout(title='Customer Type in February')
fig.update_xaxes(title='')
fig

In [518]:
df["Product line"].unique()

array(['Health and beauty', 'Electronic accessories',
       'Home and lifestyle', 'Sports and travel', 'Food and beverages',
       'Fashion accessories'], dtype=object)

In [519]:
#Creating a color map for unique values in "Product line" column
solar_colors = ['#FFDF00', '#FF4D00', '#007FFF', '#4F7942', '#FF8000', '#4B0082']
cmap_prod = dict(zip(df["Product line"].unique(),solar_colors))
cmap_prod

{'Health and beauty': '#FFDF00',
 'Electronic accessories': '#FF4D00',
 'Home and lifestyle': '#007FFF',
 'Sports and travel': '#4F7942',
 'Food and beverages': '#FF8000',
 'Fashion accessories': '#4B0082'}

In [520]:
#Creating a color map for unique values in "Gender" column
solar_colors = ['#FFDF00', '#FF4D00', '#007FFF', '#4F7942', '#FF8000', '#4B0082']
cmap_gndr = dict(zip(df["Gender"].unique(),solar_colors[:2]))
cmap_gndr

{'Female': '#FFDF00', 'Male': '#FF4D00'}

In [521]:
#Sankey chart aka Parallel Categories; drafting ideas to visualize data interactions and dependencies
#The colors highlight the inheritance of the categories within the "Product line" column.
filtered = df[df["Date"].dt.month == 2]
fig = px.parallel_categories(filtered, 
                             dimensions=['Gender', 'Product line', 'City'],
                             color=filtered['Product line'].map(cmap_prod), 
                             )
fig.show()

In [522]:
#Sankey chart aka Parallel Categories; drafting ideas to visualize data interactions and dependencies
#The colors highlight the inheritance of the categories within the "Gender" columns. 
filtered = df[df["Date"].dt.month == 2].round()
fig = px.parallel_categories(filtered, 
                             dimensions=['Product line', 'Gender', 'City',"Rating"],
                             color=filtered['Gender'].map(cmap_gndr), 
                             )
fig.show()

In [523]:
#Sankey chart aka Parallel Categories; drafting ideas to visualize data interactions and dependencies
#The colors highlight the inheritance of the categories within the "Rating" column. 
#The rating was discretized/rounded to whole nunmbers
 
filtered = df[df["Date"].dt.month == 2].round()
fig = px.parallel_categories(filtered, 
                             dimensions=['City',"Rating"],
                             color=filtered['Rating'], 
                             color_continuous_scale=px.colors.sequential.Inferno,
                             )
fig.show()

In [524]:
#Example of a Gender vs Rating histogram
fig = px.histogram(df[df["Date"].dt.month == 2], x="Rating", color="Gender", marginal="rug")
fig.show()

In [525]:
#Example of a Gender vs Rating as probability density - histogram
colors = ['#835AF1', '#7FA6EE', '#B8F7D4']
color_map = {
    'Female': '#835AF1',
    'Male': '#B8F7D4'
            }

fig = px.histogram(df, 
                   x="Rating", 
                   color = "Gender",
                   histnorm='probability density',
                   color_discrete_map=color_map,
                   marginal="rug", 
                   nbins=20
                   )
#ig.update_layout(bargap=0.05)
fig

In [526]:
#Data hierarchy - Sunburst Gender > City > Product line
filtered = df[df["Date"].dt.month == 2].round()
fig = px.sunburst(
    filtered,
    path=['Gender',"City",'Product line',],
    values='Total',
)
fig

In [527]:
#Filtered data by Month: February, City: Yangon
#Shows how the data is being filtered within the callbacks and the functions
df[df['Date'].dt.month_name() == "February"][df["City"] == "Yangon"].head()


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.21,634.38,2019-02-08,10:37,Ewallet,604.17,4.76,30.21,5.3
6,355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.65,433.69,2019-02-25,14:36,Ewallet,413.04,4.76,20.65,5.8
12,365-64-0515,A,Yangon,Normal,Female,Electronic accessories,46.95,5,11.74,246.49,2019-02-12,10:25,Ewallet,234.75,4.76,11.74,7.1
13,252-56-2699,A,Yangon,Normal,Male,Food and beverages,43.19,10,21.6,453.5,2019-02-07,16:48,Ewallet,431.9,4.76,21.6,8.2
23,636-48-8204,A,Yangon,Normal,Male,Electronic accessories,34.56,5,8.64,181.44,2019-02-17,11:15,Ewallet,172.8,4.76,8.64,9.9


In [528]:
#Agregating data by Month: February and total Number of Customers per Day
#Shows how the data is being filtered within the callbacks and the functions
dff = df[df['Date'].dt.month_name() == "February"][df["City"] == "Yangon"]
dffs = dff.groupby('Date').agg(Customers_Day=('Customer type', 'count'))
dffs.head()


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0_level_0,Customers_Day
Date,Unnamed: 1_level_1
2019-02-01,3
2019-02-02,4
2019-02-03,5
2019-02-04,5
2019-02-05,5


In [529]:
#Shows how "reset_index" modify the Df 
dffs.reset_index().head()

Unnamed: 0,Date,Customers_Day
0,2019-02-01,3
1,2019-02-02,4
2,2019-02-03,5
3,2019-02-04,5
4,2019-02-05,5


In [530]:
#Plotting agregated data: Daily Customers in February - Yangon City
fig = px.bar(df[df["Date"].dt.month == 2][df["City"] == "Yangon"].groupby('Date').agg(Customers_per_Day=('Customer type', 'count')).reset_index(),
             x="Date",
             y="Customers_per_Day",
             #text = "Customer type",
             barmode='group',
             #text_auto=".2s"
             )
fig.show()


Boolean Series key will be reindexed to match DataFrame index.



In [531]:
#Agregating data by {Month: February} and total Sales per Day
#Shows how the data is being filtered within the callbacks and the functions
df[df["Date"].dt.month_name() == "February"][df["City"] == "Yangon"].groupby('Date').agg(Sales_per_Day=('Total', 'sum')).reset_index().head()


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,Sales_per_Day
0,2019-02-01,911.94
1,2019-02-02,1019.96
2,2019-02-03,2717.36
3,2019-02-04,1239.39
4,2019-02-05,568.43


In [532]:
#Plotting agregated data: Daily Sales in February - Yangon City
dffs = df[df["Date"].dt.month_name() == "February"][df["City"] == "Yangon"].groupby('Date').agg(Sales=('Total', 'sum')).reset_index()
fig = px.bar(dffs,
             x="Date",
             y="Sales",
             barmode='group',
             #template = "flatly"
             )
fig.show()


Boolean Series key will be reindexed to match DataFrame index.



In [533]:
#Agregating data by {Month: February} and {Total: Sales, Gross_Income & Cogs}
#Shows how the data is being filtered within the callbacks, functions and plots.
#Note that: Sales == COGS + Gross_Income
dffs = df[df["Date"].dt.month_name() == "February"][df["City"] == "Yangon"].groupby('Date').agg(Sales=('Total', 'sum'),
                                                                                                Gross_Income = ('gross income', 'sum'),
                                                                                                COGS = ('cogs', 'sum') ).reset_index()
dffs.head()


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,Sales,Gross_Income,COGS
0,2019-02-01,911.94,43.43,868.51
1,2019-02-02,1019.96,48.57,971.39
2,2019-02-03,2717.36,129.4,2587.96
3,2019-02-04,1239.39,59.02,1180.37
4,2019-02-05,568.43,27.06,541.36


In [534]:
#Simple barplot: Sales - February - Yangon
fig1 = px.bar(dffs,
             x="Date",
             y=["Sales"],
             barmode='group',
             #template = "flatly"
             )
fig1

In [535]:
#Simple barplot: COGS & Gross_Income - February - Yangon
color_map = {
    'Gross_Income': 'green',
    'COGS': 'orange'
            }   

fig2 = px.bar(dffs,
             x="Date",
             y=["Gross_Income","COGS"],
             barmode='stack',
             color_discrete_map=color_map
             )
fig2

In [536]:
#Combining the previous barplots; solution taken from: https://stackoverflow.com/questions/70563166/stacked-barplot-in-plotly
fig3 = go.Figure(
            data=[
                go.Bar(
                    name="Sales",
                    x=dffs["Date"],
                    y=dffs["Sales"],
                    offsetgroup=0,
                ),
                go.Bar(
                    name="COGS",
                    x=dffs["Date"],
                    y=dffs["COGS"],
                    offsetgroup=1,
                ),
                go.Bar(
                    name="Income",
                    x=dffs["Date"],
                    y=dffs["Gross_Income"],
                    offsetgroup=1,
                    base=dffs["COGS"], 
                )
                ],
                layout=go.Layout(
                    title="Sales, COGS & Gross Income",
                    yaxis_title="Value"
                )
                )
fig3.show()

#Unfortunately this breaks the dash app when trying to render it  ))))))))))))))):

In [537]:
#Agregating data by {Month: February} and total Sales per Day by {Product line: Category}
#Shows how the data is being filtered within the callbacks and the functions
dffs = df[df["Date"].dt.month_name() == "February"][df["City"] == "Yangon"].groupby(['Date',"Product line"]).agg(Sales=('Total', 'sum'),
                                                                                                #Product_type = ('Product line', 'count')
                                                                                                ).reset_index()
dffs.head()


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,Product line,Sales
0,2019-02-01,Food and beverages,74.71
1,2019-02-01,Health and beauty,326.26
2,2019-02-01,Sports and travel,510.97
3,2019-02-02,Fashion accessories,452.86
4,2019-02-02,Food and beverages,115.08


In [538]:
#Plotting agregated data: Daily Sales by {Product line} in {February} - {Yangon City}
fig = px.bar(dffs,
             x="Date",
             y="Sales",
             color= "Product line",
             barmode='stack',
             )
fig

In [539]:
#Total gross income for February in Mandalay...
dff = df[df["Date"].dt.month_name() == "February"][df["City"] == "Mandalay"]
dff["gross income"].sum().round()



Boolean Series key will be reindexed to match DataFrame index.



1639.0

In [540]:
#Month = February, City = Mandalay
dffs = dff.groupby(['Gender']).agg(Genders=('Gender', 'count')).reset_index()
dffs   

Unnamed: 0,Gender,Genders
0,Female,55
1,Male,54


In [541]:
#Example of a simple pie chart by Genders in Mandalay - February
fig = px.pie(dffs,
             values= "Genders",
             names="Gender",
             title='Genders in Mandalay, February',
             hole=.3)
fig.update_traces(textposition='inside',
                  textinfo='percent+value')

fig 

In [542]:
#Grouping by Month = February, City = Mandalay, Pyament Type: Count
dffs = dff.groupby(['Payment']).agg(Type=('Payment', 'count')).reset_index()
dffs   

Unnamed: 0,Payment,Type
0,Cash,44
1,Credit card,30
2,Ewallet,35


In [543]:
#Example of a simple pie chart of Payment Type in Mandalay - February
fig = px.pie(dffs,
             values= "Type",
             names="Payment",
             title='Type of Payment, Mandalay - February',
             hole=.3)
fig.update_traces(textposition='inside',
                  textinfo='percent+value')

fig 

In [544]:
dffs = dff.groupby(['Customer type']).agg(Type_c=('Customer type', 'count')).reset_index()
dffs["Type"] = "Type"
dffs

Unnamed: 0,Customer type,Type_c,Type
0,Member,56,Type
1,Normal,53,Type


In [545]:
#Agregating data by Customer Type. Computing text: Count + Relative % for each group (to show as label in a plot)
total_count = dffs['Type_c'].sum()
dffs['Percentage'] = (dffs['Type_c'] / total_count) * 100
dffs['Percentage'] = dffs['Percentage'].round(2).astype(str) + '%'
dffs["Text"] = dffs['Type_c'].astype(str) + ' (' + dffs['Percentage'] + ')'
dffs

Unnamed: 0,Customer type,Type_c,Type,Percentage,Text
0,Member,56,Type,51.38%,56 (51.38%)
1,Normal,53,Type,48.62%,53 (48.62%)


In [546]:
fig = px.bar(dffs,
             y="Type",
             x="Type_c",
             color="Customer type",
             orientation= "h",
             text = "Text",
             barmode='stack',
             )
fig.update_layout(title='Customer Type in February')
fig.update_xaxes(title='')
fig.update_yaxes(title='')
fig

In [547]:
#Categorizing data "Sunburst Plot" path: {Gender > Pyment type}, in {Yangon City - Month: February}
dff
fig = px.sunburst(
    dff,
    path=['Gender',"Payment"],
    values='Total',
)
fig

In [548]:
#Categorizing data "Treemap Plot", path: {Pyment type > Gender}, in {Yangon City - Month: February}
fig = px.treemap(dff, 
                 path=["Payment",'Gender'], 
                 values='Total')
fig

In [549]:
#Categorizing data "Treemap Plot", path: {Product line}, in {Yangon City - Month: February}
fig = px.treemap(dff, 
                 path=["Product line"], 
                 values='Total',
                 )
fig.update_traces(textinfo='label+value')
fig

In [550]:
# Drafting ideas on how to encode the data for a prediction model, and to utilize in more complex plots (that uses code numbers to
# plot categorical values)
# Define mapping for categories to numerical values within the range(-3, 4)
category_mapping = {'Yangon': -1, 
                    'Naypyitaw': 0,
                    'Mandalay': 1, 
                    'Female': 5, 
                    'Male': -1, 
                    'Health and beauty': -2, 
                    'Electronic accessories':-1,
                    'Home and lifestyle':0 ,
                    'Sports and travel':1 ,
                    'Food and beverages': 2 ,
                    'Fashion accessories': 3}

# Apply mapping to categorical columns
mapped_data = pd.DataFrame()
dff = df.copy()
for column in ["City","Gender","Product line"]:
    new_column_name = column + '_map'
    mapped_data[new_column_name] = dff[column].map(category_mapping)
    mapped_data[column] = dff[column]
    
mapped_data["Total"] = dff["Total"]
mapped_data["Rating"] = dff["Rating"]


mapped_data

Unnamed: 0,City_map,City,Gender_map,Gender,Product line_map,Product line,Total,Rating
0,-1,Yangon,5,Female,-2,Health and beauty,548.97,9.1
1,0,Naypyitaw,5,Female,-1,Electronic accessories,80.22,9.6
2,-1,Yangon,-1,Male,0,Home and lifestyle,340.53,7.4
3,-1,Yangon,-1,Male,-2,Health and beauty,489.05,8.4
4,-1,Yangon,-1,Male,1,Sports and travel,634.38,5.3
...,...,...,...,...,...,...,...,...
995,0,Naypyitaw,-1,Male,-2,Health and beauty,42.37,6.2
996,1,Mandalay,5,Female,0,Home and lifestyle,1022.49,4.4
997,-1,Yangon,-1,Male,2,Food and beverages,33.43,7.7
998,-1,Yangon,-1,Male,0,Home and lifestyle,69.11,4.1


In [551]:
#Creating a column with the hour of the day form "Time" column
mapped_data["Time_hour"] = df['Time'].str.split(':').str[0].astype(int)
mapped_data.head()

Unnamed: 0,City_map,City,Gender_map,Gender,Product line_map,Product line,Total,Rating,Time_hour
0,-1,Yangon,5,Female,-2,Health and beauty,548.97,9.1,13
1,0,Naypyitaw,5,Female,-1,Electronic accessories,80.22,9.6,10
2,-1,Yangon,-1,Male,0,Home and lifestyle,340.53,7.4,13
3,-1,Yangon,-1,Male,-2,Health and beauty,489.05,8.4,20
4,-1,Yangon,-1,Male,1,Sports and travel,634.38,5.3,10


In [552]:
#Drafting idea: How to show hierarchy, inheritence and relationship of the data
#Idea {Parallel_coordinates}: by Time of the Day of purchase vs Rating vs City of the Purchase.
#Color gradient by {Time_hour}: Does the time of day have infuence over ratings? Is the same for all cities?
 
fig = px.parallel_coordinates(mapped_data, color="Time_hour",
                              dimensions=["Time_hour","Rating","City_map"],
                              color_continuous_scale=px.colors.diverging.delta_r,
                              color_continuous_midpoint= 15)
fig

In [553]:
#Drafting idea: How to show hierarchy, inheritence and relationship of the data
#Idea {treemap}: path {Product line > City > Gender}
#Does gender, city or product line infuence over user's ratings? 
#Note: Ratings is being agreegated by {sum} and NOT {average}; this is just a draft idea.
fig = px.treemap(dff, 
                 path=["Product line","City","Gender"], 
                 values='Rating',
                 )
fig.update_traces(textinfo='label+value')

In [554]:
#Creating a new column to simplifiy analysis; instead of individual hours I agregated it to turn shitfs: Morning, Evening and Night
dff = df.copy()

# Convert 'Date' column to datetime format
dff['Date'] = pd.to_datetime(dff['Date'])

# Extract day of the week from 'Date' column
dff['Date'] = dff['Date'].dt.day_name()

# Function to categorize hours into Morning, Evening, and Night
def categorize_hour(hour):
    if 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Evening'
    else:
        return 'Night'
    
# Convert 'Hour' column to datetime format and extract hour
dff['Time'] = pd.to_datetime(dff['Time'], format='%H:%M').dt.hour

# Apply categorization function to 'Hour' column
dff['Time'] = dff['Time'].apply(categorize_hour)

dff.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.14,548.97,Saturday,Evening,Ewallet,522.83,4.76,26.14,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,Friday,Morning,Cash,76.4,4.76,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.22,340.53,Sunday,Evening,Credit card,324.31,4.76,16.22,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.29,489.05,Sunday,Night,Ewallet,465.76,4.76,23.29,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.21,634.38,Friday,Morning,Ewallet,604.17,4.76,30.21,5.3


In [555]:
#Perform one-hot encoding for categorical columns
#Name of columns with relevant information: ['Date', 'Time', 'Gender', 'Product line', 'City', 'Customer type', 'Payment']
dummies_cols = ['Date', 'Time', 'Gender', 'Product line', 'City', 'Customer type', 'Payment']
dffs = pd.get_dummies(dff, columns= dummies_cols)

dffs.head()

Unnamed: 0,Invoice ID,Branch,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating,...,Product line_Home and lifestyle,Product line_Sports and travel,City_Mandalay,City_Naypyitaw,City_Yangon,Customer type_Member,Customer type_Normal,Payment_Cash,Payment_Credit card,Payment_Ewallet
0,750-67-8428,A,74.69,7,26.14,548.97,522.83,4.76,26.14,9.1,...,False,False,False,False,True,True,False,False,False,True
1,226-31-3081,C,15.28,5,3.82,80.22,76.4,4.76,3.82,9.6,...,False,False,False,True,False,False,True,True,False,False
2,631-41-3108,A,46.33,7,16.22,340.53,324.31,4.76,16.22,7.4,...,True,False,False,False,True,False,True,False,True,False
3,123-19-1176,A,58.22,8,23.29,489.05,465.76,4.76,23.29,8.4,...,False,False,False,False,True,True,False,False,False,True
4,373-73-7910,A,86.31,7,30.21,634.38,604.17,4.76,30.21,5.3,...,False,True,False,False,True,False,True,False,False,True


In [556]:
#Showing only the one-hot encoding. Total columns from original Df is 17; one hot encoding removes the original column, hence the formula
#17-len(dummies_cols) is used 
dffs.iloc[:,17-len(dummies_cols):]

Unnamed: 0,Date_Friday,Date_Monday,Date_Saturday,Date_Sunday,Date_Thursday,Date_Tuesday,Date_Wednesday,Time_Evening,Time_Morning,Time_Night,...,Product line_Home and lifestyle,Product line_Sports and travel,City_Mandalay,City_Naypyitaw,City_Yangon,Customer type_Member,Customer type_Normal,Payment_Cash,Payment_Credit card,Payment_Ewallet
0,False,False,True,False,False,False,False,True,False,False,...,False,False,False,False,True,True,False,False,False,True
1,True,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,True,True,False,False
2,False,False,False,True,False,False,False,True,False,False,...,True,False,False,False,True,False,True,False,True,False
3,False,False,False,True,False,False,False,False,False,True,...,False,False,False,False,True,True,False,False,False,True
4,True,False,False,False,False,False,False,False,True,False,...,False,True,False,False,True,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,True
996,False,False,True,False,False,False,False,True,False,False,...,True,False,True,False,False,False,True,False,False,True
997,False,False,True,False,False,False,False,True,False,False,...,False,False,False,False,True,True,False,True,False,False
998,True,False,False,False,False,False,False,True,False,False,...,True,False,False,False,True,False,True,True,False,False


In [557]:
def data_encoding(df,_columns, target_value):
    #Columns: 'Day_of_week', 'Hour', 'Gender', 'Product line', 'City', 'Customer type', 'Payment'
    dff = df.copy()

    if 'Date' in _columns:
        # Extract day of the week from 'Date' column
        dff['Date'] = df['Date'].dt.day_name()
    
    if "Time" in _columns:
        # Convert 'Hour' column to datetime format and extract hour
        dff['Time'] = pd.to_datetime(dff['Time'], format='%H:%M').dt.hour

        # Apply categorization function to 'Hour' column
        dff['Time'] = dff['Time'].apply(categorize_hour)
    
    # Perform one-hot encoding for categorical columns
    dff = pd.get_dummies(dff, columns= _columns)
       
    #Simplify
    dff = dff.iloc[:,17-len(_columns):]
    dff[target_value] = df[target_value]
    
    return dff

data_encoding(df,['Product line', 'Customer type',"City"], "Total").head()

Unnamed: 0,Product line_Electronic accessories,Product line_Fashion accessories,Product line_Food and beverages,Product line_Health and beauty,Product line_Home and lifestyle,Product line_Sports and travel,Customer type_Member,Customer type_Normal,City_Mandalay,City_Naypyitaw,City_Yangon,Total
0,False,False,False,True,False,False,True,False,False,False,True,548.97
1,True,False,False,False,False,False,False,True,False,True,False,80.22
2,False,False,False,False,True,False,False,True,False,False,True,340.53
3,False,False,False,True,False,False,True,False,False,False,True,489.05
4,False,False,False,False,False,True,False,True,False,False,True,634.38


In [558]:
# Scatter plot Sales vs Rating; Category color: Gender
#Is there any pattern on sales/rating based on gender?
fig = px.scatter(df, x='Total', y='Rating', color='Gender',marginal_y='histogram', marginal_x='histogram')

# Showing the plot
fig.show()

In [559]:
#Drafting Ideas: Density plots by Gender of Sales vs Rating
#Is there any pattern on sales/rating based on gender?

# Create separate plots for each gender
# Create separate plots for each gender
for gender_value in df['Gender'].unique():
    filtered_df = df[df['Gender'] == gender_value]
    
    fig = go.Figure()
    fig.add_trace(go.Histogram2dContour(
            x=filtered_df['Total'],
            y=filtered_df['Rating'],
            colorscale='RdPu',  # Pink/Purple/Red colorscale
            reversescale=True,
            xaxis='x',
            yaxis='y'
        ))
    fig.add_trace(go.Scatter(
            x=filtered_df['Total'],
            y=filtered_df['Rating'],
            xaxis='x',
            yaxis='y',
            mode='markers',
            marker=dict(
                color='rgba(0,0,0,0.3)',  # Adjusting the transparency
                size=3
            )
        ))
    fig.add_trace(go.Histogram(
            y=filtered_df['Rating'],
            xaxis='x2',
            marker=dict(
                color='rgba(128,0,128,0.8)'  # Purple color with transparency 0.8
            )
        ))
    fig.add_trace(go.Histogram(
            x=filtered_df['Total'],
            yaxis='y2',
            marker=dict(
                color='rgba(128,0,128,0.8)'  # Purple color with transparency 0.8
            )
        ))

    fig.update_layout(
        autosize=False,
        xaxis=dict(
            zeroline=False,
            domain=[0, 0.85],
            showgrid=False
        ),
        yaxis=dict(
            zeroline=False,
            domain=[0, 0.85],
            showgrid=False
        ),
        xaxis2=dict(
            zeroline=False,
            domain=[0.85, 1],
            showgrid=False
        ),
        yaxis2=dict(
            zeroline=False,
            domain=[0.85, 1],
            showgrid=False
        ),
        height=600,
        width=600,
        bargap=0,
        hovermode='closest',
        showlegend=False,
        title=f'Gender: {gender_value}'
    )

    fig.show()

In [560]:
# Feature importance analisis 
# Splitting the data into features (X) and target variable (y)
X = dffs[['Date_Friday', 'Date_Monday', 'Date_Saturday', 'Date_Sunday',
       'Date_Thursday', 'Date_Tuesday', 'Date_Wednesday', 'Time_Evening',
       'Time_Morning', 'Time_Night', 'Product line_Electronic accessories',
       'Product line_Fashion accessories', 'Product line_Food and beverages',
       'Product line_Health and beauty', 'Product line_Home and lifestyle',
       'Product line_Sports and travel', 'Customer type_Member',
       'Customer type_Normal', 'Payment_Cash', 'Payment_Credit card',
       'Payment_Ewallet']]  # Features
y = dffs['Rating']  # Target variable

# Create a random forest regressor object
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to your data
rf_regressor.fit(X, y)

# Get feature importances
feature_importances = rf_regressor.feature_importances_

# Create a dataframe to store feature importances along with their corresponding column names
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the dataframe by feature importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=True)

# Plotting feature importances using Plotly
fig = go.Figure()

fig.add_trace(go.Bar(
    y=feature_importance_df['Feature'],
    x=feature_importance_df['Importance'],
    orientation='h',
    marker=dict(color='skyblue'),
))

fig.update_layout(
    title='Feature Importance Analysis',
    xaxis_title='Feature Importance',
    yaxis_title='',
)

fig.show()

# Optionally, you can print the feature importance dataframe
print(feature_importance_df)

                                Feature  Importance
12      Product line_Food and beverages    0.035509
6                        Date_Wednesday    0.036963
4                         Date_Thursday    0.039515
1                           Date_Monday    0.040695
3                           Date_Sunday    0.040822
5                          Date_Tuesday    0.041103
2                         Date_Saturday    0.041808
0                           Date_Friday    0.043066
11     Product line_Fashion accessories    0.046466
15       Product line_Sports and travel    0.046935
14      Product line_Home and lifestyle    0.047249
16                 Customer type_Member    0.048153
10  Product line_Electronic accessories    0.050434
17                 Customer type_Normal    0.051773
13       Product line_Health and beauty    0.052471
20                      Payment_Ewallet    0.052650
8                          Time_Morning    0.052704
19                  Payment_Credit card    0.055371
18          

In [562]:
# Plot the learning curve
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=train_sizes, y=train_scores_mean,
    mode='lines+markers',
    name='Training score',
    line=dict(color='#18BC9C'),
    hovertemplate='Training Error %{y}<br> Num.Samples: %{x:.0f}<extra></extra>'
))

fig.add_trace(go.Scatter(
    x=train_sizes, y=test_scores_mean,
    mode='lines+markers',
    name='Cross-validation score',
    line=dict(color='#3498DB'),
    hovertemplate='CrossVal Error %{y}<br> Samples: %{x:.0f}<extra></extra>'
))

# Fill between the upper and lower bounds of the scores
fig.add_trace(go.Scatter(
    x=train_sizes, y=train_scores_mean + train_scores_std,
    mode='lines',
    line=dict(width=0),
    name="",
    hovertemplate='<extra></extra>',
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=train_sizes, y=train_scores_mean - train_scores_std,
    mode='lines',
    line=dict(width=0),
    name="",
    hovertemplate='<extra></extra>',
    fill='tonexty',
    showlegend=False,
    fillcolor='rgba(24, 188, 156, 0.2)'
))

fig.add_trace(go.Scatter(
    x=train_sizes, y=test_scores_mean + test_scores_std,
    mode='lines',
    name="",
    hovertemplate='<extra></extra>',
    line=dict(width=0),
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=train_sizes, y=test_scores_mean - test_scores_std,
    mode='lines',
    line=dict(width=0),
    fill='tonexty',
    name="",
    hovertemplate='<extra></extra>',
    showlegend=False,
    fillcolor='rgba(52, 152, 219, 0.2)'
))

fig.update_layout(
    title={'text': '<i>Learning Curve</i>', 'font': {'size': 15, "color":"#34495E"}},
    yaxis_title='Mean Squared Error',
    xaxis_title='Sample Size',
    legend=dict(orientation='h', yanchor='bottom', xanchor='right',y=1.02, x=1),
    showlegend=True
                )
fig.show()


In [563]:
# Plot the validation curve
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=param_range, y=train_scores_mean,
    mode='lines+markers',
    name='Training score',
    line=dict(color='#669BBC'),
    hovertemplate='Error training %{y}<br> Samples: %{x:.0f}<extra></extra>'
))

fig.add_trace(go.Scatter(
    x=param_range, y=test_scores_mean,
    mode='lines+markers',
    name='Cross-validation score',
    line=dict(color='#F39C12'),
    hovertemplate='Error CV %{y}<br> Samples: %{x:.0f}<extra></extra>'
))

# Fill between the upper and lower bounds of the scores
fig.add_trace(go.Scatter(
    x=param_range, y=train_scores_mean + train_scores_std,
    mode='lines',
    name="",
    hovertemplate='<extra></extra>',
    line=dict(width=0),
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=param_range, y=train_scores_mean - train_scores_std,
    mode='lines',
    line=dict(width=0),
    name="",
    hovertemplate='<extra></extra>',
    fill='tonexty',
    showlegend=False,
    fillcolor='rgba(102, 155, 188, 0.2)'
))

fig.add_trace(go.Scatter(
    x=param_range, y=test_scores_mean + test_scores_std,
    mode='lines',
    name="",
    hovertemplate='<extra></extra>',
    line=dict(width=0),
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=param_range, y=test_scores_mean - test_scores_std,
    mode='lines',
    line=dict(width=0),
    name="",
    hovertemplate='<extra></extra>',
    fill='tonexty',
    showlegend=False,
    fillcolor='rgba(243, 156, 18, 0.2)'
))

fig.update_layout(
    title={'text': '<i>Validation Curve</i>', 'font': {'size': 15, "color":"#34495E"}},
    xaxis_title='<i>Number of Estimators</i>',
    xaxis_title_font=dict(size=10),
    yaxis_title='Mean Squared Error',
    legend=dict(orientation='h', yanchor='bottom', xanchor='right',y=1.02, x=1),
    showlegend=True
                )


fig.show()

In [569]:
#How good is the model?
model = rf_regressor
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Create a DataFrame with the actual and predicted values
results = pd.DataFrame({'y_real': y_test, 'y_pred': y_pred})

# Sort the DataFrame by the actual values
results = results.sort_values(by='y_real').reset_index(drop=True)

# Plot using Plotly
fig = go.Figure()

# Add the actual values trace
fig.add_trace(go.Scatter(x=list(range(len(results))),
                         y=results['y_real'],
                         mode='lines',
                         name='Actual Values'))

# Add the predicted values trace
fig.add_trace(go.Scatter(x=list(range(len(results))),
                         y=results['y_pred'],
                         mode='lines',
                         name='Predicted Values'))

# Update layout
fig.update_layout(title='Actual vs Predicted Values',
                  xaxis_title='Index',
                  yaxis_title='Value',
                  legend_title='Legend')

fig.show()