### Import and Configuration

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import pathlib as pl

pd.reset_option("display.max_rows", None)

FIGURES_DIR = pl.Path.cwd()/ "figures"

### Data Loading

In [97]:
def load_data():
    """Load the Data"""
    CWD = pl.Path.cwd()
    global DATA_DIR
    DATA_DIR = CWD / "Data"
    DATA_PATH = DATA_DIR / "Raw"/ "Business_sales_EDA.csv"
    global df
    df = pd.read_csv(DATA_PATH, sep = ";")  # `sep` is same as `delimiter`
    return df

In [98]:
load_data()

Unnamed: 0,Product ID,Product Position,Promotion,Product Category,Seasonal,Sales Volume,brand,url,name,description,price,currency,terms,section,season,material,origin
0,185102,Aisle,Yes,clothing,Yes,1243,Zara,https://www.zara.com/us/en/basic-puffer-jacket...,BASIC PUFFER JACKET,Puffer jacket made of tear-resistant ripstop f...,78.99,USD,jackets,MAN,Winter,Polyester,Brazil
1,188771,Aisle,Yes,clothing,No,1429,Zara,https://www.zara.com/us/en/tuxedo-jacket-p0889...,TUXEDO JACKET,Straight fit blazer. Pointed lapel collar and ...,14.99,USD,jackets,MAN,Autumn,Cotton,Turkey
2,180176,End-cap,Yes,clothing,Yes,1168,Zara,https://www.zara.com/us/en/slim-fit-suit-jacke...,SLIM FIT SUIT JACKET,Slim fit jacket. Notched lapel collar. Long sl...,71.95,USD,jackets,WOMAN,Autumn,Polyester,Morocco
3,112917,Aisle,Yes,clothing,No,1348,Zara,https://www.zara.com/us/en/stretch-suit-jacket...,STRETCH SUIT JACKET,Slim fit jacket made of viscose blend fabric. ...,30.99,USD,jackets,MAN,Spring,Polyester,China
4,192936,End-cap,Yes,clothing,Yes,1602,Zara,https://www.zara.com/us/en/double-faced-jacket...,DOUBLE FACED JACKET,Jacket made of faux leather faux shearling wit...,22.99,USD,jackets,WOMAN,Winter,Wool Blend,China
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20247,219627,Front of Store,Yes,clothing,No,1754,Zara,https://www.zara.com/us/en/suit-jacket-in-100-...,CROPPED WASHED T-SHIRT CHARCOAL,Running shoes. Upper in a combination of piece...,31.95,USD,jeans,WOMAN,Summer,Linen Blend,India
20248,219628,Aisle,No,clothing,No,872,Zara,https://www.zara.com/us/en/fleece-overshirt-p0...,SATIN WOVEN LEATHER SLIDES STONE,Slim fit shirt. Round neck and short sleeves.,49.99,USD,jackets,WOMAN,Spring,Linen,China
20249,219629,Aisle,Yes,clothing,No,1360,Zara,https://www.zara.com/us/en/faux-suede-patch-ja...,RELAXED CROPPED LEATHER JACKET CHARCOAL,Ankle boots. Made of leather with a suede fini...,20.99,USD,shoes,WOMAN,Spring,Polyester,China
20250,219630,Aisle,No,clothing,No,892,Zara,https://www.zara.com/us/en/fine-knit-crop-swea...,SLIM BASIC 100% WOOL SWEATER BURGUNDY,RETRO SNEAKERS,64.95,USD,jackets,WOMAN,Winter,Polyester,Spain


### Data Cleaning & Preprocessing

In [99]:
def process_data():
    """Clean data"""
    global df
    df = df.drop(columns = ["url","brand", "Product Category", "currency"])
    df.isna().sum()
    df = df.dropna()
    df.columns = ["Product ID",	"Product Position",	"Promotion", "Seasonal", "Sales Volume", "Name", "Description", "Price", "Products", "Section", "Season", "Material", "Manufacturing Country"]
    df["Sales Volume"] = df["Sales Volume"].astype("uint32")
    df[["Product Position","Promotion", "Seasonal", "Name", "Description", "Products", "Section", "Season", "Material", "Manufacturing Country"]] = df[["Product Position","Promotion", "Seasonal", "Name", "Description", "Products", "Section", "Season", "Material", "Manufacturing Country"]].astype("string")
    return df

In [100]:
process_data()

Unnamed: 0,Product ID,Product Position,Promotion,Seasonal,Sales Volume,Name,Description,Price,Products,Section,Season,Material,Manufacturing Country
0,185102,Aisle,Yes,Yes,1243,BASIC PUFFER JACKET,Puffer jacket made of tear-resistant ripstop f...,78.99,jackets,MAN,Winter,Polyester,Brazil
1,188771,Aisle,Yes,No,1429,TUXEDO JACKET,Straight fit blazer. Pointed lapel collar and ...,14.99,jackets,MAN,Autumn,Cotton,Turkey
2,180176,End-cap,Yes,Yes,1168,SLIM FIT SUIT JACKET,Slim fit jacket. Notched lapel collar. Long sl...,71.95,jackets,WOMAN,Autumn,Polyester,Morocco
3,112917,Aisle,Yes,No,1348,STRETCH SUIT JACKET,Slim fit jacket made of viscose blend fabric. ...,30.99,jackets,MAN,Spring,Polyester,China
4,192936,End-cap,Yes,Yes,1602,DOUBLE FACED JACKET,Jacket made of faux leather faux shearling wit...,22.99,jackets,WOMAN,Winter,Wool Blend,China
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20247,219627,Front of Store,Yes,No,1754,CROPPED WASHED T-SHIRT CHARCOAL,Running shoes. Upper in a combination of piece...,31.95,jeans,WOMAN,Summer,Linen Blend,India
20248,219628,Aisle,No,No,872,SATIN WOVEN LEATHER SLIDES STONE,Slim fit shirt. Round neck and short sleeves.,49.99,jackets,WOMAN,Spring,Linen,China
20249,219629,Aisle,Yes,No,1360,RELAXED CROPPED LEATHER JACKET CHARCOAL,Ankle boots. Made of leather with a suede fini...,20.99,shoes,WOMAN,Spring,Polyester,China
20250,219630,Aisle,No,No,892,SLIM BASIC 100% WOOL SWEATER BURGUNDY,RETRO SNEAKERS,64.95,jackets,WOMAN,Winter,Polyester,Spain


#### Feature Engineer

In [101]:

def add_features():
    """Add specific features to the data"""
    global df
    df["Revenue"] = df["Sales Volume"] * df["Price"]    # New column revenue
    df["Price Segment"] = pd.cut(x = df["Price"], bins = [0, 40, 80, 140], labels = ["Low", "Mid", "High"])  # Segmentation of Price
    df["Price Segment"] = df["Price Segment"].astype(str)   # Coversion of Price Segment to string
    df.to_csv(DATA_DIR/"Processed"/ "Business_sales_proc.csv")  # Saving the processed data to filepath
    return df

In [102]:
add_features()

Unnamed: 0,Product ID,Product Position,Promotion,Seasonal,Sales Volume,Name,Description,Price,Products,Section,Season,Material,Manufacturing Country,Revenue,Price Segment
0,185102,Aisle,Yes,Yes,1243,BASIC PUFFER JACKET,Puffer jacket made of tear-resistant ripstop f...,78.99,jackets,MAN,Winter,Polyester,Brazil,98184.57,Mid
1,188771,Aisle,Yes,No,1429,TUXEDO JACKET,Straight fit blazer. Pointed lapel collar and ...,14.99,jackets,MAN,Autumn,Cotton,Turkey,21420.71,Low
2,180176,End-cap,Yes,Yes,1168,SLIM FIT SUIT JACKET,Slim fit jacket. Notched lapel collar. Long sl...,71.95,jackets,WOMAN,Autumn,Polyester,Morocco,84037.60,Mid
3,112917,Aisle,Yes,No,1348,STRETCH SUIT JACKET,Slim fit jacket made of viscose blend fabric. ...,30.99,jackets,MAN,Spring,Polyester,China,41774.52,Low
4,192936,End-cap,Yes,Yes,1602,DOUBLE FACED JACKET,Jacket made of faux leather faux shearling wit...,22.99,jackets,WOMAN,Winter,Wool Blend,China,36829.98,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20247,219627,Front of Store,Yes,No,1754,CROPPED WASHED T-SHIRT CHARCOAL,Running shoes. Upper in a combination of piece...,31.95,jeans,WOMAN,Summer,Linen Blend,India,56040.30,Low
20248,219628,Aisle,No,No,872,SATIN WOVEN LEATHER SLIDES STONE,Slim fit shirt. Round neck and short sleeves.,49.99,jackets,WOMAN,Spring,Linen,China,43591.28,Mid
20249,219629,Aisle,Yes,No,1360,RELAXED CROPPED LEATHER JACKET CHARCOAL,Ankle boots. Made of leather with a suede fini...,20.99,shoes,WOMAN,Spring,Polyester,China,28546.40,Low
20250,219630,Aisle,No,No,892,SLIM BASIC 100% WOOL SWEATER BURGUNDY,RETRO SNEAKERS,64.95,jackets,WOMAN,Winter,Polyester,Spain,57935.40,Mid


### Exploratory Data Analysis

In [103]:
def percent(x,y,z) -> float:
    """Percent formula"""
    # Convert to float to avoid overflow in subtraction with unsigned integers
    x = float(x)
    y = float(y)
    z = float(z)
    formula = f"{np.round((((x) - (y))/(z))*100, decimals = 1)}%"
    return formula

In [104]:
# Basic information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20250 entries, 0 to 20251
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Product ID             20250 non-null  int64  
 1   Product Position       20250 non-null  string 
 2   Promotion              20250 non-null  string 
 3   Seasonal               20250 non-null  string 
 4   Sales Volume           20250 non-null  uint32 
 5   Name                   20250 non-null  string 
 6   Description            20250 non-null  string 
 7   Price                  20250 non-null  float64
 8   Products               20250 non-null  string 
 9   Section                20250 non-null  string 
 10  Season                 20250 non-null  string 
 11  Material               20250 non-null  string 
 12  Manufacturing Country  20250 non-null  string 
 13  Revenue                20250 non-null  float64
 14  Price Segment          20250 non-null  object 
dtypes: floa

In [105]:
# Statistical summary of the dataframe
df.describe(include = "all")

Unnamed: 0,Product ID,Product Position,Promotion,Seasonal,Sales Volume,Name,Description,Price,Products,Section,Season,Material,Manufacturing Country,Revenue,Price Segment
count,20250.0,20250,20250,20250,20250.0,20250,20250,20250.0,20250,20250,20250,20250,20250,20250.0,20250
unique,,3,2,2,,17215,221,,5,2,4,11,12,,3
top,,Aisle,No,No,,PLAID OVERSHIRT,Varsity jacket with elastic collar and long sl...,,jackets,WOMAN,Autumn,Cotton,China,,Low
freq,,7810,11810,10136,,8,333,,11230,13253,7664,3850,4026,,12124
mean,208935.993383,,,,1097.428148,,,41.950592,,,,,,43681.863937,
std,8949.110701,,,,298.236187,,,23.381581,,,,,,22538.257923,
min,110075.0,,,,518.0,,,12.0,,,,,,11228.8,
25%,204444.25,,,,849.0,,,23.95,,,,,,26245.225,
50%,209506.5,,,,990.0,,,35.95,,,,,,38255.645,
75%,214568.75,,,,1364.75,,,53.95,,,,,,55754.775,


Which countries are major manufacturers?

In [106]:
# Visualization of Sales by Country
fig = px.histogram(
    df, 
    x = "Manufacturing Country",
    y = "Sales Volume",
    title = "China, Bangladesh and Turkey were the top 3 countries to produce the most products.",
    text_auto = True,
    color = "Manufacturing Country"
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [107]:
fig.write_image(FIGURES_DIR/ "sales_by_country.svg", width=1000, height=600)

In [108]:
fig.show()

In [109]:
def sales_in_manu_countries():
    country_sales = []
    for i in df["Manufacturing Country"].unique():
        country_sales.append(f"{i}: {df.loc[df["Manufacturing Country"] == i, "Sales Volume"].sum():,}")
    return print(*country_sales, sep = "\n")

In [110]:
sales_in_manu_countries()

Brazil: 896,088
Turkey: 2,700,999
Morocco: 1,814,595
China: 4,454,532
Portugal: 1,553,690
India: 2,228,233
Bangladesh: 3,934,058
Vietnam: 1,335,556
Spain: 1,376,349
Cambodia: 1,069,896
Argentina: 196,276
Pakistan: 662,648


What is the relationship between price levels and sales volume, particularly for high-price items in popular categories like jackets or sweaters?

In [111]:
def correlation(value1: str,value2: str):
    """Correlation formula"""
    result = (value1).corr(value2)
    return print(f"Correlation is {result:.4f}.")

In [112]:
correlation(df["Price"], df["Sales Volume"])

Correlation is -0.3379.


In [113]:
correlation(df["Price"].where(df["Products"] == "jackets"),df["Sales Volume"].where(df["Products"] == "jackets"))

Correlation is -0.3339.


In [114]:
correlation(df["Price"].where(df["Products"] == "sweaters"),df["Sales Volume"].where(df["Products"] == "sweaters"))

Correlation is -0.3316.


In [115]:
fig = px.histogram(
    df,
    x = "Price Segment",
    y = "Sales Volume",
    title = "Low price segment has most sales, then Mid price segment comes 2nd, then High price segment.",
    text_auto = True,
    color = "Price Segment"
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area (area inside axes)
        paper_bgcolor='white'   #Outer canvas (area outside axes)
        ) \
    .update_xaxes(
        title = "Price Segments",
        categoryarray = ["Low", "Mid", "High"],
        showgrid = False,   #White gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        title = "Sales Volume",
        showgrid = False,   #White gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [116]:
fig.write_image(FIGURES_DIR/ "sales_price_segment.svg", width=1000, height=600)

In [117]:
fig.show()

In [118]:
low_mid_diff = percent(
    x = df.loc[df["Price Segment"] == "Low", "Sales Volume"].sum(),
    y = df.loc[df["Price Segment"] == "Mid", "Sales Volume"].sum(),
    z = df["Sales Volume"].sum()
)
mid_high_diff = percent(
    x = df.loc[df["Price Segment"] == "Mid", "Sales Volume"].sum(),
    y = df.loc[df["Price Segment"] == "High", "Sales Volume"].sum(),
    z = df["Sales Volume"].sum()
)
low_high_diff = percent(
    x = df.loc[df["Price Segment"] == "Low", "Sales Volume"].sum(),
    y = df.loc[df["Price Segment"] == "High", "Sales Volume"].sum(),
    z = df["Sales Volume"].sum()
)

In [119]:
print(f"Low price segment products sold {low_mid_diff} more than Mid price segment products.")
print(f"Mid price segment products sold {mid_high_diff} more than High price segment products.")
print(f"Low price segment products sold {low_high_diff} more than High price segment products.")

Low price segment products sold 31.9% more than Mid price segment products.
Mid price segment products sold 25.4% more than High price segment products.
Low price segment products sold 57.4% more than High price segment products.


In [120]:
low_mid_rev = percent(
    x = df.loc[df["Price Segment"] == "Low", "Revenue"].sum(),
    y = df.loc[df["Price Segment"] == "Mid", "Revenue"].sum(),
    z = df["Revenue"].sum()
)
mid_high_rev = percent(
    x = df.loc[df["Price Segment"] == "Mid", "Revenue"].sum(),
    y = df.loc[df["Price Segment"] == "High", "Revenue"].sum(),
    z = df["Revenue"].sum()
)
low_high_rev = percent(
    x = df.loc[df["Price Segment"] == "Low", "Revenue"].sum(),
    y = df.loc[df["Price Segment"] == "High", "Revenue"].sum(),
    z = df["Revenue"].sum()
)

In [121]:
print(f"There is a difference in Revenue of about {low_mid_rev} between Low and Mid price segment products.")
print(f"Mid price segment products sold {mid_high_rev} more than High price segment products.")
print(f"Low price segment products sold {low_high_rev} more than High price segment products.")

There is a difference in Revenue of about -3.1% between Low and Mid price segment products.
Mid price segment products sold 30.5% more than High price segment products.
Low price segment products sold 27.4% more than High price segment products.


In [122]:
fig = px.histogram(
    df,
    x = "Price Segment",
    y = "Revenue",
    title = "",
    category_orders = {"Price Segment": ["Low", "Mid", "High"]},
    color = "Price Segment",
    text_auto = True
) \
.update_layout(
    plot_bgcolor = "white",
    paper_bgcolor = "white"
    ) \
.update_xaxes(
    title = "Price Segment",
    showgrid = False,
    showline = True,
    linecolor = "black",
    linewidth = 2
    ) \
.update_yaxes(
    title = "Revenue",
    showgrid = False,
    showline = True,
    linecolor = "black",
    linewidth = 2
    )

In [123]:
fig.write_image(FIGURES_DIR/ "revenue_price_segment.svg", width = 1000, height = 600)

In [124]:
fig.show()

What are the effects of season on sales volume.

In [125]:
def seasonal_sales():
    sales_numbers = []
    for i in df["Season"].unique():
        sales_numbers.append(f"{i}: {df.loc[df["Season"] == i, "Sales Volume"].sum():,.2f}")
    spr_sum_diff = percent(
        df.loc[df["Season"] == "Summer", "Sales Volume"].sum(), 
        df.loc[df["Season"] == "Spring", "Sales Volume"].sum(), 
        df.loc[df["Season"] == "Spring", "Sales Volume"].sum() + df.loc[df["Season"] == "Summer", "Sales Volume"].sum()
        )
    
    sum_aut_diff = percent(
        df.loc[df["Season"] == "Autumn", "Sales Volume"].sum(),
        df.loc[df["Season"] == "Summer", "Sales Volume"].sum(), 
        df.loc[df["Season"] == "Autumn", "Sales Volume"].sum() + df.loc[df["Season"] == "Summer", "Sales Volume"].sum()
        )
    
    aut_win_diff = percent(
        df.loc[df["Season"] == "Winter", "Sales Volume"].sum(), 
        df.loc[df["Season"] == "Autumn", "Sales Volume"].sum(), 
        df.loc[df["Season"] == "Autumn", "Sales Volume"].sum() + df.loc[df["Season"] == "Winter", "Sales Volume"].sum()
        )
    
    result1 = print("Sales seasonally:",*sales_numbers, sep = "\n")

    result2 = print("=" * 45)

    result3 = print(f"Difference of Sales Volume:\nSpring to Summer sales change: {spr_sum_diff} \nSummer to Autumn sales change:  {sum_aut_diff} \nAutumn to Winter sales change: {aut_win_diff}")
    
    return result1 and result2 and result3

In [126]:
seasonal_sales()

Sales seasonally:
Winter: 6,042,207.00
Autumn: 7,992,369.00
Spring: 4,745,927.00
Summer: 3,442,417.00
Difference of Sales Volume:
Spring to Summer sales change: -15.9% 
Summer to Autumn sales change:  39.8% 
Autumn to Winter sales change: -13.9%


In [155]:
fig = px.histogram(
    df, 
    x = "Season", 
    y = "Sales Volume", 
    title = "Majority of the sales seasonally occurred in Autumn followed by Winter, Spring and Summer with \ndeclining sales.", 
    color = "Season", 
    category_orders={"Season": ["Spring", "Summer", "Autumn", "Winter"]},
    text_auto = True
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [157]:
fig.write_image(FIGURES_DIR/ "seasonal_sales.svg", width=1100, height=600)

In [158]:
fig.show()

What are the most sold materials? How does this trend looks like with seasons? 

In [130]:
fig = px.histogram(
    df, 
    x = "Material", 
    y = "Revenue", 
    title = "Wool, Cotton and Wool blend are top 3 materials that brought highest revenue.",
    text_auto = True,
    color = "Material"
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [131]:
fig.write_image(FIGURES_DIR/ "material_sales.svg", width=1000, height=600)

In [132]:
fig.show()

In [133]:
fig = px.histogram(
    df, 
    x = "Season", 
    y = "Revenue", 
    title = "Wool, Cotton and Wool blend are top 3 materials that brought highest revenue.",
    category_orders = {"Season": ["Spring", "Summer", "Autumn", "Winter"]},
    text_auto = True,
    color = "Material"
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [134]:
fig.write_image(FIGURES_DIR/ "material_sales_seasonally.svg", width=1000, height=600)

In [135]:
fig.show()

How much do promotions boost sales volume across different product positions like Front of Store versus Aisle?

In [136]:
def compute_promo_sums(promo_col="Promotion", pos_col="Product Position", value_col="Sales Volume"):
    global df
    # Overall sums by promotion
    global num_yes
    num_yes = df.loc[df[promo_col] == "Yes", value_col].sum()
    global num_no
    num_no = df.loc[df[promo_col] == "No", value_col].sum()
    global total_sales
    total_sales = df[value_col].sum()
    
    # Yes promotion by position
    global yes_aisle
    yes_aisle = df.loc[(df[promo_col] == "Yes") & (df[pos_col] == "Aisle"), value_col].sum()
    global yes_endcap
    yes_endcap = df.loc[(df[promo_col] == "Yes") & (df[pos_col] == "End-cap"), value_col].sum()
    global yes_fos
    yes_fos = df.loc[(df[promo_col] == "Yes") & (df[pos_col] == "Front of Store"), value_col].sum()
    
    # No promotion by position
    global no_aisle
    no_aisle = df.loc[(df[promo_col] == "No") & (df[pos_col] == "Aisle"), value_col].sum()
    global no_endcap
    no_endcap = df.loc[(df[promo_col] == "No") & (df[pos_col] == "End-cap"), value_col].sum()
    global no_fos
    no_fos = df.loc[(df[promo_col] == "No") & (df[pos_col] == "Front of Store"), value_col].sum()

    promo_nums = print(f" Total sales when promoted: {num_yes:,} \n \
    Total sales of products on Aisle (promoted): {yes_aisle:,},\n \
    Total sales of products on End-Cap (promoted): {yes_endcap:,} \n \
    Total sales of products on Front of store (promoted): {yes_fos:,} \n Total sales when not promoted: {num_no:,} \n \
    Total sales when products on Aisle (not promoted): {no_aisle:,} \n \
    Total sales when products on End-Cap (not promoted): {no_endcap:,} \n \
    Total sales when products on Front of store (not promoted): {no_fos:,}", sep = "\n"
)

    return promo_nums

In [137]:
compute_promo_sums()

 Total sales when promoted: 11,920,036 
     Total sales of products on Aisle (promoted): 4,448,881,
     Total sales of products on End-Cap (promoted): 4,060,619 
     Total sales of products on Front of store (promoted): 3,410,536 
 Total sales when not promoted: 10,302,884 
     Total sales when products on Aisle (not promoted): 4,067,357 
     Total sales when products on End-Cap (not promoted): 3,410,969 
     Total sales when products on Front of store (not promoted): 2,824,558


In [138]:
#Visualization
fig = px.histogram(
    df,
    x = "Promotion",
    y = "Sales Volume",
    title = f"Promotions boosted sales in volume by {percent(num_yes, num_no, total_sales)} overall.",
    color = "Product Position",
    text_auto = True
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [139]:
fig.write_image(FIGURES_DIR/ "promotion's_impact.svg", width=1000, height=600)

In [140]:
fig.show()

In [141]:
print(f"Promotion's boost for Aisle products = {percent(yes_aisle, no_aisle, (yes_aisle + no_aisle))} \n \
    Promotion's boost for End-cap products = {percent(yes_endcap, no_endcap, (yes_aisle + no_aisle))} \n \
    Promotion's boost for Front of Store products = {percent(yes_fos, no_fos, (yes_aisle + no_aisle))}.")

Promotion's boost for Aisle products = 4.5% 
     Promotion's boost for End-cap products = 7.6% 
     Promotion's boost for Front of Store products = 6.9%.


What products achieve the highest sales volume, and how does this vary by season?

In [161]:
fig = px.histogram(
    df,
    x = "Products",
    y = "Sales Volume", 
    facet_col = "Season",
    facet_col_wrap = 2,
    category_orders = {"Season": ["Spring", "Summer", "Autumn", "Winter"]},
    color = "Products",
    text_auto = True,
    height = 700,
    width = 1300
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white',  #Outer canvas
        title = "Product Sales by Season Volume",
        title_x = 0.5,  # Center title
        font_size = 10
        ) \
    .update_xaxes(
        title = "Products",
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        title = "Sales Volume",
        showgrid = False,       #Gridlines
        showline=True,          #Axis line
        linecolor='black',      #Axis line color
        linewidth=2             #Axis line width
        )

In [162]:
fig.write_image(FIGURES_DIR/ "seasonal_product_performance.svg", width=1100, height=700)

In [144]:
fig.show()

In [145]:
pivot_df = df.pivot_table(
    values = "Sales Volume", 
    index = "Products", 
    columns = "Season", 
    aggfunc = "sum", 
    fill_value = 0
).astype(int)

In [146]:
pivot_df

Season,Autumn,Spring,Summer,Winter
Products,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
jackets,4487821,2618619,1880979,3359090
jeans,253067,154627,112224,201390
shoes,977698,592059,412963,694262
sweaters,1273641,743519,580954,1000629
t-shirts,1000142,637103,455297,786836


How do sales volumes differ between men's and women's?

In [163]:
fig = px.histogram(
    df, 
    x = "Products", 
    y = "Sales Volume",
    title = "Majority of the Products are bought by women.", 
    color = "Section",
    text_auto = True
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [164]:
fig.write_image(FIGURES_DIR/ "section_vs_products.svg", width = 1000, height = 600)

In [149]:
fig.show()

In [165]:
fig = px.histogram(
    df, 
    x = "Material", 
    y = "Sales Volume",
    title = "Majority of the Products are bought by women.", 
    color = "Section",
    text_auto = True
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [151]:
fig.show()

In [166]:
fig = px.histogram(
    df, 
    x = "Section", 
    y = "Sales Volume",
    title = "Majority of the Products were bought by women.", 
    color = "Section",
    text_auto = True
    ) \
    .update_layout(
        plot_bgcolor='white',   #Inner plot area
        paper_bgcolor='white'   #Outer canvas
        ) \
    .update_xaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        ) \
    .update_yaxes(
        showgrid = False,   #Gridlines
        showline=True,      #Axis line
        linecolor='black',  #Axis line color
        linewidth=2         #Axis line width
        )

In [153]:
fig.show()

In [168]:
men_sales = df.loc[df["Section"] == "MAN", "Sales Volume"].sum()

women_sales = df.loc[df["Section"] == "WOMAN", "Sales Volume"].sum()

man_woman_sales = percent(x = women_sales, y = men_sales, z = df["Sales Volume"].sum())

women_sales_percent = percent(x = women_sales, y = 0, z = df["Sales Volume"].sum())

men_sales_percent = percent(x = men_sales, y = 0, z = df["Sales Volume"].sum())

print(f"Women bought {man_woman_sales} more than men.")

print(f"Women bought {women_sales:,} units that's {women_sales_percent} of the total sales. \n\
Men bought {men_sales:,} units that's {men_sales_percent} of the total sales.")

Women bought 35.5% more than men.
Women bought 15,060,302 units that's 67.8% of the total sales. 
Men bought 7,162,618 units that's 32.2% of the total sales.
