In [0]:
# Evaluate the `spark` name so the cell displays its value.
# NOTE(review): presumably the SparkSession that Databricks injects into the notebook — confirm.
spark

In [0]:
# Mount using dbutils : mount the Azure Blob Storage container into DBFS (Databricks File System)
#
# SECURITY: the storage account key is read from a Databricks secret scope instead of
# being hard-coded in the notebook. The key that was previously pasted here must be
# treated as leaked and rotated in the Azure portal.
# The scope / key names below are placeholders — create them in your workspace
# (Databricks CLI or Secrets API) and adjust the two names accordingly.
secret_scope = "azure-storage"
secret_key_name = "myfirstblobstoragesample-account-key"
storage_account_key = dbutils.secrets.get(scope=secret_scope, key=secret_key_name)

storage_account = "myfirstblobstoragesample"
container_name = "databrickssample"
file_name = "super_store.csv"
mountPoint = "/mnt/data/"

# Only mount when the mount point is not already present, so re-running the cell is safe.
if not any(mount.mountPoint == mountPoint for mount in dbutils.fs.mounts()):
  try:
    dbutils.fs.mount(
      source = "wasbs://{}@{}.blob.core.windows.net".format(container_name, storage_account),
      mount_point = mountPoint,
      extra_configs = {'fs.azure.account.key.' + storage_account + '.blob.core.windows.net': storage_account_key}
    )
    print("mount succeeded!")
  except Exception as e:
    # Log the failure, then re-raise: every later cell reads from this mount, so
    # continuing after a failed mount would only produce a less helpful error downstream.
    print("mount exception", e)
    raise


In [0]:
# Verify the mount point (/mnt/data/) by listing every current DBFS mount with dbutils.fs.mounts().
# The bare call is the last expression of the cell, so its return value is what gets displayed.

dbutils.fs.mounts()

[MountInfo(mountPoint='/databricks-datasets', source='databricks-datasets', encryptionType=''),
 MountInfo(mountPoint='/Volumes', source='UnityCatalogVolumes', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-tracking', source='databricks/mlflow-tracking', encryptionType=''),
 MountInfo(mountPoint='/databricks-results', source='databricks-results', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-registry', source='databricks/mlflow-registry', encryptionType=''),
 MountInfo(mountPoint='/Volume', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/volumes', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/mnt/data/', source='wasbs://databrickssample@myfirstblobstoragesample.blob.core.windows.net', encryptionType=''),
 MountInfo(mountPoint='/', source='DatabricksRoot', encryptionType=''),
 MountInfo(mountPoint='/volume', source='DbfsReserved', encryptionType='')]

In [0]:
# List the contents of the mounted container with dbutils.fs.ls().
# NOTE(review): the path has no leading slash, unlike mountPoint ("/mnt/data/"); the
# recorded output shows it resolving to dbfs:/mnt/data — confirm this is intended.

dbutils.fs.ls("mnt/data")

[FileInfo(path='dbfs:/mnt/data/sales_data.csv', name='sales_data.csv', size=496, modificationTime=1729802751000),
 FileInfo(path='dbfs:/mnt/data/super_store.csv', name='super_store.csv', size=2288478, modificationTime=1729965173000)]

In [0]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.colors as colors
# Set plotly's default figure template to "plotly_white".
pio.templates.default = "plotly_white"

In [0]:
# Read the CSV file and load it into a pandas DataFrame (pd_DF).
# NOTE(review): despite its name, csv_file_url holds a file-system path, not a URL;
# presumably "/dbfs/mnt/data" is the local-file view of the dbfs:/mnt/data mount — confirm.
csv_file_url = "/dbfs/mnt/data/super_store.csv"
pd_DF = pd.read_csv(csv_file_url)

In [0]:
# Show summary statistics for the DataFrame loaded above via DataFrame.describe().

pd_DF.describe()

Unnamed: 0,RowID,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55190.379428,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


In [0]:
# Display the column labels of the loaded DataFrame.
# NOTE(review): the recorded output already lists 'Order Month', 'Order Year' and
# 'Order Day of Week', which a later cell assigns — either the CSV already contains them
# or this cell was last executed after that one; confirm with a fresh Run All.
pd_DF.columns

Index(['RowID', 'OrderID', 'OrderDate', 'ShipDate', 'ShipMode', 'CustomerID',
       'CustomerName', 'Segment', 'Country', 'City', 'State', 'Postal Code',
       'Region', 'Product ID', 'Category', 'SubCategory', 'ProductName',
       'Sales', 'Quantity', 'Discount', 'Profit', 'Order Month', 'Order Year',
       'Order Day of Week'],
      dtype='object')

In [0]:
# Convert both date columns to pandas datetimes (format is inferred by pd.to_datetime).
for date_col in ('OrderDate', 'ShipDate'):
    pd_DF[date_col] = pd.to_datetime(pd_DF[date_col])

# Derive calendar parts of the order date; these columns feed the charts further down.
order_date = pd_DF['OrderDate'].dt
pd_DF['Order Month'] = order_date.month
pd_DF['Order Year'] = order_date.year
pd_DF['Order Day of Week'] = order_date.dayofweek

In [0]:
# Line chart: total sales per order month.
# Grouping uses 'Order Month' only, so the same month from different years is summed together.

sales_by_month = pd_DF.groupby('Order Month', as_index=False)['Sales'].sum()

fig = px.line(
    data_frame=sales_by_month,
    x='Order Month',
    y='Sales',
    title='Monthly Sales Analysis',
)
fig.show()

In [0]:
# Pie chart (with a hole, i.e. donut style): share of total sales per product category.

sales_by_category = pd_DF.groupby('Category', as_index=False)['Sales'].sum()

fig = px.pie(
    data_frame=sales_by_category,
    names='Category',
    values='Sales',
    hole=0.3,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Put the percentage and category label outside each slice.
fig.update_traces(textinfo='percent+label', textposition='outside')
fig.update_layout(title_text='Sales Analysis by Category', title_font={'size': 24})

fig.show()

In [0]:
# Bar chart: total sales per product sub-category.

sales_by_subcategory = pd_DF.groupby('SubCategory', as_index=False)['Sales'].sum()

fig = px.bar(
    data_frame=sales_by_subcategory,
    x='SubCategory',
    y='Sales',
    title='Sales Analysis by Sub-Category',
)
fig.show()

In [0]:
# Bar chart: total sales and total profit for each customer segment.

sales_profit_by_segment = pd_DF.groupby('Segment', as_index=False)[['Sales', 'Profit']].sum()

color_palette = colors.qualitative.Safe

# One bar trace per measure; zip pairs 'Sales' with the first palette colour
# and 'Profit' with the second.
fig = go.Figure()
for measure, bar_color in zip(('Sales', 'Profit'), color_palette):
    fig.add_trace(
        go.Bar(
            x=sales_profit_by_segment['Segment'],
            y=sales_profit_by_segment[measure],
            name=measure,
            marker_color=bar_color,
        )
    )

fig.update_layout(
    title='Sales and Profit Analysis by Customer Segment',
    xaxis_title='Customer Segment',
    yaxis_title='Amount',
)

fig.show()

In [0]:
# Sales-to-profit ratio per customer segment: total sales divided by total profit.
sales_profit_by_segment = pd_DF.groupby('Segment', as_index=False)[['Sales', 'Profit']].sum()

total_sales = sales_profit_by_segment['Sales']
total_profit = sales_profit_by_segment['Profit']
sales_profit_by_segment['Sales_to_Profit_Ratio'] = total_sales / total_profit

print(sales_profit_by_segment.loc[:, ['Segment', 'Sales_to_Profit_Ratio']])