In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns

## Data Wrangling

In [2]:
df = pd.read_excel('supermarket_sales.xlsx')

In [3]:
df.sample(10)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
995,886-18-2897,A,Yangon,Normal,Female,Food_and_beverages,56.56,5,14.14,296.94,3/22/2019,19:06:00,Credit card,282.8,4.761905,14.14,4.5
972,522-57-8364,A,Yangon,Member,Male,Fashion_accessories,51.34,8,20.536,431.256,1/31/2019,10:00:00,Ewallet,410.72,4.761905,20.536,7.6
512,521-18-7827,C,Naypyitaw,Member,Male,Home_and_lifestyle,39.39,5,9.8475,206.7975,1/22/2019,20:46:00,Credit card,196.95,4.761905,9.8475,8.7
382,731-81-9469,C,Naypyitaw,Member,Female,Sports_and_travel,89.8,10,44.9,942.9,1/23/2019,13:00:00,Credit card,898.0,4.761905,44.9,5.4
688,287-21-9091,A,Yangon,Normal,Male,Home_and_lifestyle,74.67,9,33.6015,705.6315,1/22/2019,10:55:00,Ewallet,672.03,4.761905,33.6015,9.4
625,607-76-6216,C,Naypyitaw,Member,Female,Fashion_accessories,92.49,5,23.1225,485.5725,2019-02-03 00:00:00,16:35:00,Credit card,462.45,4.761905,23.1225,8.6
220,190-14-3147,B,Mandalay,Normal,Female,Health_and_beauty,17.97,4,3.594,75.474,2/23/2019,20:43:00,Ewallet,71.88,4.761905,3.594,6.4
878,706-36-6154,A,Yangon,Member,Male,Home_and_lifestyle,19.36,9,8.712,182.952,1/18/2019,18:43:00,Ewallet,174.24,4.761905,8.712,8.7
769,870-76-1733,A,Yangon,Member,Female,Food_and_beverages,14.23,5,3.5575,74.7075,2019-01-02 00:00:00,10:08:00,Credit card,71.15,4.761905,3.5575,4.4
348,803-83-5989,C,Naypyitaw,Normal,Male,Home_and_lifestyle,55.73,6,16.719,351.099,2/24/2019,10:55:00,Ewallet,334.38,4.761905,16.719,7.0


In [4]:
df.shape

(1000, 17)

In [5]:
df.dtypes

Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Date                        object
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object

In [6]:
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
df.columns

Index(['invoice_id', 'branch', 'city', 'customer_type', 'gender',
       'product_line', 'unit_price', 'quantity', 'tax_5%', 'total', 'date',
       'time', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
       'rating'],
      dtype='object')

In [7]:
df = df.map(lambda x: x.lower() if type(x) == str else x)

In [8]:
df["date"] = pd.to_datetime(df["date"])
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.time
df['time'] = df['date'] + pd.to_timedelta(df['time'].astype(str))
df = df.drop(columns=['date'])

In [9]:
df.dtypes

invoice_id                         object
branch                             object
city                               object
customer_type                      object
gender                             object
product_line                       object
unit_price                        float64
quantity                            int64
tax_5%                            float64
total                             float64
time                       datetime64[ns]
payment                            object
cogs                              float64
gross_margin_percentage           float64
gross_income                      float64
rating                            float64
dtype: object

In [10]:
df = df[ ["city", "branch","time", "invoice_id","product_line", "unit_price", "quantity", "tax_5%", "total", "payment",  "customer_type", "gender",
       "rating", "cogs", "gross_margin_percentage", "gross_income"]]

In [11]:
df.sort_values(["branch","city","time"], ascending=[True,True,True], inplace=True)

In [12]:
df.groupby("city")["branch"].unique()

city
mandalay     [b]
naypyitaw    [c]
yangon       [a]
Name: branch, dtype: object

In [13]:
df.groupby("product_line").get_group("health_and_beauty")['invoice_id']

854    443-82-0585
975    397-25-8725
916    787-87-2010
807    269-10-8440
951    160-22-2687
          ...     
632    450-42-3339
349    838-78-4295
576    607-65-2441
441    704-20-4138
404    699-01-4164
Name: invoice_id, Length: 152, dtype: object

In [14]:
df['gross_margin_percentage'].unique()

array([4.76190476])

In [15]:
df.drop(columns=["invoice_id",'gross_margin_percentage','branch'], inplace=True)

In [16]:
df.reset_index(drop=True, inplace=True)

In [17]:
df.sample(10)

Unnamed: 0,city,time,product_line,unit_price,quantity,tax_5%,total,payment,customer_type,gender,rating,cogs,gross_income
3,yangon,2019-01-01 14:47:00,home_and_lifestyle,47.59,8,19.036,399.756,cash,member,male,5.7,380.72,19.036
284,yangon,2019-07-03 18:35:00,home_and_lifestyle,28.31,4,5.662,118.902,cash,member,female,8.2,113.24,5.662
922,naypyitaw,2019-06-02 17:20:00,fashion_accessories,96.98,4,19.396,407.316,ewallet,member,male,9.4,387.92,19.396
942,naypyitaw,2019-08-01 11:42:00,home_and_lifestyle,55.57,3,8.3355,175.0455,credit card,member,male,5.9,166.71,8.3355
144,yangon,2019-02-27 18:24:00,food_and_beverages,18.85,10,9.425,197.925,ewallet,member,male,5.6,188.5,9.425
338,yangon,2019-12-03 12:43:00,sports_and_travel,98.4,7,34.44,723.24,credit card,member,female,8.7,688.8,34.44
489,mandalay,2019-02-26 15:10:00,food_and_beverages,26.6,6,7.98,167.58,ewallet,member,male,4.9,159.6,7.98
573,mandalay,2019-04-02 18:53:00,food_and_beverages,23.34,4,4.668,98.028,ewallet,member,male,7.4,93.36,4.668
479,mandalay,2019-02-25 13:22:00,food_and_beverages,62.85,4,12.57,263.97,ewallet,member,female,8.7,251.4,12.57
584,mandalay,2019-05-03 12:29:00,fashion_accessories,21.94,5,5.485,115.185,ewallet,member,male,5.3,109.7,5.485


In [18]:
df.to_csv('supermarket_sales_clean.csv', index=False)

## Plotting the data

### Please Check ('cln_market.ipynb')