In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_tran = pd.read_excel('./KPMG.xlsx', sheet_name="Transactions",header=1)
df_tran.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [3]:
print(f"There are {df_tran.shape[0]} rows and {df_tran.shape[1]} columns.")

There are 20000 rows and 13 columns.


In [4]:
# To check the data types
df_tran.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [5]:
# to change the format of product_first_sold_date from float64 to datetime
df_tran.product_first_sold_date = pd.to_datetime(df_tran.product_first_sold_date).dt.date

In [6]:
df_tran.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,1970-01-01
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,1970-01-01
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,1970-01-01
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,1970-01-01
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,1970-01-01


In [7]:
# printing all the columns names
df_tran.columns

Index(['transaction_id', 'product_id', 'customer_id', 'transaction_date',
       'online_order', 'order_status', 'brand', 'product_line',
       'product_class', 'product_size', 'list_price', 'standard_cost',
       'product_first_sold_date'],
      dtype='object')

In [8]:
df_tran.transaction_id.value_counts()

1        1
13331    1
13338    1
13337    1
13336    1
        ..
6667     1
6666     1
6665     1
6664     1
20000    1
Name: transaction_id, Length: 20000, dtype: int64

In [9]:
df_tran.product_id.value_counts()

0      1378
3       354
1       311
35      268
38      267
       ... 
71      137
8       136
16      136
100     130
47      121
Name: product_id, Length: 101, dtype: int64

In [10]:
df_tran.customer_id.value_counts()

2183    14
2476    14
1068    14
1672    13
2912    13
        ..
898      1
2352     1
1846     1
3279     1
1757     1
Name: customer_id, Length: 3494, dtype: int64

In [11]:
df_tran.transaction_date.dt.date

0        2017-02-25
1        2017-05-21
2        2017-10-16
3        2017-08-31
4        2017-10-01
            ...    
19995    2017-06-24
19996    2017-11-09
19997    2017-04-14
19998    2017-07-03
19999    2017-09-22
Name: transaction_date, Length: 20000, dtype: object

In [12]:
df_tran.transaction_date.dt.month.value_counts()

10    1771
8     1749
7     1717
5     1685
1     1682
11    1665
3     1656
4     1655
12    1644
2     1623
6     1581
9     1572
Name: transaction_date, dtype: int64

In [13]:
# 1 - True and 0 - False
df_tran.online_order.value_counts()

1.0    9829
0.0    9811
Name: online_order, dtype: int64

In [14]:
df_tran.order_status.value_counts()

Approved     19821
Cancelled      179
Name: order_status, dtype: int64

In [15]:
# As order status containde Cancelled so it will no longer needed so we are removing the ordered Cancelled.
df_tran = df_tran[df_tran['order_status'] != 'Cancelled'] 

In [16]:
df_tran.order_status.value_counts()

Approved    19821
Name: order_status, dtype: int64

In [17]:
df_tran.brand.value_counts()

Solex             4211
Giant Bicycles    3283
WeareA2B          3265
OHM Cycles        3016
Trek Bicycles     2965
Norco Bicycles    2885
Name: brand, dtype: int64

In [18]:
df_tran.product_line.value_counts()

Standard    14048
Road         3932
Touring      1225
Mountain      420
Name: product_line, dtype: int64

In [19]:
df_tran.product_class.value_counts()

medium    13701
high       2978
low        2946
Name: product_class, dtype: int64

In [20]:
df_tran.product_size.value_counts()

medium    12876
large      3938
small      2811
Name: product_size, dtype: int64

In [21]:
df_tran.list_price.value_counts()

2091.47    461
1403.50    394
71.49      273
1231.15    232
1890.39    231
          ... 
126.36       1
1300.96      1
1697.27      1
26.15        1
867.92       1
Name: list_price, Length: 295, dtype: int64

In [22]:
# To check the NAN values in each columns.

df_tran.isna().sum()

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               354
order_status                 0
brand                      196
product_line               196
product_class              196
product_size               196
list_price                   0
standard_cost              196
product_first_sold_date    196
dtype: int64

In [23]:
# Checking any duplicate values are present in dataset or not

df_tran[df_tran.duplicated()].sum()

transaction_id             0.0
product_id                 0.0
customer_id                0.0
online_order               0.0
order_status               0.0
brand                      0.0
product_line               0.0
product_class              0.0
product_size               0.0
list_price                 0.0
standard_cost              0.0
product_first_sold_date    0.0
dtype: float64

In [24]:
# Adding Profit column in dataset beside standard_cost 

Profit = df_tran.list_price - df_tran.standard_cost
# Adding new column on the 12th position beside seandard_cost
df_tran.insert(12,'Profit',Profit)

In [25]:
df_tran.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,Profit,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,17.87,1970-01-01
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,1702.55,1970-01-01
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,1544.61,1970-01-01
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,817.36,1970-01-01
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,1055.82,1970-01-01


In [26]:
df_tran.to_excel("KPMG_clean.xlsx", sheet_name= 'Transactions', index=False)