In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# %matplotlib inline

In [2]:
# Load the raw store/SKU table (visits, transactions, revenue per row).
# The path is relative, so it resolves against the current working directory.
raw_csv_path = '../Datasets/store_sku_ba_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Summarize the frame: row count, per-column dtype and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store ID      1000 non-null   object 
 1   SKU ID        1000 non-null   object 
 2   Total Visits  1000 non-null   int64  
 3   Transactions  1000 non-null   int64  
 4   Revenue       1000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB


In [4]:
# Preview the first five rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Store ID,SKU ID,Total Visits,Transactions,Revenue
0,Store_1,SKU_1,606,573,74688.67
1,Store_1,SKU_2,695,27,1390.6
2,Store_1,SKU_3,389,285,50679.63
3,Store_1,SKU_4,689,505,16764.34
4,Store_1,SKU_5,522,230,41928.4


In [5]:
# Distinct store identifiers, in order of first appearance in the data.
stores = pd.unique(df['Store ID'])
stores

array(['Store_1', 'Store_2', 'Store_3', 'Store_4', 'Store_5', 'Store_6',
       'Store_7', 'Store_8', 'Store_9', 'Store_10', 'Store_11',
       'Store_12', 'Store_13', 'Store_14', 'Store_15', 'Store_16',
       'Store_17', 'Store_18', 'Store_19', 'Store_20', 'Store_21',
       'Store_22', 'Store_23', 'Store_24', 'Store_25', 'Store_26',
       'Store_27', 'Store_28', 'Store_29', 'Store_30', 'Store_31',
       'Store_32', 'Store_33', 'Store_34', 'Store_35', 'Store_36',
       'Store_37', 'Store_38', 'Store_39', 'Store_40', 'Store_41',
       'Store_42', 'Store_43', 'Store_44', 'Store_45', 'Store_46',
       'Store_47', 'Store_48', 'Store_49', 'Store_50'], dtype=object)

In [6]:
# Distinct SKU identifiers, in order of first appearance in the data.
products = pd.unique(df['SKU ID'])
products

array(['SKU_1', 'SKU_2', 'SKU_3', 'SKU_4', 'SKU_5', 'SKU_6', 'SKU_7',
       'SKU_8', 'SKU_9', 'SKU_10', 'SKU_11', 'SKU_12', 'SKU_13', 'SKU_14',
       'SKU_15', 'SKU_16', 'SKU_17', 'SKU_18', 'SKU_19', 'SKU_20'],
      dtype=object)

In [7]:
# Report how many distinct stores and SKUs the dataset contains.
n_stores, n_products = len(stores), len(products)
print(f"Total Number of Stores = {n_stores}")
print(f"Total Number of product = {n_products}")

Total Number of Stores = 50
Total Number of product = 20


# Data Cleaning

In [8]:
# Count rows that exactly repeat an earlier row across all columns.
duplicate_row_mask = df.duplicated()
duplicate_row_mask.sum()

np.int64(0)

In [9]:
# List the column labels available for the feature-engineering steps.
df.columns

Index(['Store ID', 'SKU ID', 'Total Visits', 'Transactions', 'Revenue'], dtype='object')

    INFO: The dataset has 1000 rows covering 50 stores and 20 products. There are no duplicate rows or missing values.

# Feature Engineering

In [10]:
# Available columns: 'Store ID', 'SKU ID', 'Total Visits', 'Transactions', 'Revenue'

#### TASK 1: Calculate the conversion rate at both the store level and the SKU level (i.e., Transactions / Total Visits).

#### Store level conversion rate

In [11]:
# Total visits and transactions per store, summed over all of its rows.
# The result is indexed by 'Store ID'; groupby sorts the keys as strings,
# so 'Store_10' comes right after 'Store_1'.
store_volume_cols = ['Total Visits', 'Transactions']
df_store_level = df.groupby('Store ID')[store_volume_cols].sum()

In [12]:
# Show a preview rather than dumping the entire aggregated frame; the first
# ten stores are enough to sanity-check the summed columns.
df_store_level.head(10)

Unnamed: 0_level_0,Total Visits,Transactions
Store ID,Unnamed: 1_level_1,Unnamed: 2_level_1
Store_1,11296,6054
Store_10,12173,6311
Store_11,9362,4526
Store_12,8698,3310
Store_13,11846,5155
Store_14,12804,4674
Store_15,8922,3090
Store_16,11857,6122
Store_17,11468,5575
Store_18,10021,5340


In [13]:
# Conversion rate per store = transactions / visits, computed element-wise
# on the 'Transactions' and 'Total Visits' columns of df_store_level.
store_transactions = df_store_level['Transactions']
store_visits = df_store_level['Total Visits']
df_store_level['Conversion Rate'] = store_transactions / store_visits

In [14]:
# Show a preview rather than dumping the entire frame; the first ten stores
# are enough to check the new 'Conversion Rate' column against its inputs.
df_store_level.head(10)

Unnamed: 0_level_0,Total Visits,Transactions,Conversion Rate
Store ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Store_1,11296,6054,0.535942
Store_10,12173,6311,0.518442
Store_11,9362,4526,0.483444
Store_12,8698,3310,0.380547
Store_13,11846,5155,0.435168
Store_14,12804,4674,0.365042
Store_15,8922,3090,0.346335
Store_16,11857,6122,0.516319
Store_17,11468,5575,0.486135
Store_18,10021,5340,0.532881


In [16]:
# Move 'Store ID' out of the index into a regular column so it is stored as
# ordinary data in the pickle. Guarding on the index name keeps this cell
# idempotent: re-running it will not add a spurious 'index' column.
if df_store_level.index.name == 'Store ID':
    df_store_level = df_store_level.reset_index()

# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open("../Datasets/processed_data/store_level_transaction.pkl", 'wb') as pkl_file:
    pickle.dump(df_store_level, pkl_file)