---
# **PHARMACY DASHBOARD PREREQUISITES**
---

---
#### IMPORT LIBRARIES


In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.pipeline import Pipeline

---
#### LOAD DATASET


In [5]:
# Read the three raw CSV extracts (customers, product master, transactions)
# from the working directory, one DataFrame per file.
customer_df, product_df, transaction_df = (
    pd.read_csv(csv_path)
    for csv_path in ("customer_data.csv", "product_master.csv", "transaction_data.csv")
)

In [6]:
# Customer data: preview the first rows of the frame read from customer_data.csv
customer_df.head()

Unnamed: 0,CustomerID,CustomerType,AgeGroup,Gender,Location,DateOfFirstPurchase,LastPurchaseDate
0,C00000,Returning,18-25,Male,Lake Richardberg,2024-02-14,2024-06-07
1,C00001,Returning,18-25,Other,Kellychester,2023-08-02,2024-07-12
2,C00002,New,18-25,Female,Longbury,2024-03-29,2024-08-04
3,C00003,Returning,<18,Other,Kellychester,2024-03-22,2024-04-08
4,C00004,Returning,26-40,Other,Brownberg,2024-01-05,2025-02-16


In [7]:
# Product master: preview the first rows of the frame read from product_master.csv
product_df.head()
# NOTE (author's definition of the ReorderLevel column, not computed or checked
# in this cell): Reorder Level = Average Daily Usage × Lead Time (in days)

Unnamed: 0,ProductID,ProductName,Category,CostPrice,StockQuantity,ReorderLevel,ExpiryDate,SellingPrice
0,P0000,Product rwHC,Wellness,352.1,313,19,2027-12-14,462.06
1,P0001,Product FUtF,Equipment,272.69,459,11,2026-06-10,370.96
2,P0002,Product KRjS,Prescription,161.67,463,25,2027-06-11,219.52
3,P0003,Product Ixmd,Wellness,408.76,421,46,2027-04-12,541.41
4,P0004,Product jnwE,Wellness,345.52,126,30,2026-11-22,487.45


In [8]:
# Transaction data: preview the first rows of the frame read from transaction_data.csv
transaction_df.head()

Unnamed: 0,TransactionID,Date,ProductID,Quantity,DiscountPercent,PaymentMethod,UnitPrice,TotalAmount,FinalAmount,CustomerID
0,T0000000,2023-11-09,P0927,8,15,UPI,643.38,5147.04,4374.98,C28772
1,T0000001,2024-02-14,P0615,8,30,Card,425.06,3400.48,2380.34,
2,T0000002,2023-10-24,P0910,4,0,UPI,149.23,596.92,596.92,C39200
3,T0000003,2024-10-20,P0753,1,10,Cash,80.37,80.37,72.33,C11128
4,T0000004,2025-02-04,P0333,9,30,Card,127.81,1150.29,805.2,C37550


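#### MERGE DATASETS

Join customers to their transactions on `CustomerID`, then attach product attributes on `ProductID`.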

In [10]:
# Step 1: customers x transactions, matched on CustomerID.
# Step 2: attach the product master attributes, matched on ProductID.
# Both merges use pandas' default inner join, so only rows whose key exists in
# both frames survive.
# NOTE(review): the transaction preview shows a row with a blank CustomerID;
# such rows have no customer match and are not carried into final_df —
# confirm that dropping them is intended for the dashboard totals.
c_t_df = customer_df.merge(transaction_df, on="CustomerID")
final_df = c_t_df.merge(product_df, on="ProductID")
final_df.head()

Unnamed: 0,CustomerID,CustomerType,AgeGroup,Gender,Location,DateOfFirstPurchase,LastPurchaseDate,TransactionID,Date,ProductID,...,UnitPrice,TotalAmount,FinalAmount,ProductName,Category,CostPrice,StockQuantity,ReorderLevel,ExpiryDate,SellingPrice
0,C00000,Returning,18-25,Male,Lake Richardberg,2024-02-14,2024-06-07,T0020173,2025-02-14,P0064,...,260.28,780.84,780.84,Product Cohk,Wellness,181.79,99,26,2026-11-20,260.28
1,C00001,Returning,18-25,Other,Kellychester,2023-08-02,2024-07-12,T0022207,2025-05-05,P0805,...,510.17,4081.36,3469.16,Product jMJW,Equipment,436.77,319,42,2026-08-11,510.17
2,C00001,Returning,18-25,Other,Kellychester,2023-08-02,2024-07-12,T0033667,2024-11-08,P0394,...,490.52,4905.2,3678.9,Product zsBG,Wellness,337.36,343,43,2026-06-28,490.52
3,C00002,New,18-25,Female,Longbury,2024-03-29,2024-08-04,T0019925,2024-12-01,P0237,...,514.67,5146.7,4374.7,Product ipVZ,OTC,410.13,451,31,2026-04-03,514.67
4,C00002,New,18-25,Female,Longbury,2024-03-29,2024-08-04,T0025197,2024-08-04,P0663,...,533.44,2667.2,2533.84,Product usEA,OTC,374.32,423,36,2028-02-28,533.44


In [11]:
# (rows, columns) of the merged frame
final_df.shape

(90000, 23)

In [12]:
# Per-column dtype and non-null count of the merged frame
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CustomerID           90000 non-null  object 
 1   CustomerType         90000 non-null  object 
 2   AgeGroup             90000 non-null  object 
 3   Gender               90000 non-null  object 
 4   Location             90000 non-null  object 
 5   DateOfFirstPurchase  90000 non-null  object 
 6   LastPurchaseDate     90000 non-null  object 
 7   TransactionID        90000 non-null  object 
 8   Date                 90000 non-null  object 
 9   ProductID            90000 non-null  object 
 10  Quantity             90000 non-null  int64  
 11  DiscountPercent      90000 non-null  int64  
 12  PaymentMethod        90000 non-null  object 
 13  UnitPrice            90000 non-null  float64
 14  TotalAmount          90000 non-null  float64
 15  FinalAmount          90000 non-null 

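#### AUTOMATED CLEANING

Detect date-like text columns and convert them to datetime, then split the remaining non-ID columns into numeric and categorical lists.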

In [14]:
# GET datetime columns
# Heuristic: an object-dtype column is treated as a date column when at least
# DATETIME_MATCH_RATIO of a small random sample of its non-null values can be
# parsed by pd.to_datetime.
DATETIME_SAMPLE_SIZE = 20    # maximum number of values probed per column
DATETIME_MATCH_RATIO = 0.8   # minimum share of probed values that must parse
DATETIME_SAMPLE_SEED = 42    # fixed seed so a fresh run detects the same columns


def _parses_as_datetime(value):
    """Return True when pd.to_datetime turns `value` into a real timestamp.

    Values that raise during parsing, or that parse to NaT (for example an
    empty string), are reported as False.
    """
    try:
        parsed = pd.to_datetime(value)
    except Exception:
        # Best-effort probe: any parsing failure simply means "not a date".
        return False
    return bool(pd.notna(parsed))


datetime_list = []

# take each column
for col in final_df.columns:
    # Only object (text) columns can hold unparsed dates; skip the others
    # before doing any string conversion or sampling.
    if not pd.api.types.is_object_dtype(final_df[col]):
        continue

    # Drop nulls and convert to string
    non_null_data = final_df[col].dropna().astype(str)

    # Sample min(DATETIME_SAMPLE_SIZE, available rows)
    sample_size = min(DATETIME_SAMPLE_SIZE, len(non_null_data))
    if sample_size == 0:
        # Column has no non-null values: nothing to probe, and the ratio
        # below would otherwise divide by zero.
        continue

    sample_rows = non_null_data.sample(sample_size, random_state=DATETIME_SAMPLE_SEED)
    count = sum(_parses_as_datetime(item) for item in sample_rows)

    if count / sample_size >= DATETIME_MATCH_RATIO:
        datetime_list.append(col)

datetime_list


['DateOfFirstPurchase', 'LastPurchaseDate', 'Date', 'ExpiryDate']

In [15]:
# Parse the text columns flagged in datetime_list into datetime64 values,
# overwriting each column of final_df in place.
# NOTE(review): detection accepts a column when 80% of a sample parses, while
# pd.to_datetime here raises on any value it cannot parse — confirm that a
# hard failure is the intended behaviour for partially dirty columns.
for dt in datetime_list:
    parsed_column = pd.to_datetime(final_df[dt])
    final_df[dt] = parsed_column

# dtype summary after the conversion
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   CustomerID           90000 non-null  object        
 1   CustomerType         90000 non-null  object        
 2   AgeGroup             90000 non-null  object        
 3   Gender               90000 non-null  object        
 4   Location             90000 non-null  object        
 5   DateOfFirstPurchase  90000 non-null  datetime64[ns]
 6   LastPurchaseDate     90000 non-null  datetime64[ns]
 7   TransactionID        90000 non-null  object        
 8   Date                 90000 non-null  datetime64[ns]
 9   ProductID            90000 non-null  object        
 10  Quantity             90000 non-null  int64         
 11  DiscountPercent      90000 non-null  int64         
 12  PaymentMethod        90000 non-null  object        
 13  UnitPrice            90000 non-

In [16]:
# Keep every column whose name does not contain the substring "ID"
# (identifier columns are not features to clean).
# NOTE(review): this is a case-sensitive substring test, so any future column
# whose name merely contains "ID" would be excluded as well.
num_cat_col = list(filter(lambda name: 'ID' not in name, final_df.columns))
num_cat_col

['CustomerType',
 'AgeGroup',
 'Gender',
 'Location',
 'DateOfFirstPurchase',
 'LastPurchaseDate',
 'Date',
 'Quantity',
 'DiscountPercent',
 'PaymentMethod',
 'UnitPrice',
 'TotalAmount',
 'FinalAmount',
 'ProductName',
 'Category',
 'CostPrice',
 'StockQuantity',
 'ReorderLevel',
 'ExpiryDate',
 'SellingPrice']

In [17]:
# Numeric feature columns: the non-ID columns whose dtype is numeric.
# (This list is a preparation step for the later imputation of null values.)
num_col = list(final_df[num_cat_col].select_dtypes(include='number'))
num_col

['Quantity',
 'DiscountPercent',
 'UnitPrice',
 'TotalAmount',
 'FinalAmount',
 'CostPrice',
 'StockQuantity',
 'ReorderLevel',
 'SellingPrice']

In [18]:
# Categorical feature columns: the non-ID columns stored as object or category dtype
cat_col = list(final_df[num_cat_col].select_dtypes(include=['object', 'category']))
cat_col

['CustomerType',
 'AgeGroup',
 'Gender',
 'Location',
 'PaymentMethod',
 'ProductName',
 'Category']

In [19]:
# numerical
numeric_but_cat = [col for col in num_col if final_df[col].nunique() < 10]
numeric_but_cat

# extend the list
for col in numeric_but_cat:
    final_df[col] = final_df[col].astype('object')
    cat_col.append(col)
    num_col.remove(col)

print(cat_col)
print(num_col)

['CustomerType', 'AgeGroup', 'Gender', 'Location', 'PaymentMethod', 'ProductName', 'Category', 'DiscountPercent']
['Quantity', 'UnitPrice', 'TotalAmount', 'FinalAmount', 'CostPrice', 'StockQuantity', 'ReorderLevel', 'SellingPrice']


In [None]:
# impute using pipeline
