# Data cleaning notebook for "product dataset"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Read the raw product table, then work on a copy so `dataset` stays untouched.
dataset = pd.read_csv(filepath_or_buffer='products.csv')
c_df = dataset.copy(deep=True)
c_df.head(5)

Unnamed: 0,sku_id,product_name,category,brand,MRP,base_cost,launch_date
0,P0001,Program Go,Electronics,,1308.75,823.9,2024-07-07
1,P0002,Whole Max,Apparel,BrandD,1465.23,854.11,2023-09-02
2,P0003,Happy Plus,Electronics,BrandE,537.82,353.3,2021-08-14
3,P0004,Sure Go,Beauty,BrandA,532.78,328.46,2022-07-16
4,P0005,Though Go,Sports,BrandD,1316.92,769.06,2022-06-18


In [None]:
# Working frame for all cleaning steps below.
# An explicit copy is taken: pd.DataFrame(c_df) does not copy its input by
# default, so the result would not be independent of c_df.
df_product = c_df.copy()
df_product

Unnamed: 0,sku_id,product_name,category,brand,MRP,base_cost,launch_date
0,P0001,Program Go,Electronics,,1308.75,823.90,2024-07-07
1,P0002,Whole Max,Apparel,BrandD,1465.23,854.11,2023-09-02
2,P0003,Happy Plus,Electronics,BrandE,537.82,353.30,2021-08-14
3,P0004,Sure Go,Beauty,BrandA,532.78,328.46,2022-07-16
4,P0005,Though Go,Sports,BrandD,1316.92,769.06,2022-06-18
...,...,...,...,...,...,...,...
1495,P1496,tree go,Sports,BrandE,1519.69,832.34,2023-01-09
1496,P1497,Start Lite,Beauty,BrandB,1506.48,903.68,2021-04-16
1497,P1498,Practice Lite,Home & Kitchen,BrandB,801.53,462.18,2023-12-23
1498,P1499,nature x,Beauty,BrandB,1290.21,,2022-05-02


In [None]:
# Initial audit: column names, non-null counts and dtypes of the working frame
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sku_id        1500 non-null   object 
 1   product_name  1500 non-null   object 
 2   category      1500 non-null   object 
 3   brand         1354 non-null   object 
 4   MRP           1345 non-null   float64
 5   base_cost     1347 non-null   float64
 6   launch_date   1500 non-null   object 
dtypes: float64(2), object(5)
memory usage: 82.2+ KB


In [None]:
# Summary statistics, one row per numeric column
df_product.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MRP,1345.0,848.720178,443.830127,78.75,487.3,808.8,1189.6,1910.44
base_cost,1347.0,546.25121,265.675363,101.27,321.695,538.41,784.11,999.01


In [None]:
# How many missing data points do we have in each column?
df_product.isna().sum()

Unnamed: 0,0
sku_id,0
product_name,0
category,0
brand,146
MRP,155
base_cost,153
launch_date,0


In [None]:
# Let's look at column data types that look suspicious
df_product['launch_date'].dtype

dtype('O')

In [None]:
# Products without a brand get an explicit "No Brand" label
brand = df_product['brand']
df_product['brand'] = brand.where(brand.notna(), "No Brand")

In [None]:
# Record which prices were originally missing (1 = missing, 0 = present)
# before any imputation overwrites that information.
for flag_col, source_col in (('mrp_missing', 'MRP'), ('base_cost_missing', 'base_cost')):
    df_product[flag_col] = df_product[source_col].isnull().astype(int)

In [None]:
# Impute MRP: fill each missing value with the median MRP of its category.
# The built-in 'median' aggregation replaces a per-group Python lambda, and
# fillna only touches entries that are actually missing.
category_median_mrp = df_product.groupby('category')['MRP'].transform('median')
df_product['MRP'] = df_product['MRP'].fillna(category_median_mrp)

In [None]:
# Impute base_cost: fill each missing value with the median base_cost of its
# category. The built-in 'median' aggregation replaces a per-group Python
# lambda, and fillna only touches entries that are actually missing.
category_median_cost = df_product.groupby('category')['base_cost'].transform('median')
df_product['base_cost'] = df_product['base_cost'].fillna(category_median_cost)

In [None]:
# Verify: count the remaining missing values per column
df_product.isna().sum()

Unnamed: 0,0
sku_id,0
product_name,0
category,0
brand,0
MRP,0
base_cost,0
launch_date,0
mrp_missing,0
base_cost_missing,0


In [None]:
# Convert launch_date from text to datetime.
# errors='coerce' turns any unparseable value into NaT instead of raising.
parsed_launch_date = pd.to_datetime(df_product['launch_date'], errors='coerce')
df_product['launch_date'] = parsed_launch_date

print(df_product['launch_date'].dtype)

datetime64[ns]


In [None]:
# Re-run the audit: column names, non-null counts and dtypes at this point
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   sku_id             1500 non-null   object        
 1   product_name       1500 non-null   object        
 2   category           1500 non-null   object        
 3   brand              1500 non-null   object        
 4   MRP                1500 non-null   float64       
 5   base_cost          1500 non-null   float64       
 6   launch_date        1500 non-null   datetime64[ns]
 7   mrp_missing        1500 non-null   int64         
 8   base_cost_missing  1500 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 105.6+ KB


In [None]:
#check for outliers
def outliers_IQR(df, col, k=1.5):
    """Flag values of ``df[col]`` that lie outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    col : str
        Name of a numeric column in ``df``.
    k : float, default 1.5
        Fence multiplier: the fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``.

    Returns
    -------
    pandas.Series
        Integer Series aligned with ``df[col]``: 1 where the value is strictly
        below the lower fence or strictly above the upper fence, otherwise 0.
        Missing values are never flagged.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # Vectorized comparison; comparisons against NaN are False, so missing
    # values end up as 0 rather than being flagged.
    return ((values < lower_bound) | (values > upper_bound)).astype(int)


In [None]:
# Add a 0/1 IQR-outlier flag column for each price field
for flag_name, value_col in (('mrp_outlier', 'MRP'), ('base_cost_outlier', 'base_cost')):
    df_product[flag_name] = outliers_IQR(df_product, value_col)

In [None]:
# Report how many rows each outlier flag marked
n_mrp_outliers = df_product['mrp_outlier'].sum()
n_base_cost_outliers = df_product['base_cost_outlier'].sum()
print("MRP outliers:", n_mrp_outliers)
print("Base cost outliers:", n_base_cost_outliers)


MRP outliers: 0
Base cost outliers: 0


**In any retail or e-commerce business, the margin (MRP - base_cost) is the per-unit gross profit before applying discounts or dynamic pricing strategies.**

It answers:
- How much are we making on this product?
- Can we afford to discount this item?
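- How much pricing headroom do we have — can we change prices without killing profit? (Margin shows the headroom; measuring actual price elasticity also requires demand data.)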

In [None]:
# Per-product margin: list price minus base cost
df_product['margin'] = df_product['MRP'].sub(df_product['base_cost'])

In [None]:
# Flag records whose margin is below zero (potential data or business issue).
# NOTE(review): rows where MRP or base_cost was imputed (mrp_missing /
# base_cost_missing == 1) may appear here purely as an imputation artefact --
# confirm before treating them as genuine pricing problems.
df_product['negative_margin'] = df_product['margin'].lt(0)

In [None]:
# Preview the price, cost and derived margin columns side by side
margin_preview_cols = ['MRP', 'base_cost', 'margin', 'negative_margin']
df_product[margin_preview_cols].head(10)

Unnamed: 0,MRP,base_cost,margin,negative_margin
0,1308.75,823.9,484.85,False
1,1465.23,854.11,611.12,False
2,537.82,353.3,184.52,False
3,532.78,328.46,204.32,False
4,1316.92,769.06,547.86,False
5,400.84,238.49,162.35,False
6,874.95,602.155,272.795,False
7,264.76,602.155,-337.395,True
8,385.47,289.1,96.37,False
9,442.5,315.85,126.65,False


### What signals would I want to know if I were deciding pricing strategy?

In [None]:
# Fixed reference date, so product ages are reproducible across runs
today = pd.to_datetime("2025-07-09")
# Age of product in whole calendar months (the day of the month is ignored).
# Vectorized via the .dt accessor instead of a row-by-row apply.
launch = df_product['launch_date'].dt
df_product['product_age_months'] = (today.year - launch.year) * 12 + (today.month - launch.month)

# Flag for product age buckets: "new" is at most 3 months old,
# "stale" is more than 18 months old
df_product['is_new'] = df_product['product_age_months'] <= 3
df_product['is_stale'] = df_product['product_age_months'] > 18

In [None]:
# Price-cost ratio (efficiency of pricing): MRP divided by base cost
df_product['price_cost_ratio'] = df_product['MRP'].div(df_product['base_cost'])

In [None]:
# High margin flag: margin at or above the 75th percentile
margin_75th = df_product['margin'].quantile(q=0.75)
df_product['high_margin'] = df_product['margin'].ge(margin_75th)

In [None]:
# Low margin flag: margin strictly below the 25th percentile
margin_25th = df_product['margin'].quantile(q=0.25)
df_product['low_margin'] = df_product['margin'].lt(margin_25th)

In [None]:
# Preview the first rows with all engineered columns
df_product.head(5)

Unnamed: 0,sku_id,product_name,category,brand,MRP,base_cost,launch_date,mrp_missing,base_cost_missing,mrp_outlier,base_cost_outlier,margin,negative_margin,product_age_months,is_new,is_stale,price_cost_ratio,high_margin,low_margin
0,P0001,Program Go,Electronics,No Brand,1308.75,823.9,2024-07-07,0,0,0,0,484.85,False,12,False,False,1.588482,True,False
1,P0002,Whole Max,Apparel,BrandD,1465.23,854.11,2023-09-02,0,0,0,0,611.12,False,22,False,True,1.715505,True,False
2,P0003,Happy Plus,Electronics,BrandE,537.82,353.3,2021-08-14,0,0,0,0,184.52,False,47,False,True,1.522276,False,False
3,P0004,Sure Go,Beauty,BrandA,532.78,328.46,2022-07-16,0,0,0,0,204.32,False,36,False,True,1.622054,False,False
4,P0005,Though Go,Sports,BrandD,1316.92,769.06,2022-06-18,0,0,0,0,547.86,False,37,False,True,1.712376,True,False


In [None]:
# Write the cleaned frame to disk; the positional index is not saved
cleaned_filename = 'cleaned_products.csv'
df_product.to_csv(path_or_buf=cleaned_filename, index=False)

# # 3. (FOR COLAB ONLY) Create download link
# from google.colab import files
# files.download(cleaned_filename)


# **Light exploratory data analysis**