# **Importing Libraries**

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [7]:
# Read the boutique sales CSV from the Kaggle input directory and list its columns.
DATASET_PATH = '/kaggle/input/retail-fashion-boutique-data-sales-analytics-2025/fashion_boutique_dataset.csv'

df = pd.read_csv(DATASET_PATH)
print(df.columns)

Index(['product_id', 'category', 'brand', 'season', 'size', 'color',
       'original_price', 'markdown_percentage', 'current_price',
       'purchase_date', 'stock_quantity', 'customer_rating', 'is_returned',
       'return_reason'],
      dtype='object')


# **Handling Missing and Duplicate Values**

In [8]:
# Impute missing ratings with the column mean, rounded to the nearest whole number.
average_rating = df['customer_rating'].mean().round()
df['customer_rating'] = df['customer_rating'].fillna(average_rating)

# Only the last expression of a cell is displayed, so a bare null count in the
# middle of the cell would be discarded; assert the invariant instead.
assert df['customer_rating'].notna().all(), 'customer_rating still contains missing values'

# drop_duplicates() returns a new object rather than modifying df, so the result
# has to be assigned back. Deduplicate whole rows by product_id (first occurrence
# is kept) and rebuild a contiguous index.
df = df.drop_duplicates(subset='product_id').reset_index(drop=True)

df.head()

Unnamed: 0,product_id,category,brand,season,size,color,original_price,markdown_percentage,current_price,purchase_date,stock_quantity,customer_rating,is_returned,return_reason
0,FB000001,Outerwear,Zara,Spring,XL,Red,196.01,0.0,196.01,2025-07-05,37,3.0,False,
1,FB000002,Tops,Uniqlo,Winter,L,Pink,119.64,0.0,119.64,2025-08-06,2,2.5,False,
2,FB000003,Accessories,Uniqlo,Winter,,Black,33.8,0.0,33.8,2025-08-06,22,4.3,False,
3,FB000004,Shoes,Uniqlo,Spring,XL,Black,75.36,0.0,75.36,2025-07-07,48,2.6,False,
4,FB000005,Tops,Banana Republic,Winter,XL,Black,105.02,0.0,105.02,2025-08-06,10,3.0,False,


# **Performing EDA**

In [9]:
# Summarise the markdown column: its distinct values and both extremes.
markdown_col = df['markdown_percentage']
md_percentages = markdown_col.unique()
min_md_percentage, max_md_percentage = markdown_col.min(), markdown_col.max()

print(f'Max %:{max_md_percentage}\nMin% :{min_md_percentage}')

# Number of distinct markdown percentages (shown as the cell output).
md_percentages.size

Max %:59.9
Min% :0.0


458

In [10]:
# Count missing original prices and compute the mean price to three decimals.
original_prices = df['original_price']
og_price_null_vals = original_prices.isna().sum()
avg_og_price = original_prices.mean().round(3)

print(f'Null values : {og_price_null_vals}\nAverage Price : {avg_og_price}')

Null values : 0
Average Price : 97.2


In [19]:
# Distinct brand names, in order of first appearance.
brands = df['brand'].unique()

# Give rows without a size an explicit placeholder label.
# NOTE(review): 'Unkown' is misspelled; it is kept verbatim because it is a data
# value that other cells may match on — confirm before renaming.
df.loc[df['size'].isna(), 'size'] = 'Unkown'

# Brand x color contingency table: each cell is the number of rows for that pair,
# with 0 where a combination never occurs.
brand_by_color = df.pivot_table(
    index='brand',
    columns='color',
    aggfunc='size',
    fill_value=0,
)
brand_by_color

color,Beige,Black,Blue,Brown,Gray,Green,Navy,Pink,Purple,Red,White
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Ann Taylor,30,27,15,14,28,22,27,22,20,19,33
Banana Republic,17,22,30,31,27,33,26,15,28,28,28
Forever21,17,24,22,22,15,28,24,30,17,26,25
Gap,17,31,23,18,25,22,26,17,22,24,19
H&M,25,30,27,21,19,30,30,23,23,26,29
Mango,19,29,23,27,14,31,24,42,25,26,24
Uniqlo,22,27,25,19,29,26,17,28,23,25,19
Zara,29,26,26,28,32,30,18,27,29,38,30


# **Pie Charts of Color Distribution by Brand**

In [12]:
# NOTE: the plotting loop below is commented out, so this cell renders nothing when run.
# If re-enabled, it would draw one 6x6 pie chart per entry in `brands`, each showing
# that brand's `color` value counts as percentages.
# for brand in brands:
#     brand_data = df[df['brand'] == brand]['color'].value_counts()
#     plt.figure(figsize=(6,6))
#     plt.pie(brand_data, labels=brand_data.index, autopct='%1.1f%%', startangle=90)
#     plt.title(f"Color distribution for {brand}")
#     plt.show()

In [13]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,product_id,category,brand,season,size,color,original_price,markdown_percentage,current_price,purchase_date,stock_quantity,customer_rating,is_returned,return_reason
0,FB000001,Outerwear,Zara,Spring,XL,Red,196.01,0.0,196.01,2025-07-05,37,3.0,False,
1,FB000002,Tops,Uniqlo,Winter,L,Pink,119.64,0.0,119.64,2025-08-06,2,2.5,False,
2,FB000003,Accessories,Uniqlo,Winter,Unkown,Black,33.8,0.0,33.8,2025-08-06,22,4.3,False,
3,FB000004,Shoes,Uniqlo,Spring,XL,Black,75.36,0.0,75.36,2025-07-07,48,2.6,False,
4,FB000005,Tops,Banana Republic,Winter,XL,Black,105.02,0.0,105.02,2025-08-06,10,3.0,False,


In [18]:
# Only the final expression of a cell is rendered, so this first call shows nothing.
df['return_reason'].unique()

# Label every missing return reason as 'Other'.
# NOTE(review): rows with is_returned == False also have no reason and therefore
# end up as 'Other' — confirm that is the intended meaning.
reason_col = df['return_reason']
df['return_reason'] = reason_col.where(reason_col.notna(), 'Other')

# Distinct reasons after filling (this is the displayed output).
df['return_reason'].unique()

array(['Other', 'Color Mismatch', 'Size Issue', 'Damaged',
       'Quality Issue', 'Changed Mind', 'Wrong Item'], dtype=object)

In [42]:
# Split purchase_date into a month-name column and a year column, then drop the
# raw date column.
#
# The guard makes the cell safe to re-run: once purchase_date has been dropped,
# executing the unguarded code again raises KeyError on the first line. Building a
# new frame with assign()/drop() also avoids mutating df in place.
if 'purchase_date' in df.columns:
    purchase_dates = pd.to_datetime(df['purchase_date'])
    df = df.assign(
        purchase_month=purchase_dates.dt.month_name(),
        purchase_year=purchase_dates.dt.year,
    ).drop(columns='purchase_date')

df.head()

Unnamed: 0,product_id,category,brand,season,size,color,original_price,markdown_percentage,current_price,stock_quantity,customer_rating,is_returned,return_reason,purchase_month,purchase_year
0,FB000001,Outerwear,Zara,Spring,XL,Red,196.01,0.0,196.01,37,3.0,False,Other,July,2025
1,FB000002,Tops,Uniqlo,Winter,L,Pink,119.64,0.0,119.64,2,2.5,False,Other,August,2025
2,FB000003,Accessories,Uniqlo,Winter,Unkown,Black,33.8,0.0,33.8,22,4.3,False,Other,August,2025
3,FB000004,Shoes,Uniqlo,Spring,XL,Black,75.36,0.0,75.36,48,2.6,False,Other,July,2025
4,FB000005,Tops,Banana Republic,Winter,XL,Black,105.02,0.0,105.02,10,3.0,False,Other,August,2025


array([False,  True])