### Importing the Libraries and Modules

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### Loading the Dataset

In [2]:
# Folder containing the warehouse CSV files. Defaults to the original location;
# set the WAREHOUSE_DATA_DIR environment variable to run on another machine.
data_dir = Path(os.environ.get("WAREHOUSE_DATA_DIR", "/Users/somesh-19583/Desktop/Warehouse"))

# Read the raw product data and preview the first five rows
df = pd.read_csv(data_dir / "Source Data.csv")
df.head(5)

Unnamed: 0,Product ID,Height (cm),Width (cm),Depth (cm),Avg Daily Demand,Product Category,Score
0,P20106,35.436002,22.622569,33.657885,49.123508,Electronics,6.697377
1,P41070,28.556396,24.461528,44.912779,16.094638,Electronics,86.751324
2,P29033,28.387143,16.138052,18.433578,17.235752,Food,39.181937
3,P47714,23.283766,24.277033,26.857256,6.472163,Clothing,36.539454
4,P14692,9.607678,23.973594,37.116032,15.737616,Food,54.279738


### Understanding the data

In [3]:
# Dimensions of the frame as (number of rows, number of columns)
df.shape

(1500, 7)

In [4]:
# Column labels of the frame
df.columns

Index(['Product ID', 'Height (cm)', 'Width (cm)', 'Depth (cm)',
       'Avg Daily Demand', 'Product Category', 'Score'],
      dtype='object')

In [5]:
# Print the distinct values found in each column, one line per column
col = df.columns.tolist()
for c in col:
    distinct_values = df[c].unique()
    print(f"Unique values in column '{c}':", distinct_values)


Unique values in column 'Product ID': ['P20106' 'P41070' 'P29033' ... 'P94537' 'P13285' 'P74710']
Unique values in column 'Height (cm)': [35.43600155 28.55639588 28.38714288 ... 23.96014813 35.93101258
 34.92451264]
Unique values in column 'Width (cm)': [22.62256938 24.46152849 16.13805182 ... 21.24152547 21.51764934
 28.77722169]
Unique values in column 'Depth (cm)': [33.6578847  44.91277935 18.43357777 ... 24.67756979 32.849773
 32.45406721]
Unique values in column 'Avg Daily Demand': [ 49.12350782  16.09463812  17.2357522  ...  44.39963113   5.39719143
 100.2457902 ]
Unique values in column 'Product Category': ['Electronics' 'Food' 'Clothing' 'Toys' 'Furniture']
Unique values in column 'Score': [ 6.69737661 86.75132427 39.18193732 ... 34.2018825  16.43990227
 28.41628057]


In [6]:
# Data type pandas assigned to each column
df.dtypes

Product ID           object
Height (cm)         float64
Width (cm)          float64
Depth (cm)          float64
Avg Daily Demand    float64
Product Category     object
Score               float64
dtype: object

In [7]:
# Count of missing (NaN) cells in each column before any imputation
df.isna().sum()

Product ID           0
Height (cm)         45
Width (cm)          45
Depth (cm)          45
Avg Daily Demand    45
Product Category     0
Score                0
dtype: int64

In [8]:
# Number of rows that exactly repeat an earlier row (all columns are compared)
# NOTE(review): this does not detect a Product ID that appears more than once with
# different measurements - consider also checking df['Product ID'].duplicated().sum()
df.duplicated().sum()

np.int64(0)

### Handling the Missing Values

In [9]:
# List of float columns with missing data
float_cols = ['Height (cm)', 'Width (cm)', 'Depth (cm)', 'Avg Daily Demand']

# Per-category imputation: within each Product Category, a column's NaNs are
# replaced by that category's mean for the same column.
category_filled = df.groupby('Product Category')[float_cols].transform(
    lambda x: x.fillna(x.mean())
)

# Overall column means, computed before any filling. Used only as a fallback:
# a category with no observed values in a column has a NaN mean, so the
# per-category step cannot fill it.
overall_means = df[float_cols].mean()

# Fill only the cells that are missing (existing values are never overwritten),
# first from the per-category result and then from the overall column mean.
df[float_cols] = df[float_cols].fillna(category_filled).fillna(overall_means)

# Fail loudly instead of silently carrying incomplete rows forward.
remaining = df[float_cols].isna().sum()
assert remaining.sum() == 0, (
    f"Missing values remain after imputation:\n{remaining[remaining > 0]}"
)


🔍 Explanation:
groupby('Product Category') groups the data by each category.

.transform(lambda x: x.fillna(x.mean())) replaces NaN with the mean within each group.

It handles several columns in one call because selecting [float_cols] on the grouped object runs the function on each of those columns separately, within each category.

In [10]:
# Re-count missing (NaN) cells per column to confirm the fill step left none
df.isna().sum()

Product ID          0
Height (cm)         0
Width (cm)          0
Depth (cm)          0
Avg Daily Demand    0
Product Category    0
Score               0
dtype: int64

### Generating Summary Statistics

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
# NOTE(review): the recorded output shows negative minimums for Height (cm),
# Width (cm) and Depth (cm); negative dimensions look like invalid measurements and
# are not handled before the data is exported - confirm whether that is intended.
df.describe()

Unnamed: 0,Height (cm),Width (cm),Depth (cm),Avg Daily Demand,Score
count,1500.0,1500.0,1500.0,1500.0,1500.0
mean,30.796467,20.068831,24.669364,51.293451,50.251611
std,10.270986,7.784547,9.011673,29.409824,28.342403
min,-2.412673,-4.156097,-2.068691,0.002902,0.017552
25%,23.991791,15.052393,18.59033,30.864887,26.442748
50%,30.75905,19.951531,24.860655,50.413994,50.223098
75%,36.755961,25.08861,30.339684,69.08921,73.782073
max,70.190101,51.409902,54.187837,188.554577,99.736618


### Storing the Cleaned Data in a csv

In [12]:
# Destination folder for the cleaned CSV. Defaults to the original location;
# set the WAREHOUSE_DATA_DIR environment variable to write somewhere else.
output_dir = Path(os.environ.get("WAREHOUSE_DATA_DIR", "/Users/somesh-19583/Desktop/Warehouse"))

# index=False keeps the row index out of the file, so only the data columns are written
df.to_csv(output_dir / "Cleaned Data.csv", index=False)
print("The Data saved Successfully in a CSV File!!!")

The Data saved Successfully in a CSV File!!!
