# Product Dataset Cleaning

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Import Dataset

ds =  pd.read_csv("product_data.csv")
ds

Unnamed: 0,CustomerID,Product,Price,Quantity,City,PurchaseDate
0,1052,Desktop,,4.0,Bangalore,2021-06-30
1,1041,Desktop,,3.0,Delhi,2023-01-21
2,1057,Smartphone,forty-five thousand,2.0,Bangalore,2023-04-06
3,1038,Tablet,45000.0,1.0,Chennai,2023-07-05
4,1013,Smartphone,45000.0,,Mumbai,2022-04-11
...,...,...,...,...,...,...
120,1074,Laptop,45000.0,,Bangalore,2021-08-14
121,1017,Desktop,45000.0,,Pune,2022-08-09
122,1075,Desktop,30000.0,1.0,Pune,2021-09-28
123,1008,Laptop,60000.0,1.0,Chennai,2021-09-13


In [4]:
# Print first some rows
ds.head(10)

Unnamed: 0,CustomerID,Product,Price,Quantity,City,PurchaseDate
0,1052,Desktop,,4.0,Bangalore,2021-06-30
1,1041,Desktop,,3.0,Delhi,2023-01-21
2,1057,Smartphone,forty-five thousand,2.0,Bangalore,2023-04-06
3,1038,Tablet,45000.0,1.0,Chennai,2023-07-05
4,1013,Smartphone,45000.0,,Mumbai,2022-04-11
5,1094,Laptop,60000.0,two,Mumbai,2022-03-12
6,1004,Laptop,30000.0,2.0,Chennai,2022-02-25
7,1034,Laptop,30000.0,2.0,Chennai,2022-12-22
8,1086,Laptop,30000.0,3.0,Delhi,2022-10-23
9,1092,Desktop,30000.0,3.0,Mumbai,2022-03-12


In [5]:
# Print last some rows
ds.tail(10)

Unnamed: 0,CustomerID,Product,Price,Quantity,City,PurchaseDate
115,1059,Desktop,,3.0,Bangalore,2023-07-05
116,1049,Smartphone,30000.0,2.0,Bangalore,2022-11-07
117,1027,Smartphone,60000.0,4.0,Chennai,2023-08-04
118,1091,Smartphone,30000.0,,Mumbai,2021-01-16
119,1040,Smartphone,15000.0,1.0,Mumbai,2023-03-07
120,1074,Laptop,45000.0,,Bangalore,2021-08-14
121,1017,Desktop,45000.0,,Pune,2022-08-09
122,1075,Desktop,30000.0,1.0,Pune,2021-09-28
123,1008,Laptop,60000.0,1.0,Chennai,2021-09-13
124,1073,Laptop,15000.0,,Delhi,2021-01-01


In [6]:
# Describe data

ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    125 non-null    int64 
 1   Product       125 non-null    object
 2   Price         103 non-null    object
 3   Quantity      98 non-null     object
 4   City          125 non-null    object
 5   PurchaseDate  125 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.0+ KB


In [7]:
# 

ds.describe()

Unnamed: 0,CustomerID
count,125.0
mean,1045.936
std,28.183942
min,1000.0
25%,1019.0
50%,1044.0
75%,1071.0
max,1098.0


In [8]:
# Searching for duplicates

ds.duplicated().sum()

np.int64(5)

In [9]:
# Removing duplicates

ds.drop_duplicates(inplace = True)

In [10]:
# To get columns names(header) only

ds.columns

Index(['CustomerID', 'Product', 'Price', 'Quantity', 'City', 'PurchaseDate'], dtype='object')

In [11]:
# Handling Price column

ds['Price'] = pd.to_numeric(ds['Price'],errors='coerce')

In [12]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 119
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CustomerID    120 non-null    int64  
 1   Product       120 non-null    object 
 2   Price         97 non-null     float64
 3   Quantity      96 non-null     object 
 4   City          120 non-null    object 
 5   PurchaseDate  120 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.6+ KB


In [13]:
# Handiling Quantity column

ds["Quantity"].fillna("Null",inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ds["Quantity"].fillna("Null",inplace = True)


In [14]:
# Handiling Price column

ds["Price"].fillna("Null",inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ds["Price"].fillna("Null",inplace = True)
  ds["Price"].fillna("Null",inplace = True)


In [15]:
print(ds)

     CustomerID     Product    Price Quantity       City PurchaseDate
0          1052     Desktop     Null      4.0  Bangalore   2021-06-30
1          1041     Desktop     Null      3.0      Delhi   2023-01-21
2          1057  Smartphone     Null      2.0  Bangalore   2023-04-06
3          1038      Tablet  45000.0      1.0    Chennai   2023-07-05
4          1013  Smartphone  45000.0     Null     Mumbai   2022-04-11
..          ...         ...      ...      ...        ...          ...
115        1059     Desktop     Null      3.0  Bangalore   2023-07-05
116        1049  Smartphone  30000.0      2.0  Bangalore   2022-11-07
117        1027  Smartphone  60000.0      4.0    Chennai   2023-08-04
118        1091  Smartphone  30000.0     Null     Mumbai   2021-01-16
119        1040  Smartphone  15000.0      1.0     Mumbai   2023-03-07

[120 rows x 6 columns]


In [20]:
# Save data into new file

ds.to_csv("product_data")