In [89]:
import pandas as pd

In [90]:
df = pd.read_csv("uncleaned_sales_data.csv")
df.head()

Unnamed: 0,OrderID,ProductName,QuantitySold,Price,Date
0,101,Laptop,2.0,1000.0,2021-09-01
1,102,Mouse,1.0,50.5,2021-09-02
2,103,Keyboard,2.0,30.0,2021-09-03
3,103,Keyboard,,30.0,2021-09-03
4,105,,3.0,,


In [91]:
df['OrderID'].duplicated().sum()

1

In [92]:
# Remove any duplicate rows based on the OrderID.
df.drop_duplicates(subset=['OrderID'], keep=False, inplace=True)
df

Unnamed: 0,OrderID,ProductName,QuantitySold,Price,Date
0,101,Laptop,2.0,1000.0,2021-09-01
1,102,Mouse,1.0,50.5,2021-09-02
4,105,,3.0,,
5,106A,Monitor,4.0,70.2,2021-09-05


In [93]:
df['OrderID'].duplicated().sum()

0

In [94]:
# Fill in any missing values in the ProductName column with the string ‘UNKNOWN’.
df['ProductName'] = df['ProductName'].fillna('UNKNOWN')
df

Unnamed: 0,OrderID,ProductName,QuantitySold,Price,Date
0,101,Laptop,2.0,1000.0,2021-09-01
1,102,Mouse,1.0,50.5,2021-09-02
4,105,UNKNOWN,3.0,,
5,106A,Monitor,4.0,70.2,2021-09-05


In [95]:
# Replace any non-numeric values in the QuantitySold column with the median of the other values.
df["QuantitySold"] = pd.to_numeric(df['QuantitySold'], errors='coerce').fillna(df['QuantitySold'].median())
df

Unnamed: 0,OrderID,ProductName,QuantitySold,Price,Date
0,101,Laptop,2.0,1000.0,2021-09-01
1,102,Mouse,1.0,50.5,2021-09-02
4,105,UNKNOWN,3.0,,
5,106A,Monitor,4.0,70.2,2021-09-05


In [96]:
# Correct the data types for the Price column, and fill missing values with the mean price.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 5
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   OrderID       4 non-null      object 
 1   ProductName   4 non-null      object 
 2   QuantitySold  4 non-null      float64
 3   Price         3 non-null      float64
 4   Date          3 non-null      object 
dtypes: float64(2), object(3)
memory usage: 144.0+ bytes


In [98]:
df['Price'] = df['Price'].fillna(df['Price'].mean()).round(2)
df

Unnamed: 0,OrderID,ProductName,QuantitySold,Price,Date
0,101,Laptop,2.0,1000.0,2021-09-01
1,102,Mouse,1.0,50.5,2021-09-02
4,105,UNKNOWN,3.0,373.57,
5,106A,Monitor,4.0,70.2,2021-09-05


In [108]:
# Make sure the Date column is in the correct date format. Replace any invalid dates with the most frequent date.
df['Date'] = df['Date'].astype('datetime64[ns]')
most_recent_date = max(df['Date'])
df['Date'].fillna(most_recent_date, inplace=True)
df

Unnamed: 0,OrderID,ProductName,QuantitySold,Price,Date
0,101,Laptop,2.0,1000.0,2021-09-01
1,102,Mouse,1.0,50.5,2021-09-02
4,105,UNKNOWN,3.0,373.57,2021-09-05
5,106A,Monitor,4.0,70.2,2021-09-05
