In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np

# Step 2: Read the raw sales records from the CSV file and preview the first rows
df = pd.read_csv("sales_data.csv")
print("Original Sales Data:\n", df.head())

# Step 3: Report how many values are missing in each column
missing_per_column = df.isna().sum()
print("\nMissing Values in Each Column:\n", missing_per_column)

# Step 4: Discard rows lacking a critical field (Product_ID or Sales_Amount);
# all other gaps are filled in by the later steps.
critical_columns = ["Product_ID", "Sales_Amount"]
df = df.dropna(subset=critical_columns)
print("\nAfter Dropping Critical Missing Values:\n", df.head())

# Step 5: Fill Missing Price with Average Price per Category
# Compute, for every row, the mean Price of that row's Category, and use it
# only where Price is missing. Filling from the original column (instead of
# assigning the transform result directly) keeps an existing Price intact for
# rows whose Category is itself missing, since such rows belong to no group.
# A Category whose prices are all missing has a NaN mean, so those rows stay NaN.
category_mean_price = df.groupby("Category")["Price"].transform("mean")
df["Price"] = df["Price"].fillna(category_mean_price)

# Step 6: Fill Missing Customer_Location using forward fill and backward fill
# Forward fill copies the previous row's location into each gap; the backward
# fill then covers any gaps left at the very top of the column.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df["..."] operates on an intermediate object and stops updating df under
# pandas Copy-on-Write, and fillna(method=...) is deprecated in favour of
# .ffill() / .bfill().
df["Customer_Location"] = df["Customer_Location"].ffill().bfill()

# Step 7: Fill Missing Purchase_Date with the Most Frequent Date
# mode() returns the most frequent values (ties included); the first is used.
# It is empty when the column has no non-null values, in which case there is
# nothing to fill with and the gaps are left for Step 8 to turn into NaT.
purchase_date_modes = df["Purchase_Date"].mode()
if not purchase_date_modes.empty:
    most_freq_date = purchase_date_modes.iloc[0]
    # Assign back rather than using a chained inplace fillna, which acts on an
    # intermediate object and stops updating df under pandas Copy-on-Write.
    df["Purchase_Date"] = df["Purchase_Date"].fillna(most_freq_date)

# Step 8: Convert Purchase_Date to datetime
df["Purchase_Date"] = pd.to_datetime(df["Purchase_Date"])

# Step 9: Standardize Column Names
df = df.rename(columns={
    "Sales_Amount": "Total_Sales",
    "Purchase_Date": "Date_Of_Purchase",
    "Customer_Location": "Location"
})

# Step 10: Remove Duplicate Transactions (rows identical in every column)
df = df.drop_duplicates()

# Step 11: Change Product_ID to string
# A numeric ID column that contained missing values is parsed as float
# (e.g. 101.0), so a direct astype(str) would yield "101.0". When the column
# is float and every value is a whole number, go through int64 first so the
# ID is written as "101". Any other dtype is stringified unchanged.
product_ids = df["Product_ID"]
if pd.api.types.is_float_dtype(product_ids) and (product_ids % 1 == 0).all():
    product_ids = product_ids.astype("int64")
df["Product_ID"] = product_ids.astype(str)

# Step 12: Correct Payment_Method typos
# Map each known misspelling to its canonical label; values not listed here
# are left untouched. Both "Csh" and "credt" occur in the raw data.
payment_typo_fixes = {
    "Csh": "Cash",
    "credt": "Credit",
}
df["Payment_Method"] = df["Payment_Method"].replace(payment_typo_fixes)

# Step 13: Order rows chronologically by Date_Of_Purchase and renumber the index
df = df.sort_values("Date_Of_Purchase", ignore_index=True)

# Step 14: Write the cleaned dataset to a new CSV file, omitting the row index
output_path = "cleaned_sales_data.csv"
df.to_csv(output_path, index=False)

print("\nFinal Cleaned Sales Data:\n", df.head())


Original Sales Data:
    Product_ID Product_Name     Category  Quantity    Price  Sales_Amount  \
0       101.0        Shirt     Clothing         2    500.0        1000.0   
1       102.0       Laptop  Electronics         1  45000.0       45000.0   
2       103.0        Shoes     Footwear         3   2500.0        7500.0   
3         NaN        Phone  Electronics         1      NaN       20000.0   
4       105.0        Watch  Accessories         2   1500.0           NaN   

  Customer_Location Purchase_Date Payment_Method  
0          New York     1/15/2023           Cash  
1               NaN     1/16/2023            Csh  
2           Chicago           NaN         Credit  
3       Los Angeles     1/18/2023          credt  
4           Houston     1/19/2023          Debit  

Missing Values in Each Column:
 Product_ID           1
Product_Name         0
Category             0
Quantity             0
Price                2
Sales_Amount         1
Customer_Location    3
Purchase_Date        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Customer_Location"].fillna(method="ffill", inplace=True)
  df["Customer_Location"].fillna(method="ffill", inplace=True)
  df["Customer_Location"].fillna(method="bfill", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Purchase_Date"].fillna(most_freq_date, in