In [3]:
import pandas as pd

def clean_sales_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw sales table.

    Steps, in order:
      1. Standardize column names (strip, lowercase, spaces -> underscores).
      2. Strip the rupee sign and thousands separators from the price
         columns and convert them to float.
      3. Strip ``%`` from ``discount_percentage`` and convert to float.
      4. Strip thousands separators from ``rating_count`` and convert to a
         nullable integer column.
      5. Parse ``date_added`` as ``day-month-year``.
      6. Drop fully duplicated rows.
      7. Fill missing ``rating`` values with the column mean.

    Args:
        df: Raw table. After name standardization it must contain the columns
            ``discounted_price``, ``actual_price``, ``discount_percentage``,
            ``rating_count``, ``date_added`` and ``rating``.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: If a required column is absent.
        ValueError: If a value cannot be parsed as a number or a date.
    """
    df = df.copy()

    # Names are normalized FIRST: every later step looks columns up by their
    # normalized name, so a header such as " Rating " must be fixed up front.
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    # Series.replace (unlike the .str accessor) also accepts columns that
    # pandas already parsed as numeric, so the same call works either way.
    for price_col in ('discounted_price', 'actual_price'):
        df[price_col] = (
            df[price_col].replace({'₹': '', ',': ''}, regex=True).astype(float)
        )

    df['discount_percentage'] = (
        df['discount_percentage'].replace({'%': ''}, regex=True).astype(float)
    )

    # Plain int cannot hold missing values, so a single blank count would
    # abort the whole run; the nullable Int64 dtype keeps integers and gaps.
    counts = df['rating_count'].replace({',': ''}, regex=True)
    df['rating_count'] = pd.to_numeric(counts).astype('Int64')

    df['date_added'] = pd.to_datetime(df['date_added'], format='%d-%m-%Y')

    df = df.drop_duplicates()

    # The mean is taken after de-duplication so repeated rows do not skew it.
    # NOTE(review): assumes 'rating' is already numeric - confirm against the
    # real file; a text column would make .mean() fail here.
    if df['rating'].isnull().any():
        df['rating'] = df['rating'].fillna(df['rating'].mean())

    return df


if __name__ == "__main__":
    # Deliberately left at module scope (not wrapped in a main() function) so
    # that `df` remains a global for any code that runs after this.
    df = pd.read_csv("amazon_sales.csv")
    df = clean_sales_data(df)

    # Save the cleaned dataset
    df.to_csv("amazon_sales_cleaned.csv", index=False)
    print("Cleaned dataset saved as 'amazon_sales_cleaned.csv'")


Cleaned dataset saved as 'amazon_sales_cleaned.csv'
