In [1]:
import pandas as pd

# Clean the raw "amazon.csv" export: normalise the column names, drop
# duplicate rows, convert the price / discount / rating columns to numeric
# dtypes, and write the result to "amazon_cleaned.csv".


def _strip_literals(column, literals):
    """Return *column* with every substring in *literals* removed.

    Each entry is removed as a literal substring (``regex=False``) so the
    result does not depend on the pandas version's default regex mode.
    """
    for literal in literals:
        column = column.str.replace(literal, '', regex=False)
    return column


# Load the dataset
df = pd.read_csv("amazon.csv")

# Step 1: Rename columns to lowercase and replace spaces with underscores
df.columns = (
    df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
)

# Step 2: Drop duplicate rows
df.drop_duplicates(inplace=True)

# Step 3: Handle missing values - fill rating_count nulls with '0'
# (done before the integer cast below, which cannot represent NaN)
df['rating_count'] = df['rating_count'].fillna('0')

# Step 4: Clean and convert price, discount, rating and rating_count fields
# Both price columns get the same treatment: strip '₹' and ',' and cast.
for price_column in ('discounted_price', 'actual_price'):
    df[price_column] = _strip_literals(
        df[price_column], ('₹', ',')
    ).astype(float)

df['discount_percentage'] = _strip_literals(
    df['discount_percentage'], ('%',)
).astype(float)
df['rating_count'] = _strip_literals(df['rating_count'], (',',)).astype(int)

# errors='coerce' turns ratings that cannot be parsed as numbers into NaN
# instead of raising.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Step 5: Save cleaned dataset
df.to_csv("amazon_cleaned.csv", index=False)

print("✅ Dataset cleaned and saved as 'amazon_cleaned.csv'")


✅ Dataset cleaned and saved as 'amazon_cleaned.csv'
