# New Section

In [None]:
# ==========================================
# AMAZON DATASET PIPELINE
# Cleaning + Transformation + Enrichment
# ==========================================

import numpy as np
import pandas as pd

# -----------------------------
# 1. DATA EXTRACTION
# -----------------------------
# Read amazon.csv from the current working directory into `df` and
# report its (rows, columns) before any cleaning is applied.
df = pd.read_csv(filepath_or_buffer="amazon.csv")
print("Original Shape:", df.shape)

# -----------------------------
# 2. DATA CLEANING
# -----------------------------


def _strip_chars_to_numeric(column, chars):
    """Return *column* parsed as numbers after deleting every char in *chars*.

    Values are stringified first, each character in *chars* is removed as a
    literal (non-regex) substring, and anything that still fails to parse
    becomes NaN (``errors='coerce'``).
    """
    text = column.astype(str)
    for char in chars:
        text = text.str.replace(char, '', regex=False)
    return pd.to_numeric(text, errors='coerce')


# Drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Both price columns: strip the ₹ symbol and commas, then parse.
for price_col in ('discounted_price', 'actual_price'):
    df[price_col] = _strip_chars_to_numeric(df[price_col], ('₹', ','))

# Discount percentage: strip the % sign, then parse.
df['discount_percentage'] = _strip_chars_to_numeric(
    df['discount_percentage'], ('%',)
)

# Rating is parsed as-is; unparseable entries become NaN.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Rating count: strip commas, then parse.
df['rating_count'] = _strip_chars_to_numeric(df['rating_count'], (',',))

# Fill remaining gaps in numeric columns with that column's median.
df = df.fillna(df.median(numeric_only=True))

print("After Cleaning Shape:", df.shape)

# -----------------------------
# 3. DATA TRANSFORMATION
# -----------------------------

# Min-max scale rating onto the 0–1 range.  When every rating is identical
# the range is zero and dividing would give 0/0 = NaN for every row; in that
# case the centered values (all zeros, NaN preserved) are used directly.
rating_min = df['rating'].min()
rating_range = df['rating'].max() - rating_min
rating_centered = df['rating'] - rating_min
df['rating_normalized'] = (
    rating_centered if rating_range == 0 else rating_centered / rating_range
)

# Natural log of rating_count (to reduce skewness); non-positive counts map
# to 0.  NumPy is called directly because the `pd.np` alias was deprecated in
# pandas 1.0 and removed in 2.0.  Non-positive counts are replaced by 1 before
# taking the log, so they come out as exactly 0 (log(1) == 0) without ever
# evaluating the log of a non-positive number.
df['log_rating_count'] = np.log(
    df['rating_count'].mask(df['rating_count'] <= 0, 1)
)

# Express the discount percentage as a fraction (e.g. 25 -> 0.25).
df['discount_fraction'] = df['discount_percentage'] / 100

# -----------------------------
# 4. DATA ENRICHMENT (Feature Engineering)
# -----------------------------

actual = df['actual_price']
discounted = df['discounted_price']

# Savings: actual price minus discounted price.
df['savings'] = actual - discounted

# Main category: the text before the first '|' of the stringified value.
df['main_category'] = df['category'].apply(
    lambda raw: str(raw).partition('|')[0]
)

# Popularity score: rating weighted by the number of ratings.
df['popularity_score'] = df['rating_count'] * df['rating']

# Price segment: split the discounted-price range into equal-width bins,
# one per label, ordered from cheapest to most expensive.
segment_labels = ["Low", "Medium", "High"]
df['price_segment'] = pd.cut(
    discounted,
    bins=len(segment_labels),
    labels=segment_labels,
)

added_features = [
    "rating_normalized",
    "log_rating_count",
    "discount_fraction",
    "savings",
    "main_category",
    "popularity_score",
    "price_segment",
]
print("\nNew Features Added:")
print(added_features)

# -----------------------------
# 5. SAVE CLEANED DATASET
# -----------------------------

# Write the frame to CSV without the index column, then print a two-line
# confirmation naming the output file.
df.to_csv(path_or_buf="amazon_cleaned_enriched.csv", index=False)

print(
    "\nPipeline Completed Successfully",
    "Cleaned file saved as: amazon_cleaned_enriched.csv",
    sep="\n",
)
