In [None]:
import pandas as pd
import numpy as np

# Read the Step 1 output; the path is resolved relative to the working directory.
df = pd.read_csv('../amazon_sales.csv')

# A CSV round-trip does not keep dtypes, so the Step 1d conversions are re-applied.
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
for date_col in ('delivery_date', 'data_collected_at'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Cast every flag/label column to 'category' in one pass.
categorical_cols = ['product_category','is_best_seller','is_sponsored','has_coupon','buy_box_availability']
df = df.astype({name: 'category' for name in categorical_cols})

print(df.dtypes)

In [None]:
# Per-column null tally, plus the same tally as a percentage of all rows.
row_count = len(df)
missing = df.isna().sum()
missing_percent = missing.div(row_count).mul(100)

summary_columns = {
    'missing_count': missing,
    'missing_percent': missing_percent,
}
missing_summary = pd.DataFrame(summary_columns)
print(missing_summary)


In [None]:
# Numeric columns: fill gaps with each column's own median.
numeric_cols = ['product_rating', 'total_reviews', 'purchased_last_month',
                'discounted_price', 'original_price', 'discount_percentage']
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical / text columns: fill gaps with the literal label 'Unknown'.
categorical_cols = ['buy_box_availability', 'product_page_url', 'is_best_seller',
                    'is_sponsored', 'has_coupon', 'product_category']

for col in categorical_cols:
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        # A categorical column only accepts fill values that are registered
        # categories. add_categories() raises ValueError when the label already
        # exists, so register it only when absent; this keeps the cell safe to
        # re-run and safe for data that already contains 'Unknown'.
        if 'Unknown' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories('Unknown')
    df[col] = df[col].fillna('Unknown')

# Verify missing values: recompute the per-column null count and percentage
# after imputation so any remaining gaps are visible.
missing_after = df.isnull().sum()
missing_percent_after = (missing_after / len(df)) * 100
missing_summary_after = pd.DataFrame({'missing_count_after': missing_after,
                                      'missing_percent_after': missing_percent_after})
print(missing_summary_after)

In [None]:
# Outlier Detection & Handling (1.5 * IQR rule) for the numeric columns below.
# The same procedure is applied to every column, so it lives in one helper
# and one loop instead of a copy of the cell per column.

def iqr_bounds(series, k=1.5):
    """Compute the quartiles and IQR-based capping bounds of a Series.

    Parameters
    ----------
    series : pandas.Series
        Column to analyse; it is not modified.
    k : float, default 1.5
        Multiplier applied to the IQR when placing the bounds.

    Returns
    -------
    tuple
        ``(Q1, Q3, IQR, lower_bound, upper_bound)`` where
        ``lower_bound = Q1 - k * IQR`` and ``upper_bound = Q3 + k * IQR``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


# Columns to cap, in the order they are processed.
outlier_cols = ['total_reviews', 'purchased_last_month', 'discounted_price',
                'original_price', 'discount_percentage']

for col in outlier_cols:
    # Calculate Q1, Q3, IQR and the lower/upper bounds for this column
    Q1, Q3, IQR, lower_bound, upper_bound = iqr_bounds(df[col])

    print(f"{col} - Before capping:")
    print(df[col].describe())

    # Cap outliers. When IQR is 0 both bounds equal Q1 == Q3, and clipping
    # would overwrite every value in the column with that single number, so
    # the column is left untouched instead. A NaN IQR also lands in the
    # else-branch.
    if IQR > 0:
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    else:
        print(f"\n{col} - IQR is {IQR}; capping skipped to avoid flattening the column.")

    print(f"\n{col} - After capping:")
    print(df[col].describe())

In [None]:
# Persist the processed frame to the current working directory;
# index=False keeps the row index out of the file.
cleaned_csv_path = 'amazon_sales_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)