In [None]:
import pandas as pd
import numpy as np

# Load the cleaned dataset produced by the earlier cleaning step.
df = pd.read_csv('../amazon_sales_cleaned.csv')

# CSV stores timestamps as text; re-parse them. errors='coerce' turns any
# unparseable value into NaT instead of raising.
for date_col in ('delivery_date', 'data_collected_at'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# CSV also loses the categorical dtype; restore it in one astype call.
categorical_cols = ['product_category','is_best_seller','is_sponsored','has_coupon','buy_box_availability']
df = df.astype({col: 'category' for col in categorical_cols})

# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() (that would emit a trailing "None" line).
df.info()

In [None]:
# Sales Score: min-max scale last month's purchase counts so the smallest
# observed value maps to 0 and the largest maps to 1.
monthly_purchases = df['purchased_last_month']
min_sales, max_sales = monthly_purchases.min(), monthly_purchases.max()
sales_range = max_sales - min_sales

df['Sales_Score'] = (monthly_purchases - min_sales) / sales_range

# Sanity check: distribution of the score, then raw vs. scaled values side by side
print(df['Sales_Score'].describe())
print(df[['purchased_last_month', 'Sales_Score']].head())

In [None]:
# Rating Score: min-max scale product ratings relative to the lowest and
# highest rating present in this dataset (not an absolute rating scale).
ratings = df['product_rating']
min_rating = ratings.min()
max_rating = ratings.max()

df['Rating_Score'] = ratings.sub(min_rating).div(max_rating - min_rating)

# Sanity check: distribution of the score, then raw vs. scaled values side by side
print(df['Rating_Score'].describe())
print(df[['product_rating', 'Rating_Score']].head())


In [None]:
# Approximate Return Rate = total_reviews / purchased_last_month
# NOTE(review): this ratio is a stand-in, not a measured return rate —
# confirm the interpretation with whoever defined the health score.
#
# A row with zero purchases would make the ratio +/-inf. A single inf then
# becomes max_rr, which squeezes every finite row's normalized value to 0
# (score 1.0) and destroys the signal. Mask zero denominators to NaN so those
# rows get a missing score and the remaining rows are scaled properly.
purchases = df['purchased_last_month']
df['Return_Rate'] = df['total_reviews'] / purchases.where(purchases != 0)

# Normalize and invert (1 - normalized Return_Rate): a lower ratio gives a
# higher score. min()/max() skip NaN, so masked rows do not affect the range.
min_rr = df['Return_Rate'].min()
max_rr = df['Return_Rate'].max()

df['Return_Rate_Score'] = 1 - ((df['Return_Rate'] - min_rr) / (max_rr - min_rr))

# Check summary stats
print(df['Return_Rate_Score'].describe())
print(df[['total_reviews', 'purchased_last_month', 'Return_Rate_Score']].head())

In [None]:
# Fields a listing is expected to have filled in; each one counts equally
# toward the completeness score.
important_cols = [
    'product_title',
    'product_rating',
    'total_reviews',
    'discounted_price',
    'original_price',
    'product_image_url',
    'product_page_url',
    'delivery_date',
]

# Completeness = fraction of the important fields that are non-null in a row
# (1.0 means every field is present, 0.0 means none are).
field_present = df[important_cols].notna()
df['Completeness_Score'] = field_present.mean(axis=1)

# Sanity check: distribution of the score and a few example rows
print(df['Completeness_Score'].describe())
print(df[['product_title','Completeness_Score']].head())


In [None]:
# Health Score: weighted blend of the four component scores.
# Weights (0.4 / 0.3 / 0.15 / 0.15) sum to 1.0. Series addition propagates
# NaN, so a row missing any component gets a missing Health_Score.
sales_term = 0.4 * df['Sales_Score']
rating_term = 0.3 * df['Rating_Score']
return_rate_term = 0.15 * df['Return_Rate_Score']
completeness_term = 0.15 * df['Completeness_Score']

df['Health_Score'] = sales_term + rating_term + return_rate_term + completeness_term

# Sanity check: distribution of the final score and a few example rows
print(df['Health_Score'].describe())
print(df[['product_title','Health_Score']].head())


In [None]:
# Persist the frame, including the score columns added above, for downstream
# use. index=False keeps the row index out of the CSV.
output_path = '../amazon_sales_with_health_scores.csv'
df.to_csv(output_path, index=False)