In [None]:
import pandas as pd
import numpy as np
import unicodedata, re

# Workbook read by pd.read_excel() in the LOAD step.
IN_PATH = "integrated_data.xlsx"
# Workbook written by DataFrame.to_excel() in the SAVE step.
OUT_PATH = "integrated_data_transformed.xlsx"

def strip_accents_lower(s):
    """Return an accent-free, lower-cased copy of ``s`` for matching/dedup.

    Intended for building comparison keys only; store the result in a
    separate column rather than overwriting the original text.

    Args:
        s: Any scalar. Values that ``pd.isna`` reports as missing are
            returned unchanged; everything else is converted with ``str``.

    Returns:
        The whitespace-stripped, lower-cased text with combining marks
        removed and the Vietnamese letter U+0111 folded to "d", or ``s``
        itself when it is missing.
    """
    if pd.isna(s):
        return s
    text = str(s).strip().lower()
    # NFKD splits precomposed letters into base letter + combining marks,
    # so dropping the combining marks removes the diacritics.
    decomposed = unicodedata.normalize("NFKD", text)
    base = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # U+0111 (d with stroke) has no Unicode decomposition, so NFKD leaves it
    # untouched; fold it explicitly. The upper-case form U+0110 was already
    # lower-cased above.
    return base.replace("\u0111", "d")

# ========== LOAD ==========
df = pd.read_excel(IN_PATH)

# ========== TRANSFORMATIONS ==========
# 1) Normalized helper columns for matching (display columns are untouched).
# Map the raw column directly: strip_accents_lower() stringifies non-missing
# values itself, whereas a prior .astype(str) would turn NaN into the literal
# text "nan" and bypass the helper's missing-value guard.
for col in ("name", "brand", "category_l1"):
    if col in df.columns:
        df[f"{col}_norm"] = df[col].map(strip_accents_lower)

# 2) Clean / normalize the discount column
if "discount_percent" in df.columns:
    # Unparseable entries become NaN, then values are bounded and rounded.
    # NOTE(review): the lower bound of -5 lets slightly negative discounts
    # through -- confirm this is intended.
    discount = pd.to_numeric(df["discount_percent"], errors="coerce")
    discount = discount.clip(lower=-5, upper=100).round(2)
    df["discount_percent_clean"] = discount
    # Missing discounts count as "no discount".
    df["discount_flag"] = discount.fillna(0) > 0

# 3) Features derived from price / rating
if "price" in df.columns:
    price = pd.to_numeric(df["price"], errors="coerce")
    # Mask non-positive (and unparseable) prices to NaN *before* taking the
    # log, so log1p never sees values <= -1 and emits no RuntimeWarning.
    # The resulting values match log1p for price > 0 and NaN otherwise.
    df["price_log1p"] = np.log1p(price.where(price > 0))

if "rating_average" in df.columns:
    rating = pd.to_numeric(df["rating_average"], errors="coerce")
    df["rating_round_1"] = rating.round(1)

# 4) String features of the product name
if "name" in df.columns:
    # astype(str) renders a missing name as the literal text "nan", which
    # would be measured as length 3 / one word. Remember which rows were
    # missing and blank those measurements out again afterwards.
    name_missing = df["name"].isna()
    name_text = df["name"].astype(str)
    df["name_len"] = name_text.str.len().mask(name_missing)
    df["name_word_count"] = name_text.str.split().str.len().mask(name_missing)

# ========== REORDER (optional; keeps the column style from integration) ==========
# Columns pinned to the start and to the end of the frame, when present.
preferred_front = [
    "category_l1",
    "price",
    "discount_percent",
    "rating_average",
    "name",
    "brand",
    "seller_id",
]
preferred_end = [
    "image_path",
    "thumbnail_url",
    "impression_info_0_metadata_delivery_zone",
]
front_cols = [c for c in preferred_front if c in df.columns]
end_cols = [c for c in preferred_end if c in df.columns]
# Everything not pinned keeps its current relative order in the middle.
pinned = set(front_cols + end_cols)
others = [c for c in df.columns if c not in pinned]
df = df[front_cols + others + end_cols]

# ========== SAVE ==========
# Write the transformed frame without the index column, then report the
# destination path and the final (rows, columns) shape.
df.to_excel(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH} with shape: {df.shape}")


Saved: integrated_data_transformed.xlsx with shape: (499, 21)
