In [1]:
%pwd

'c:\\Users\\sam\\End-to-End-Fashion-Recommendation-System-with-MLOps\\research'

In [2]:
import os
# Move from the notebook folder (`research/`) up to the project root so that
# `src...` imports and the relative paths in the config resolve.
# Guarded on the folder name so that re-running this cell (or starting the
# kernel at the project root) does not climb above the repository.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [58]:
%pwd

'c:\\Users\\sam\\End-to-End-Fashion-Recommendation-System-with-MLOps'

In [125]:
import pandas as pd 
import regex as re
import pickle

In [134]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Data_transformation:
    """Paths consumed by the data-transformation stage."""

    # Validated CSV that the transformation step reads.
    valid_data: Path
    # Directory in which the transformed CSV is written.
    transformed_data: Path
    # Directory in which the pickled frame is written.
    model_path: Path



In [135]:
from src.recommendation_system.utils.common import read_yaml , create_dir
from src.recommendation_system.logging import logger
from src.recommendation_system.constants import CONFIG_PATH
from src.recommendation_system.logging import logger

In [141]:
class Config_manager:
    """Load the project YAML configuration and hand out per-stage config objects."""

    def __init__(self, config_path=CONFIG_PATH):
        """Read the YAML file at ``config_path`` and pass its ``artifacts_root`` to ``create_dir``."""
        self.config = read_yaml(config_path)

        artifacts_root = self.config.artifacts_root
        create_dir([artifacts_root])

    def get_data_transformation(self) -> Data_transformation:
        """Build the ``Data_transformation`` config from the ``data_transformation`` section.

        The two output locations (``transformed_data`` and ``model_path``) are
        passed to ``create_dir`` before the config object is returned.
        """
        section = self.config.data_transformation

        output_dirs = [section.transformed_data, section.model_path]
        create_dir(output_dirs)

        return Data_transformation(
            valid_data=section.valid_data,
            transformed_data=section.transformed_data,
            model_path=section.model_path,
        )

In [146]:
import pandas as pd
import numpy as np

class Data_transformation_check:
    """Clean the validated product data and persist the transformed frame.

    Each ``fix_*`` method repairs one column of the frame it receives; the
    frame is modified in place and also returned so calls can be chained.
    ``initiate_data_transformation`` is the pipeline entry point.
    """

    def __init__(self, config):
        # config should contain paths like:
        # config.valid_data        (CSV file that is read)
        # config.transformed_data  (directory for Transformed_Data.csv)
        # config.model_path        (directory for Transformed_Data.pkl)
        self.config = config

    # ── Fix price missing values ─────────────────────────
    def fix_price(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill missing prices with the column median, then apply ``log1p``.

        Args:
            df: frame with a numeric ``price`` column.

        Returns:
            The same frame with ``price`` imputed and log-transformed.
        """
        missing = df['price'].isna().sum()
        median_val = df['price'].median()

        df['price'] = df['price'].fillna(median_val)
        df['price'] = np.log1p(df['price'])
        logger.info(
            "[Price] %d missing -> median=%.2f",
            missing,
            median_val
        )
        return df

    # ── Fix review_count ────────────────────────────────
    def fix_review_count(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse textual review counts to integers, then apply ``log1p``.

        Accepts plain numbers, comma-grouped numbers ("1,234"), and the
        suffixes ``k`` (x1,000) and ``l`` (x100,000). Missing or unparseable
        values become 0.

        Args:
            df: frame with a ``review_count`` column.

        Returns:
            The same frame with ``review_count`` numeric and log-transformed.
        """

        def parse_review(x):
            if pd.isna(x):
                return 0

            # Drop thousands separators so "1,234" is not silently turned into 0.
            x = str(x).lower().strip().replace(',', '')
            try:
                if 'k' in x:
                    return int(float(x.replace('k', '')) * 1000)
                elif 'l' in x:
                    return int(float(x.replace('l', '')) * 100000)
                else:
                    return int(float(x))
            except (ValueError, OverflowError):
                # ValueError: not a number / NaN; OverflowError: int(float('inf')).
                return 0

        missing = df['review_count'].isna().sum()
        df['review_count'] = df['review_count'].apply(parse_review)
        df['review_count'] = np.log1p(df['review_count'])

        logger.info(
            "[ReviewCount] %d missing -> filled with 0",
            missing
        )
        return df

    # ── Fix discount ────────────────────────────────
    def fix_discount(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract the integer discount percentage from each ``discount`` value.

        The first number found in the value is used and truncated to an int,
        so "-94%" -> 94 and "10.0" -> 10. Missing values and values without
        any digits become 0.

        Args:
            df: frame with a ``discount`` column.

        Returns:
            The same frame with ``discount`` as integers.
        """

        def parse_discount(x):
            if pd.isna(x):
                return 0
            # Take the first number as a whole instead of concatenating every
            # digit in the string (which would turn "10.0" into 100).
            match = re.search(r'\d+(?:\.\d+)?', str(x))
            return int(float(match.group())) if match else 0

        missing = df['discount'].isna().sum()
        df['discount'] = df['discount'].apply(parse_discount).astype('int')

        logger.info(
            "[Discount] %d missing -> 0 | range=%d%%–%d%%",
            missing,
            df['discount'].min(),
            df['discount'].max()
        )
        return df

    # ── Fix rating ──────────────────────────────────────
    def fix_rating(self, df: pd.DataFrame) -> pd.DataFrame:
        """Flag unrated products and impute ``rating`` with the per-aesthetic median.

        Adds ``is_new_product`` (1 where ``rating`` was missing). Missing
        ratings are filled with the median of the row's ``aesthetic`` group;
        when that group has no ratings (or ``aesthetic`` is missing) the
        overall median of the observed ratings is used. Existing ratings are
        never changed.

        Args:
            df: frame with ``rating`` and ``aesthetic`` columns.

        Returns:
            The same frame with ``rating`` imputed and ``is_new_product`` added.
        """
        missing = df['rating'].isna().sum()

        # Flag BEFORE any fill — signal lost forever after
        df['is_new_product'] = df['rating'].isna().astype(int)

        # Group median per aesthetic
        # old_money median != y2k_party median
        group_median = df.groupby('aesthetic')['rating'].transform('median')
        overall_median = df['rating'].median()
        df['rating'] = df['rating'].fillna(group_median).fillna(overall_median)

        logger.info(
            "[Rating] %d missing -> group median | %d new products flagged",
            missing,
            df['is_new_product'].sum()
        )
        return df

    # ── Main transformation logic ───────────────────────
    def get_data_transformer_data(self) -> pd.DataFrame:
        """Load the validated CSV, clean it, and save it as CSV and pickle.

        Reads ``config.valid_data``; writes ``Transformed_Data.csv`` into
        ``config.transformed_data`` and ``Transformed_Data.pkl`` into
        ``config.model_path``.

        Returns:
            The transformed frame.

        Raises:
            AssertionError: if ``price`` or ``review_count`` still contain
                missing values after the transformation.
        """

        # Load validated data
        df = pd.read_csv(self.config.valid_data)
        logger.info("Loaded data | shape=%s", df.shape)

        # Before checks
        logger.info("Price missing BEFORE: %d", df['price'].isna().sum())
        logger.info("Review_count missing BEFORE: %d", df['review_count'].isna().sum())

        # Transform
        df = self.fix_price(df)
        df = self.fix_review_count(df)
        df = self.fix_discount(df)
        df = self.fix_rating(df)

        print(df.isna().sum())

        # After checks (PROOF)
        logger.info("Price missing AFTER: %d", df['price'].isna().sum())
        logger.info("Review_count missing AFTER: %d", df['review_count'].isna().sum())

        # Hard guarantees
        assert df['price'].isna().sum() == 0, "Price still has missing values"
        assert df['review_count'].isna().sum() == 0, "Review count still has missing values"

        df_csv_path = os.path.join(self.config.transformed_data, 'Transformed_Data.csv')
        df.to_csv(df_csv_path, index=False)
        logger.info("Saved CSV -> %s", df_csv_path)

        # Save PKL — preserves dtypes for next step
        pkl_path = os.path.join(
            self.config.model_path, 'Transformed_Data.pkl'
        )
        # Context manager so the handle is flushed and closed deterministically.
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(df, pkl_file)
        logger.info("Saved PKL -> %s", pkl_path)

        return df

    # ── Pipeline entry point ────────────────────────────
    def initiate_data_transformation(self) -> pd.DataFrame:
        """Run the full transformation with start/end log banners.

        Returns:
            The transformed frame produced by ``get_data_transformer_data``.
        """
        logger.info("=" * 20 + " Data Transformation STARTED " + "=" * 20)
        df = self.get_data_transformer_data()
        logger.info("=" * 20 + " Data Transformation COMPLETED " + "=" * 20)
        return df

In [147]:
# Build the configuration manager (reads the YAML config via Config_manager.__init__).
con = Config_manager()

# Get the Data_transformation config object for this stage.
data_transformation_config = con.get_data_transformation()

# Create the transformation object from that config.
data_transformation = Data_transformation_check(data_transformation_config)



# Run the transformation pipeline.
# NOTE(review): `df` only holds a DataFrame if initiate_data_transformation()
# returns one — confirm before using `df` in later cells.
df = data_transformation.initiate_data_transformation()  

[2026-02-27 00:47:17,588: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-02-27 00:47:17,600: INFO: common: created directory at: artifacts]
[2026-02-27 00:47:17,606: INFO: common: created directory at: artifacts/data_transformation/transformed data/]
[2026-02-27 00:47:17,611: INFO: common: created directory at: artifacts/data_transformation/transformed model/]


[2026-02-27 00:47:17,992: INFO: 2790599238: Loaded data | shape=(16364, 9)]
[2026-02-27 00:47:17,997: INFO: 2790599238: Price missing BEFORE: 117]
[2026-02-27 00:47:17,997: INFO: 2790599238: Review_count missing BEFORE: 4022]
[2026-02-27 00:47:18,016: INFO: 2790599238: [Price] 117 missing -> median=788.00]
[2026-02-27 00:47:18,051: INFO: 2790599238: [ReviewCount] 4022 missing -> filled with 0]
[2026-02-27 00:47:18,257: INFO: 2790599238: [Discount] 1037 missing -> 0 | range=0%–94%]
[2026-02-27 00:47:18,297: INFO: 2790599238: [Rating] 4022 missing -> group median | 4022 new products flagged]
asin              0
product_name      0
price             0
rating            0
review_count      0
discount          0
image_url         0
product_link      0
aesthetic         0
is_new_product    0
dtype: int64
[2026-02-27 00:47:18,318: INFO: 2790599238: Price missing AFTER: 0]
[2026-02-27 00:47:18,320: INFO: 2790599238: Review_count missing AFTER: 0]
[2026-02-27 00:47:18,681: INFO: 2790599238: Sav