In [1]:
%pwd

'c:\\Users\\sam\\End-to-End-Fashion-Recommendation-System-with-MLOps\\research'

In [2]:
import os

# Step up to the project root only while the kernel is still inside research/.
# An unconditional chdir('../') climbs one more level every time this cell is
# re-run, which silently breaks the relative config/artifact paths used later.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [58]:
%pwd

'c:\\Users\\sam\\End-to-End-Fashion-Recommendation-System-with-MLOps'

In [59]:
import pandas as pd 

In [93]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Data_transformation:
    """Bundle of filesystem locations consumed by the data-transformation stage."""

    # Location of the validated input data.
    valid_data: Path
    # Location set aside for the transformed output.
    transformed_data: Path



In [89]:
from src.recommendation_system.utils.common import read_yaml , create_dir
from src.recommendation_system.logging import logger
from src.recommendation_system.constants import CONFIG_PATH
from src.recommendation_system.logging import logger

In [94]:
class Config_manager:
    """Reads the project configuration and hands out per-stage config objects."""

    def __init__(self, config_path=CONFIG_PATH):
        """Load the configuration at `config_path` and create the artifacts root.

        Args:
            config_path: Location of the configuration file passed to `read_yaml`.
        """
        self.config = read_yaml(config_path)
        artifacts_root = self.config.artifacts_root
        create_dir([artifacts_root])

    def get_data_transformation(self) -> Data_transformation:
        """Build the data-transformation config from the `data_transformation` section.

        The `transformed_data` location is passed to `create_dir` before the
        config object is assembled.

        Returns:
            A `Data_transformation` carrying `valid_data` and `transformed_data`
            exactly as they appear in the loaded configuration.
        """
        section = self.config.data_transformation
        create_dir([section.transformed_data])

        return Data_transformation(
            valid_data=section.valid_data,
            transformed_data=section.transformed_data,
        )

In [None]:
import pandas as pd
import numpy as np

class Data_transformation_check:
    """Column-level clean-up steps for the validated dataset.

    Each `fix_*` method modifies the frame it is given in place and also
    returns it, so the steps can be chained by reassignment.
    """

    def __init__(self, config):
        """Store the stage configuration.

        Args:
            config: Object exposing `valid_data` (CSV path read by the pipeline)
                and `transformed_data`.
        """
        self.config = config

    # ── Fix price missing values ─────────────────────────
    def fix_price(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill missing `price` values with the column median.

        Args:
            df: Frame containing a `price` column.

        Returns:
            The same frame, with `price` gaps filled.
        """
        missing = df['price'].isna().sum()
        median_val = df['price'].median()

        df['price'] = df['price'].fillna(median_val)

        logger.info(
            "[Price] %d missing -> median=%.2f",
            missing,
            median_val
        )
        return df

    # ── Fix review_count ────────────────────────────────
    @staticmethod
    def _parse_review_count(value):
        """Convert one raw review count to an int, or None when it cannot be read.

        Accepts plain numbers, thousands-separated numbers ("1,234") and the
        suffixes "k" (x1,000) and "l" (x100,000), case-insensitively.
        """
        if pd.isna(value):
            return None

        # Commas are dropped so grouped numbers parse instead of becoming 0.
        text = str(value).lower().strip().replace(',', '')
        try:
            if 'k' in text:
                # round() rather than int(): scaling a float can land just
                # below the intended integer.
                return round(float(text.replace('k', '')) * 1000)
            if 'l' in text:
                return round(float(text.replace('l', '')) * 100000)
            return int(float(text))
        except (ValueError, OverflowError):
            # OverflowError covers inputs such as "inf".
            return None

    def fix_review_count(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalise `review_count` to integers, using 0 for missing values.

        Values that are present but cannot be parsed are also set to 0 and are
        reported separately in a warning so they are not mistaken for gaps.

        Args:
            df: Frame containing a `review_count` column.

        Returns:
            The same frame, with `review_count` converted.
        """
        raw = df['review_count']
        missing = int(raw.isna().sum())

        parsed = [self._parse_review_count(value) for value in raw]
        unparseable = sum(item is None for item in parsed) - missing

        df['review_count'] = [0 if item is None else item for item in parsed]

        logger.info(
            "[ReviewCount] %d missing -> filled with 0",
            missing
        )
        if unparseable > 0:
            logger.warning(
                "[ReviewCount] %d unparseable value(s) -> set to 0",
                unparseable
            )
        return df

    # ── Fix discount ────────────────────────────────
    def fix_discount(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reduce each `discount` entry to its digit characters.

        Missing entries stay missing instead of turning into empty strings, so
        later `isna()` checks still see them.

        NOTE(review): every non-digit is removed, including a decimal point, so
        a value such as 45.0 would become '450' - confirm the column holds
        text like "-45%" before relying on this.

        Args:
            df: Frame containing a `discount` column.

        Returns:
            The same frame, with `discount` holding digit-only strings.
        """
        discount = df['discount']
        missing = int(discount.isna().sum())

        digits_only = discount.astype(str).str.replace(r'[^\d]', '', regex=True)
        # where() keeps the cleaned text for present values and NaN elsewhere.
        df['discount'] = digits_only.where(discount.notna())

        logger.info("[Discount] %d missing left as-is", missing)
        return df

    # ── Main transformation logic ───────────────────────
    def get_data_transformer_data(self) -> pd.DataFrame:
        """Load the validated CSV, fix `price` and `review_count`, and verify.

        NOTE(review): `fix_discount` is not applied here and nothing is written
        to `config.transformed_data`; confirm whether either is intended.

        Returns:
            The transformed frame.

        Raises:
            AssertionError: If `price` or `review_count` still has missing values.
        """
        # Load validated data
        df = pd.read_csv(self.config.valid_data)
        logger.info("Loaded data | shape=%s", df.shape)

        # Before checks
        logger.info("Price missing BEFORE: %d", df['price'].isna().sum())
        logger.info("Review_count missing BEFORE: %d", df['review_count'].isna().sum())

        # Transform
        df = self.fix_price(df)
        df = self.fix_review_count(df)

        print(df.isna().sum())

        # After checks (PROOF)
        logger.info("Price missing AFTER: %d", df['price'].isna().sum())
        logger.info("Review_count missing AFTER: %d", df['review_count'].isna().sum())

        # Hard guarantees
        assert df['price'].isna().sum() == 0, "Price still has missing values"
        assert df['review_count'].isna().sum() == 0, "Review count still has missing values"

        # Return the result so callers that assign it do not end up with None.
        return df

    # ── Pipeline entry point ────────────────────────────
    def initiate_data_transformation(self) -> pd.DataFrame:
        """Run the transformation between start/completion log banners.

        Returns:
            The transformed frame produced by `get_data_transformer_data`.
        """
        logger.info("=" * 20 + " Data Transformation STARTED " + "=" * 20)
        df = self.get_data_transformer_data()
        logger.info("=" * 20 + " Data Transformation COMPLETED " + "=" * 20)
        return df

In [115]:
con = Config_manager()

# get config
data_transformation_config = con.get_data_transformation()

# create transformation object
data_transformation = Data_transformation_check(data_transformation_config)

# The pipeline loads its input from data_transformation_config.valid_data, so
# the CSV is not read separately here. The previous read used an absolute,
# machine-specific path and its result was overwritten by the line below.
df = data_transformation.initiate_data_transformation()

[2026-02-26 23:15:21,299: INFO: common: yaml file: config\config.yaml loaded successfully]
[2026-02-26 23:15:21,301: INFO: common: created directory at: artifacts]
[2026-02-26 23:15:21,303: INFO: common: created directory at: artifacts/data_transformation/transformed/]
[2026-02-26 23:15:21,513: INFO: 549047326: Loaded data | shape=(16364, 9)]
[2026-02-26 23:15:21,513: INFO: 549047326: Price missing BEFORE: 117]
[2026-02-26 23:15:21,513: INFO: 549047326: Review_count missing BEFORE: 4022]
[2026-02-26 23:15:21,524: INFO: 549047326: [Price] 117 missing -> median=788.00]
[2026-02-26 23:15:21,529: INFO: 549047326: [ReviewCount] 4022 missing -> filled with 0]
asin               0
product_name       0
price              0
rating          4022
review_count       0
discount        1037
image_url          0
product_link       0
aesthetic          0
dtype: int64
[2026-02-26 23:15:21,560: INFO: 549047326: Price missing AFTER: 0]
[2026-02-26 23:15:21,562: INFO: 549047326: Review_count missing AFTER