In [1]:
import os


In [2]:
%pwd
os.chdir("../")

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ProcessedDataConfigure:
    root_dir: Path
    train_path: Path
    test_path: Path
    data_path: Path
    vectorizer_path: Path
    test_size: int
    random_state: int
    target: str
    cols_not_use: list
    ngram: tuple
    max_features: int
    min_df: int



In [4]:
import os
from Project_2.constants import *
from Project_2.utils.common import read_yaml, create_directories
from pathlib import Path

class ConfigureManager:
    def __init__(self,
                config_filepath: Path = CONFIG_FILE_PATH,
                params_filepath: Path = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_processed_data_config(self) -> ProcessedDataConfigure:
        processed = self.config.data_process
        params = self.params
        create_directories([processed.root_dir])

        processed_data_config = ProcessedDataConfigure(
            root_dir = Path(processed.root_dir),
            train_path = Path(processed.train_path),
            test_path = Path(processed.test_path),
            data_path = Path(processed.data_path),
            vectorizer_path = Path(processed.vectorizer_path),
            test_size = params.TEST_SIZE,
            random_state= params.RANDOM_STATE,
            target= params.TARGET,
            cols_not_use = params.COLS_NOT_USE,
            ngram = tuple(params.NGRAM),
            max_features = params.MAX_FEATURES,
            min_df = params.MIN_DF,
        )
        return processed_data_config



In [None]:
import pandas as pd
from pyvi import ViTokenizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

class ProcessedData:
    def __init__(self, config = ProcessedDataConfigure):
        self.config = config
        self.stopwords = {
                            'thì', 'là', 'mà', 'và', 'của', 'những', 'các', 'cái', 'việc', 'bị', 'bởi',
                            'shop', 'sản_phẩm', 'hàng', 'giao', 'mua', 'bán', 'mình', 'tiki', 'shopee',
                            'nhé', 'nha', 'ạ', 'ơi', 'nhen', 'kaka', 'hihi'
                        }

        self.teencode = {
                        'k': 'không', 'ko': 'không', 'kh': 'không', 'hok': 'không',
                        'dc': 'được', 'đc': 'được',
                        'bt': 'bình thường',
                        'wa': 'quá',
                        'uk': 'ừ',
                        'z': 'vậy',
                        'sp': 'sản phẩm'
                    }

        self.vectorizer = TfidfVectorizer(
                                            ngram_range=self.config.ngram,
                                            max_features=self.config.max_features,
                                            min_df=self.config.min_df # Bỏ qua những từ xuất hiện quá ít (dưới 2 lần)
                                        )

    def load_data(self) -> pd.DataFrame:
        try:
            path = self.config.data_path
            data = pd.read_csv(path)
            print("Load data completed!")
            return data
        except Exception as e:
            print(f"Error: {e}")
            raise e

    # Droip columns not use in training. You can change cols in params
    def drop_cols_not_use(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            data = data.drop(self.config.cols_not_use, axis=1)
            print("Drop columns not use in training !")
            return data
        except Exception as e:
            print(f"Error: {e}")
            raise e

    # clean text
    def clean_text(self, text : str) -> None:
        # lower
        text = text.lower()

        # remove emoji
        emoji_pattern  = re.compile(
                                    u"[\U0001F600-\U0001F64F"  # Emoticons (Mặt cười...)
                                    u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                                    u"\U0001F680-\U0001F6FF"  # Transport & Map
                                    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols (Các emoji mới)
                                    u"\u2600-\u26FF"          # Misc Symbols (Dấu tim, quân bài...)
                                    u"\u2700-\u27BF"          # Dingbats
                                    u"\u200D\uFE0F]+"         # Zero-width joiner (cho các emoji ghép)
                                )

        text = emoji_pattern.sub(r'', text)
        # remove special text
        text = re.sub(r'[^\w\s,.!?]', '', text)
        text = re.sub(r'[.,!?;:]', '', text)

        # processed teen code
        words = text.split()
        words = [self.teencode.get(word, word) for word in words]
        text = ' '.join(words)

        text_tokenized = ViTokenizer.tokenize(text)

        tokens = text_tokenized.split()
        clean_tokens = [t for t in tokens if t not in self.stopwords]

        return ' '.join(clean_tokens)

    def split_data(self, data: pd.DataFrame) -> tuple:
        try:
            X = data["content"].tolist()
            y = data[self.config.target].tolist()

            X_train, X_test, y_train, y_test = train_test_split(
                                                                X, y,
                                                                test_size=self.config.test_size,
                                                                random_state=self.config.random_state,
                                                                stratify=y
                                                            )
            print("Split data completed!")
            return X_train, X_test, y_train, y_test
        except Exception as e:
            print(f"Error: {e}")
            raise e

    def save_data(self, X_train, X_test, y_train, y_test) -> None:
        try:
            train_data = pd.DataFrame({ "content": X_train, self.config.target: y_train })
            test_data = pd.DataFrame({ "content": X_test, self.config.target: y_test })

            train_data.to_csv(self.config.train_path, index=False)
            test_data.to_csv(self.config.test_path, index=False)

            print(f"Save data completed! \nTrain path: {self.config.train_path} \nTest path: {self.config.test_path}")
        except Exception as e:
            print(f"Error: {e}")
            raise e

        # process data
    def process_data(self, data: pd.DataFrame) -> None:
        try:
            # drop cols
            data = self.drop_cols_not_use(data)

            # check null
            print(f"Check Null in data : {data.isnull().sum()}")
            data = data.dropna()

            # check duplicated
            print(f"Check Duplicated in data : {data.duplicated().sum()}")
            data = data.drop_duplicates()

            # clean text
            data["content"] = data["content"].apply(self.clean_text)
            print("Process data completed!")

            X_train, X_test, y_train, y_test = self.split_data(data)
            self.save_data(X_train, X_test, y_train, y_test)


        except Exception as e:
            print(f"Error: {e}")
            raise e


In [8]:
try:
    config_manager = ConfigureManager()
    processed_data_config = config_manager.get_processed_data_config()

    processed_data = ProcessedData(config=processed_data_config)
    data = processed_data.load_data()
    processed_data.process_data(data)
except Exception as e:
    print(f"Error in stage_01_processed_data: {e}")

[2026-02-02 01:04:34,048: INFO: common: yaml file: configs\config.yaml loaded successfully]
[2026-02-02 01:04:34,054: INFO: common: yaml file: params.yaml loaded successfully]
[2026-02-02 01:04:34,056: INFO: common: created directory at: artifacts]
[2026-02-02 01:04:34,059: INFO: common: created directory at: artifacts/data_process]
Load data completed!
Drop columns not use in training !
Check Null in data : rating         0
content    18010
dtype: int64
Check Duplicated in data : 1768
Process data completed!
Split data completed!
Save data completed! 
Train path: artifacts\data_process\train.csv 
Test path: artifacts\data_process\test.csv
