In [1]:
import os

In [2]:
%pwd

'/home/lazada-id-reviews/notebooks'

In [3]:
# Run everything from the project root so that relative paths such as
# `config/config.yaml` and `artifacts/` resolve correctly.
# The guard makes this cell idempotent: the notebook starts in
# `<project>/notebooks`, and an unguarded `chdir("../")` would climb one more
# directory every time the cell is re-run.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'/home/lazada-id-reviews'

### Data Preprocessing Config

This code will be applied in `src/LazadaReviews/entity/config_entity.py`.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings for the step that splits the reviews and dumps them.

    Built by ``ConfigurationManager.get_dump_data_config`` and consumed by
    ``DumpData``.
    """
    # Output directory of the dump step; created by the configuration manager.
    root_dir: Path
    # CSV file holding the raw reviews (taken from the `ingest_from_sql` config section).
    reviews_path: Path
    # Pickle destination for the training review texts (X_train).
    input_train_path: Path
    # Pickle destination for the test review texts (X_test).
    input_test_path: Path
    # Pickle destination for the training ratings (y_train).
    output_train_path: Path
    # Pickle destination for the test ratings (y_test).
    output_test_path: Path
    # Forwarded as `test_size` to `train_test_split`.
    params_test_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the TF-IDF vectorization step.

    Built by ``ConfigurationManager.get_preprocessing_data_config`` and
    consumed by ``Preprocessing``.
    """
    # Output directory of the vectorization step; created by the configuration manager.
    root_dir: Path
    # Pickled training review texts to load (X_train).
    input_train_path: Path
    # Pickled test review texts to load (X_test).
    input_test_path: Path
    # Destination for the vectorized training matrix.
    vectorized_train_path: Path
    # Destination for the vectorized test matrix.
    vectorized_test_path: Path
    # Directory created before the vectorizer is saved (taken from the `train_model` config section).
    model_dir: Path
    # Destination for the fitted vectorizer object.
    vectorizer_model_path: Path

### Data Preprocessing Config Manager

This code will be applied in `src/LazadaReviews/config/configurations.py`.

What will we do?
+ Drop null values
+ Split the dataset into train and test data
+ Vectorize the text using `TF-IDF`

As stated above, let's load the dataset, select the columns we need, and drop the null values.

In [6]:
from LazadaReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LazadaReviews.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project YAML files and build typed config entities from them.

    Args:
        config_filepath: location of the main configuration YAML.
        params_filepath: location of the parameters YAML.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root has to exist before any step creates its own
        # sub-directory underneath it.
        create_directories([self.config.artifacts_root])

    def get_dump_data_config(self) -> DataDumpConfig:
        """Build the config entity for the split-and-dump step.

        Reads the `ingest_from_sql` and `dump_data` sections of the config
        file plus `TEST_SIZE` from the params file, and creates the dump
        step's root directory.

        Returns:
            DataDumpConfig: every path field is wrapped in ``Path`` so the
            values match the dataclass annotations.
        """
        ingest_section = self.config.ingest_from_sql
        dump_section = self.config.dump_data

        create_directories([dump_section.root_dir])

        return DataDumpConfig(
            root_dir=Path(dump_section.root_dir),
            reviews_path=Path(ingest_section.reviews_path),
            input_train_path=Path(dump_section.input_train_path),
            input_test_path=Path(dump_section.input_test_path),
            output_train_path=Path(dump_section.output_train_path),
            output_test_path=Path(dump_section.output_test_path),
            params_test_size=self.params.TEST_SIZE,
        )

    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Build the config entity for the vectorization step.

        Reads the `dump_data`, `vectorize_data` and `train_model` sections of
        the config file and creates the vectorization step's root directory.

        Returns:
            DataPreprocessingConfig: every path field is wrapped in ``Path``
            so the values match the dataclass annotations.
        """
        dump_section = self.config.dump_data
        vectorize_section = self.config.vectorize_data
        train_section = self.config.train_model

        create_directories([vectorize_section.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(vectorize_section.root_dir),
            input_train_path=Path(dump_section.input_train_path),
            input_test_path=Path(dump_section.input_test_path),
            vectorized_train_path=Path(vectorize_section.vectorized_train_path),
            vectorized_test_path=Path(vectorize_section.vectorized_test_path),
            model_dir=Path(train_section.root_dir),
            vectorizer_model_path=Path(vectorize_section.vectorizer_model_path),
        )

### Perform Preprocessing

This code lives in `src/LazadaReviews/components/preprocessing.py`.

In [8]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from LazadaReviews import logger

class DumpData:
    """Split the raw reviews into train/test parts and pickle each part."""

    def __init__(self, config: DataDumpConfig):
        self.config = config

    def dump_data(self, random_state=None) -> None:
        """Split the reviews dataset and dump the four resulting series.

        Reads the reviews CSV, keeps the ``rating`` and ``reviewContent``
        columns, drops rows with a null in either column, splits them with
        ``train_test_split`` and pickles the review texts (X) and the
        ratings (y) to the paths given in the config.

        Args:
            random_state: optional seed forwarded to ``train_test_split``.
                The default ``None`` keeps the split unseeded, as before;
                pass an int to make the split reproducible across runs.
        """
        logger.info("Read reviews file.")
        dataset_reviews = pd.read_csv(self.config.reviews_path)
        # dropna() returns a new frame, so no defensive copy or in-place
        # mutation is needed.
        dataset = dataset_reviews[["rating", "reviewContent"]].dropna()

        logger.info("Split reviews file to data train and test.")
        X_train, X_test, y_train, y_test = train_test_split(
            dataset["reviewContent"],
            dataset["rating"],
            test_size=self.config.params_test_size,
            random_state=random_state,
        )

        # Each log line names exactly the files written right after it.
        logger.info(
            f"Dump review texts (X) train/test into {self.config.input_train_path} "
            f"and {self.config.input_test_path}."
        )
        X_train.to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)

        logger.info(
            f"Dump ratings (y) train/test into {self.config.output_train_path} "
            f"and {self.config.output_test_path}."
        )
        y_train.to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)

class Preprocessing:
    """Fit a TF-IDF vectorizer on the training reviews and persist the results."""

    def __init__(self, config: DataPreprocessingConfig):
        self.config = config

    @staticmethod
    def _ensure_parent_dir(path) -> None:
        """Create the directory that will contain `path` if it is missing."""
        parent = os.path.dirname(str(path))
        # An empty parent means "current directory", which always exists.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def vectorize_data(self) -> None:
        """Vectorize the dumped train/test texts and save the vectorizer.

        Loads the pickled train and test texts, fits a ``TfidfVectorizer`` on
        the training texts only, transforms both sets, and dumps the two
        matrices and the fitted vectorizer with ``joblib`` to the paths given
        in the config. Target directories are created when missing.
        """
        vectorizer = TfidfVectorizer()

        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)

        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = joblib.load(self.config.input_test_path)

        logger.info("Vectorize the data.")
        # Fit on the training texts only; the test texts are transformed with
        # the already-fitted vectorizer so both matrices share the same columns.
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        logger.info("Dump the vectorized data.")
        self._ensure_parent_dir(self.config.vectorized_train_path)
        self._ensure_parent_dir(self.config.vectorized_test_path)
        joblib.dump(X_train_vec, self.config.vectorized_train_path)
        joblib.dump(X_test_vec, self.config.vectorized_test_path)

        logger.info(f"Creating {self.config.model_dir} directory.")
        model_dir = str(self.config.model_dir)
        os.makedirs(model_dir, exist_ok=True)

        logger.info("Save the vectorizer model.")
        # The vectorizer path is configured independently of model_dir, so
        # make sure its own parent exists before writing.
        self._ensure_parent_dir(self.config.vectorizer_model_path)
        joblib.dump(vectorizer, self.config.vectorizer_model_path)

### Dump the Data Train and Data Test

This code lives in `src/LazadaReviews/pipeline/step_02_preprocessing.py`.

In [9]:
# Build the dump-step config and run the split-and-dump component.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = DumpData(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    # Record the traceback in the log, not just the message
    # (assumes `logger` is a standard `logging.Logger` — TODO confirm).
    logger.exception(e)
    # A bare `raise` re-raises the active exception unchanged.
    raise

[2024-07-13 17:31:55,457: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-13 17:31:55,470: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-13 17:31:55,474: INFO: common: created directory at: artifacts]
[2024-07-13 17:31:55,476: INFO: common: created directory at: artifacts/data]
[2024-07-13 17:31:55,478: INFO: 3644560127: Read reviews file.]
[2024-07-13 17:31:56,341: INFO: 3644560127: Split reviews file to data train and test.]
[2024-07-13 17:31:56,363: INFO: 3644560127: Dump data train into artifacts/data/X_train.pkl directory.]
[2024-07-13 17:31:56,464: INFO: 3644560127: Dump data test into artifacts/data/X_test.pkl directory.]


**Debug**: Read data

In [10]:
# Reload the training review texts that DumpData pickled, to confirm the dump round-trips.
X_train = joblib.load(dump_data_config.input_train_path)
X_train

149861    Barang sudah diterima dengan baik di kalbar, t...
20792     Kemarin bayar hari ini terima barangnya....sen...
132477    Pengiriman ny sngt cpat tgl 20 pesan tgl 21 ja...
178825    Barang nya sudah sampai...lumayan sih.. Thanks...
118600                 Pengiriman cepat, Packing Aman, Rapi
                                ...                        
77806                          barang bagus dan kualitas ok
47629          not bad ,, rentan virus tpi sesuai harga lah
78578     alhamdulillah barang sampe meskipun terlambat ...
104960    laptopnya bagus cepet datengnya packing nya ju...
4565        sesuai pesanan next bakal order lagi disini ..😍
Name: reviewContent, Length: 21405, dtype: object

In [11]:
# Sanity check: nulls are dropped before the split, so this should be 0.
X_train.isnull().sum()

0

In [12]:
# Reload the training ratings; the index should line up with X_train.
y_train = joblib.load(dump_data_config.output_train_path)
y_train

149861    5
20792     5
132477    5
178825    4
118600    5
         ..
77806     5
47629     5
78578     4
104960    5
4565      5
Name: rating, Length: 21405, dtype: int64

In [13]:
# Reload the test review texts that DumpData pickled.
X_test = joblib.load(dump_data_config.input_test_path)
X_test

84996     laptop aman..sempurna..joss,,,pekingan kayu......
61289                                         Zesuai gambaR
48653                               pengirimanx lama banget
23608                               sesuai dentan deskripsi
133641        Alhamdulillah pesanan yg kedua original lagi.
                                ...                        
200630    barang sudah keterima tapi setelah 2 hari remo...
193741            sesuai dengan gambar, pengiriman ok cepat
129356    Baru pesen hari selasa pagi eh rabu sore udang...
29520     Barang sesuai, Packing Rapih. Tingkatkan lagi ...
77558     Bagus. Pelayanannya Ramah memuaskan bang nya s...
Name: reviewContent, Length: 85624, dtype: object

In [14]:
# Sanity check: nulls are dropped before the split, so this should be 0.
X_test.isnull().sum()

0

In [15]:
# Reload the test ratings; the index should line up with X_test.
y_test = joblib.load(dump_data_config.output_test_path)
y_test

84996     5
61289     5
48653     3
23608     5
133641    5
         ..
200630    5
193741    5
129356    4
29520     5
77558     5
Name: rating, Length: 85624, dtype: int64

### Vectorize the Data Train and Data Test

This code lives in `src/LazadaReviews/pipeline/step_02_preprocessing.py`.

In [16]:
# Build the vectorization config and run the TF-IDF preprocessing component.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.vectorize_data()
except Exception as e:
    # Record the traceback in the log, not just the message
    # (assumes `logger` is a standard `logging.Logger` — TODO confirm).
    logger.exception(e)
    # A bare `raise` re-raises the active exception unchanged.
    raise

[2024-07-13 17:31:56,820: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-13 17:31:56,828: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-13 17:31:56,830: INFO: common: created directory at: artifacts]
[2024-07-13 17:31:56,832: INFO: common: created directory at: artifacts/preprocessing]
[2024-07-13 17:31:56,835: INFO: 3644560127: Load data train in artifacts/data/X_train.pkl.]
[2024-07-13 17:31:56,876: INFO: 3644560127: Load data test in artifacts/data/X_test.pkl.]
[2024-07-13 17:31:57,056: INFO: 3644560127: Vectorize the data.]
[2024-07-13 17:31:58,171: INFO: 3644560127: Dump the vectorized data.]
[2024-07-13 17:31:58,227: INFO: 3644560127: Creating artifacts/models directory.]
[2024-07-13 17:31:58,228: INFO: 3644560127: Save the vectorizer model.]


**Debug**: Read data

In [17]:
# Reload the vectorized training matrix that Preprocessing dumped.
X_train_vec = joblib.load(preprocessing_config.vectorized_train_path)
X_train_vec

<21405x13849 sparse matrix of type '<class 'numpy.float64'>'
	with 252757 stored elements in Compressed Sparse Row format>

In [18]:
# Reload the vectorized test matrix; it was produced by the same fitted
# vectorizer, so its column count should match X_train_vec.
X_test_vec = joblib.load(preprocessing_config.vectorized_test_path)
X_test_vec

<85624x13849 sparse matrix of type '<class 'numpy.float64'>'
	with 983950 stored elements in Compressed Sparse Row format>