In [1]:
import os
import pandas as pd
from pathlib import Path
from dataclasses import dataclass

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
os.chdir("../")
%pwd

'c:\\Users\\roshi\\OneDrive\\Desktop\\Git-1\\Text_Classification_ML_NLP'

In [3]:
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_dir: Path

In [4]:
from src.text_Classification_ML.utils.common import *
from src.text_Classification_ML.constants import *

In [5]:
class ConfigurationManager:
    def __init__(
            self,
            config_file_path = Config_File_Path,
            schema_file_path = Schema_File_Path
            ):
        self.config = read_yaml(config_file_path)
        self.schema = read_yaml(schema_file_path)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
                root_dir= config.root_dir,
                data_dir= config.data_dir
            )

        return data_transformation_config

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from src.text_Classification_ML.logger import logging

In [7]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def train_test_split(self):
        data = pd.read_csv(self.config.data_dir)

        # Fill the missing values
        data['Cleaned_Review'].fillna("", inplace=True)

        # Re-Maping the rating volumn
        rating_map = {1: 1, 2: 1, 3: 2, 4: 3, 5: 3}
        data['Rating'] = data['Rating'].map(rating_map)

        # Split the data into train and test sets.
        train, test = train_test_split(data, train_size=0.80)

        # Vectorize text using TF-IDF
        tfidf_vectorizer = TfidfVectorizer()
        train_tfidf = tfidf_vectorizer.fit_transform(train['Cleaned_Review'])
        test_tfidf = tfidf_vectorizer.transform(test['Cleaned_Review'])

        # Save the TF-IDF matrices
        train_tfidf_filename = os.path.join(self.config.root_dir, "train_tfidf.csv")
        test_tfidf_filename = os.path.join(self.config.root_dir, "test_tfidf.csv")
        scipy.sparse.save_npz(train_tfidf_filename, train_tfidf)
        scipy.sparse.save_npz(test_tfidf_filename, test_tfidf)

        # Save the train and test sets without text columns
        train = train.drop(columns=['Cleaned_Review'])
        test = test.drop(columns=['Cleaned_Review'])
        train_filename = os.path.join(self.config.root_dir, "train.csv")
        test_filename = os.path.join(self.config.root_dir, "test.csv")
        train.to_csv(train_filename, index=False)
        test.to_csv(test_filename, index=False)

In [8]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config= data_transformation_config)
    data_transformation.train_test_split()
except Exception as e:
    raise e

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Cleaned_Review'].fillna("", inplace=True)
