In [1]:
import os

In [2]:
%pwd

'd:\\Hate-Speech-Classifier\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\Hate-Speech-Classifier'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [6]:
from textClassification.constants import *
from textClassification.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config


In [8]:
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\SURESH
[nltk_data]     BEEKHANI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.stemmer = nltk.SnowballStemmer("english")
        self.stopword = set(stopwords.words('english'))

    
    # Let's apply regex and do cleaning.
    def data_cleaning(self,words):
        words = str(words).lower()
        words = re.sub('\[.*?\]', '', words)
        words = re.sub('https?://\S+|www\.\S+', '', words)
        words = re.sub('<.*?>+', '', words)
        words = re.sub('[%s]' % re.escape(string.punctuation), '', words)
        words = re.sub('\n', '', words)
        words = re.sub('\w*\d\w*', '', words)
        words = [word for word in words.split(' ') if words not in self.stopword]
        words=" ".join(words)
        words = [self.stemmer.stem(words) for word in words.split(' ')]
        words=" ".join(words)

        return words
    

    def clean_and_transform(self):
        imbalance_data = pd.read_csv(os.path.join(self.config.data_path,"imbalanced_data.csv"))
        imbalance_data.drop('id', axis=1, inplace=True)

        raw_data = pd.read_csv(os.path.join(self.config.data_path,"raw_data.csv"))
        raw_data.drop(['Unnamed: 0','count','hate_speech','offensive_language','neither'], axis=1, inplace=True)
        raw_data[raw_data['class'] == 0]["class"]=1
        raw_data["class"].replace({0:1},inplace=True)
        raw_data["class"].replace({2:0}, inplace = True)
        raw_data.rename(columns={'class':'label'},inplace =True)

        frame = [imbalance_data, raw_data]
        df = pd.concat(frame)


        df['tweet']=df['tweet'].apply(self.data_cleaning)

        df.to_csv(os.path.join(self.config.root_dir,'main_df.csv'), index=False)


In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.clean_and_transform()
except Exception as e:
    raise e

[2025-01-05 17:44:11,982: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-01-05 17:44:11,994: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-05 17:44:11,998: INFO: common: created directory at: artifacts]
[2025-01-05 17:44:11,998: INFO: common: created directory at: artifacts/data_transformation]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data[raw_data['class'] == 0]["class"]=1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data["class"].replace({0:1},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value