In [1]:
import os
import pandas as pd
from pathlib import Path
from urllib import request
from dataclasses import dataclass

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
os.chdir("../")
%pwd

'c:\\Users\\roshi\\OneDrive\\Desktop\\Git-1\\Text_Classification_ML_NLP'

In [3]:
@dataclass(frozen=True)
class DataCleaningConfig:
    """Immutable set of locations used by the data-cleaning stage.

    The annotations are informational only; a dataclass does not convert or
    validate the values it is given.
    """

    # Working directory of the stage.
    root_dir: Path
    # Input location handed to the CSV reader.
    data_dir: Path
    # Directory that the cleaned CSV is written into.
    new_data_dir: Path

In [4]:
from src.text_Classification_ML.utils.common import *
from src.text_Classification_ML.constants import *

In [5]:
class ConfigurationManager:
    """Load the project configuration and hand out per-stage config objects.

    `read_yaml` and `create_directories` come from the project's `utils.common`
    module (star-imported above); their exact behaviour is not visible here.
    NOTE(review): presumably `read_yaml` returns an attribute-accessible mapping
    and `create_directories` makes each listed directory -- confirm.
    """

    def __init__(self, config_file_path=Config_File_Path, schema_file_path=Schema_File_Path):
        """Read the config and schema YAML files and prepare the artifacts root.

        Args:
            config_file_path: Location of the main configuration file.
            schema_file_path: Location of the schema file.
        """
        self.config = read_yaml(config_file_path)
        self.schema = read_yaml(schema_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_cleaning_config(self) -> DataCleaningConfig:
        """Build the `DataCleaningConfig` from the `data_cleaning` config section.

        The stage's `root_dir` is passed to `create_directories` before the
        config object is returned. Values are forwarded exactly as read from the
        YAML; no conversion to `Path` happens here.
        """
        section = self.config.data_cleaning
        create_directories([section.root_dir])

        return DataCleaningConfig(
            root_dir=section.root_dir,
            data_dir=section.data_dir,
            new_data_dir=section.new_data_dir,
        )

In [6]:
import emoji
import contractions
import re
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at module level; it is reused
# by DataCleaning.word_lemitization to obtain the lemma of every token.
nlp = spacy.load("en_core_web_sm")

In [7]:
class DataCleaning:
    """Clean the review text of a CSV dataset and save the result.

    Intended call order::

        clean_text() -> drop_columns_and_duplicates() -> word_lemitization() -> save_to_csv()

    `clean_text` must run first: it reads the `Review` column (removed by
    `drop_columns_and_duplicates`) and creates `Cleaned_Review`, which
    `word_lemitization` rewrites.
    """

    def __init__(self, config: "DataCleaningConfig"):
        """Load the stop-word list and read the input CSV.

        Args:
            config: Provides `data_dir` (path handed to `pd.read_csv`) and
                `new_data_dir` (output directory used by `save_to_csv`).
        """
        self.config = config
        # A set gives O(1) membership tests in _remove_stop_words.
        self.stop_words = set(stopwords.words('english'))
        self.df = pd.read_csv(self.config.data_dir)

    def clean_text(self):
        """Add a `Cleaned_Review` column derived from `Review`; return the frame."""
        self.df['Cleaned_Review'] = self.df['Review'].apply(self._clean_single_text)
        return self.df

    def _clean_single_text(self, text):
        """Return the cleaned form of one review.

        Missing values (None / NaN, which `read_csv` produces for empty cells)
        yield an empty string instead of raising inside the text libraries.
        Other non-string values are converted with `str()` first.
        """
        if not isinstance(text, str):
            if text is None or pd.isna(text):
                return ""
            text = str(text)

        text = emoji.demojize(text)  # emojis -> ":name:" text
        text = contractions.fix(text)  # expand contractions
        text = self._strip_noise(text)
        return self._remove_stop_words(text)

    def _strip_noise(self, text):
        """Reduce `text` to lower-case ASCII words separated by single spaces."""
        # Fold accents BEFORE anything removes non-ASCII characters; doing it
        # afterwards (as the previous step order did) turns "café" into "caf".
        # The result is pure ASCII, so no separate non-ASCII strip is needed.
        text = self._remove_accented_chars(text)
        text = re.sub(r"\d+", "number", text)  # replace numbers with "number"
        text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
        text = text.replace('_', ' ')  # underscores (e.g. from emoji names) -> space
        # Collapse tabs/newlines/runs of blanks into one space so the allow-list
        # below does not delete them and glue neighbouring words together.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^A-Z a-z 0-9-]+', '', text)  # final allow-list guard
        return text.strip().lower()

    def _remove_accented_chars(self, text):
        """Decompose characters (NFKD) and drop whatever is not ASCII."""
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return text

    def _remove_stop_words(self, text):
        """Tokenize `text` and re-join the tokens that are not stop words."""
        text_tokens = word_tokenize(text)
        filtered_tokens = [word for word in text_tokens if word.lower() not in self.stop_words]
        return ' '.join(filtered_tokens)

    def drop_columns_and_duplicates(self):
        """Drop the columns no longer needed, then drop duplicate rows.

        Both operations are done in place so that references to `self.df`
        obtained earlier keep pointing at the current data. The index is
        renumbered after de-duplication.
        """
        self.df.drop(columns=['Time_submitted', 'Total_thumbsup', 'Reply', 'Review'], inplace=True)
        # The method name promises de-duplication; previously only the columns
        # were dropped and identical rows were kept.
        self.df.drop_duplicates(inplace=True, ignore_index=True)
        return self.df

    def word_lemitization(self):
        """Replace every word of `Cleaned_Review` with its spaCy lemma."""
        texts = self.df['Cleaned_Review'].tolist()
        # nlp.pipe processes the texts in batches, which is much faster than
        # calling nlp() once per row; documents come back in input order.
        self.df['Cleaned_Review'] = [
            ' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)
        ]
        return self.df

    def save_to_csv(self, filename= "data_spotify_new.csv"):
        """Write the frame (without index) to `<new_data_dir>/<filename>`.

        The target directory is created when it does not exist yet.
        """
        filepath = os.path.join(self.config.new_data_dir, filename)
        target_dir = os.path.dirname(filepath)
        if target_dir:  # empty when writing into the current directory
            os.makedirs(target_dir, exist_ok=True)
        self.df.to_csv(filepath, index=False)

In [8]:
# Run the data-cleaning stage end to end.
# Order matters: clean_text() reads the `Review` column that
# drop_columns_and_duplicates() removes, and it creates `Cleaned_Review`,
# which word_lemitization() then rewrites before the frame is saved.
# (The former `try/except Exception as e: raise e` wrapper only re-raised the
# same exception, so errors propagate exactly as before without it.)
config = ConfigurationManager()
data_cleaning_config = config.get_data_cleaning_config()
data_cleaning = DataCleaning(config=data_cleaning_config)
data_cleaning.clean_text()
data_cleaning.drop_columns_and_duplicates()
data_cleaning.word_lemitization()
data_cleaning.save_to_csv()