In [38]:
import re
import string
import pandas as pd
from typing import Iterable
from nltk.corpus import stopwords
# from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

import spacy
from nltk.stem import WordNetLemmatizer


class Tokenizer:
    """Thin wrapper around NLTK's ``RegexpTokenizer`` for word-level splitting.

    A token is a maximal run of word characters and apostrophes, so a
    contraction such as ``isn't`` stays in one piece while whitespace and all
    other punctuation/symbols act as separators and are dropped.
    """

    def __init__(self) -> None:
        # Raw string on purpose: in a plain literal "\w" is an invalid escape
        # sequence, which Python reports with a Deprecation/SyntaxWarning.
        self.tokenizer = RegexpTokenizer(r"[\w']+")

    def word_tokenizer(self, text) -> Iterable:
        """Split ``text`` into tokens.

        Args:
            text: The string to tokenize.

        Returns:
            The tokens produced by ``self.tokenizer.tokenize(text)``, in the
            order they occur in ``text``.
        """
        return self.tokenizer.tokenize(text)


class TextCleaner(Tokenizer):
    """Normalise raw text before feature extraction.

    Calling an instance lowercases the text, trims it, removes HTML tags, URLs,
    punctuation (apostrophes excepted) and emoji, and finally expands acronyms
    and contractions using two look-up tables.

    The look-up tables are read from remote JSON files when the object is
    constructed, so instantiation needs network access.
    """

    # Anything that looks like an HTML/XML tag (non-greedy, single line).
    _HTML_TAG_RE = re.compile(r"<.*?>")

    # http:// or https:// links, plus bare "www." links.
    _URL_RE = re.compile(r"https?://\S+|www\.\S+")

    # NOTE(review): the last range (U+24C2..U+1F251) is very broad and also
    # covers many non-emoji code points (e.g. CJK ideographs). It is kept as-is
    # to preserve existing behaviour; narrow it if non-Latin text matters.
    _EMOJI_RE = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )

    # Deletes every ASCII punctuation character except the apostrophe, which
    # must survive so that contractions can still be recognised later.
    _PUNCT_TABLE = str.maketrans("", "", string.punctuation.replace("'", ""))

    def __init__(self) -> None:
        super().__init__()

        # JSON file holding the acronym -> expansion table.
        self._acronyms_url = "https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json"

        # JSON file holding the contraction -> expansion table.
        self._contractions_url = "https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json"

        # Acronym table (pandas Series indexed by acronym) and its keys.
        self._acronyms_dict = self.load_acronym()
        self._acronym_list = list(self._acronyms_dict.keys())

        # Contraction table (pandas Series indexed by contraction) and its keys.
        self._contractions_dict = self.load_contractions()
        self._contractions_list = list(self._contractions_dict.keys())

    def load_acronym(self):
        """Read the acronym table from ``self._acronyms_url`` as a pandas Series."""
        return pd.read_json(self._acronyms_url, typ="series")

    def load_contractions(self):
        """Read the contraction table from ``self._contractions_url`` as a pandas Series."""
        return pd.read_json(self._contractions_url, typ="series")

    def convert_to_lowercase(self, text):
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def remove_whitespace(self, text):
        """Return ``text`` without leading and trailing whitespace."""
        return text.strip()

    def remove_punctuation(self, text):
        """Delete all ASCII punctuation from ``text`` except apostrophes.

        Characters are removed, not replaced by a space, so ``"a,b"`` becomes
        ``"ab"``.
        """
        return text.translate(self._PUNCT_TABLE)

    def remove_html(self, text):
        """Remove every ``<...>`` tag from ``text``; the tag content is kept."""
        return self._HTML_TAG_RE.sub("", text)

    def remove_http(self, text):
        """Remove ``http://``, ``https://`` and ``www.`` links from ``text``."""
        return self._URL_RE.sub("", text)

    def remove_emoji(self, text):
        """Remove every character that falls into the emoji ranges of ``_EMOJI_RE``."""
        return self._EMOJI_RE.sub("", text)

    def _expand_tokens(self, text, keys, mapping):
        """Tokenize ``text`` and replace each token found in ``keys`` by ``mapping[token]``.

        The expansion may consist of several words. The result is the tokens
        re-joined with single spaces.
        """
        # A set gives constant-time membership instead of scanning the list
        # of keys once per token.
        known = set(keys)
        words = []
        for word in self.word_tokenizer(text):
            if word in known:
                words.extend(mapping[word].split())
            else:
                # Tokens match [\w']+ and therefore never contain whitespace.
                words.append(word)
        return " ".join(words)

    def convert_acronyms(self, text):
        """Expand known acronyms in ``text`` (e.g. ``btw`` -> ``by the way``).

        Matching is exact and case-sensitive against the table keys. The text
        is tokenized, so characters outside ``[\\w']`` are dropped and
        whitespace is normalised to single spaces.
        """
        return self._expand_tokens(text, self._acronym_list, self._acronyms_dict)

    def convert_contractions(self, text):
        """Expand known contractions in ``text`` (e.g. ``isn't`` -> ``is not``).

        Matching is exact and case-sensitive against the table keys. The text
        is tokenized, so characters outside ``[\\w']`` are dropped and
        whitespace is normalised to single spaces.
        """
        return self._expand_tokens(
            text, self._contractions_list, self._contractions_dict
        )

    def __call__(self, text):
        """Run the full cleaning pipeline on ``text`` and return the cleaned string.

        HTML tags and URLs are removed *before* punctuation: stripping
        punctuation first would delete ``<``, ``>``, ``:``, ``/`` and ``.``,
        after which neither the tag nor the URL pattern could match and their
        fragments would be left in the text.
        """
        text = self.convert_to_lowercase(text=text)
        text = self.remove_whitespace(text=text)
        text = self.remove_html(text=text)
        text = self.remove_http(text=text)
        text = self.remove_punctuation(text=text)
        text = self.remove_emoji(text=text)
        text = self.convert_acronyms(text=text)
        text = self.convert_contractions(text=text)

        return text


# Function-word lists used to build extra stop words.
# Entries (including the repeated ones) are kept exactly as originally listed.

# Common single-word prepositions.
prepositions = (
    "about above across after against among around at before behind below "
    "beside between by down during for from in inside into near of off on "
    "out over through to toward under up with"
).split()

# Less frequent single-word prepositions.
prepositions_less_common = (
    "aboard along amid as beneath beyond but concerning considering despite "
    "except following like minus onto outside per plus regarding round since "
    "than till underneath unlike until upon versus via within without"
).split()

# Coordinating conjunctions ("and" appears twice in the source list).
coordinating_conjunctions = "and but for nor or so and yet".split()

# Words making up correlative pairs such as both/and, either/or, neither/nor.
correlative_conjunctions = (
    "both and either or neither nor not only but whether or"
).split()

# Subordinating conjunctions; several entries are multi-word phrases, hence
# the "|" separator instead of whitespace.
subordinating_conjunctions = (
    "after|although|as|as if|as long as|as much as|as soon as|as though|"
    "because|before|by the time|even if|even though|if|in order that|"
    "in case|in the event that|lest|now that|once|only|only if|"
    "provided that|since|so|supposing|that|than|though|till|unless|until|"
    "when|whenever|where|whereas|wherever|whether or not|while"
).split("|")


class TextPreprocess(Tokenizer):
    """Linguistic preprocessing applied after cleaning.

    Calling an instance removes NLTK stop words, applies Porter stemming, then
    spaCy lemmatisation, and finally removes an additional list of prepositions
    and conjunctions. Every step works on the tokens of the input and returns
    them re-joined with single spaces.
    """

    def __init__(self) -> None:
        super().__init__()

        # NLTK's English stop words plus a few extra words.
        self.stopwords = stopwords.words("english") + [
            "among",
            "onto",
            "shall",
            "thrice",
            "thus",
            "twice",
            "unto",
            "us",
            "would",
        ]

        # Optional spell checker. No checker is configured here; assign an
        # object exposing ``unknown(words)`` and ``correction(word)`` to
        # enable ``pyspellchecker``.
        self.spell = None

        # Porter stemmer instance.
        self.stemmer = PorterStemmer()

        # spaCy pipeline used for lemmatisation; the parser and NER components
        # are disabled when loading.
        self.spacy_lemmatizer = spacy.load("en_core_web_sm", disable=["parser", "ner"])

        # Extra stop words: prepositions and coordinating/correlative
        # conjunctions (``subordinating_conjunctions`` is not included).
        # The attribute name keeps its original spelling for compatibility.
        self.additioanal_stop_words = (
            prepositions
            + prepositions_less_common
            + coordinating_conjunctions
            + correlative_conjunctions
        )

    def remove_stopwords(self, text):
        """Return ``text`` without the tokens listed in ``self.stopwords``.

        Matching is exact and case-sensitive.
        """
        # Built per call so later changes to ``self.stopwords`` are honoured,
        # while each token still gets a constant-time lookup.
        blocked = set(self.stopwords)
        return " ".join(
            word for word in self.word_tokenizer(text) if word not in blocked
        )

    def pyspellchecker(self, text):
        """Replace misspelled tokens of ``text`` with the checker's suggestion.

        A token is replaced only when ``self.spell.unknown`` reports it and
        ``self.spell.correction`` returns a value other than ``None``.
        When no spell checker is configured (``self.spell is None``) the
        tokens are returned unchanged instead of raising ``AttributeError``.
        """
        word_list = self.word_tokenizer(text)
        if self.spell is None:
            return " ".join(word_list)

        # Ask the checker once for the whole text rather than once per token.
        unknown_words = self.spell.unknown(word_list)

        word_list_corrected = []
        for word in word_list:
            if word in unknown_words:
                suggestion = self.spell.correction(word)
                # Keep the original token when the checker has no suggestion.
                word_list_corrected.append(word if suggestion is None else suggestion)
            else:
                word_list_corrected.append(word)
        return " ".join(word_list_corrected)

    def porter_stemmer(self, text):
        """Return ``text`` with every token replaced by its Porter stem."""
        return " ".join(self.stemmer.stem(word) for word in self.word_tokenizer(text))

    def lemmatizer(self, text):
        """Return ``text`` with every spaCy token replaced by its lemma.

        Tokenisation here is done by the spaCy pipeline, not by
        ``word_tokenizer``.
        """
        return " ".join(token.lemma_ for token in self.spacy_lemmatizer(text))

    def remove_additional_stopwords(self, text):
        """Return ``text`` without the tokens listed in ``self.additioanal_stop_words``.

        Matching is exact and case-sensitive.
        """
        blocked = set(self.additioanal_stop_words)
        return " ".join(
            word for word in self.word_tokenizer(text) if word not in blocked
        )

    def __call__(self, text):
        """Run the preprocessing pipeline on ``text`` and return the result.

        Order: stop-word removal, Porter stemming, spaCy lemmatisation,
        additional stop-word removal. Spell checking is not part of the
        pipeline.
        """
        # NOTE(review): stemming runs before both lemmatisation and the
        # additional stop-word filter, so those steps see stems rather than
        # the original words. Confirm this order is intended.
        text = self.remove_stopwords(text=text)
        text = self.porter_stemmer(text=text)
        text = self.lemmatizer(text=text)
        text = self.remove_additional_stopwords(text=text)

        return text




  self.tokenizer = RegexpTokenizer("[\w']+")
  http = "https?://\S+|www\.\S+"  # matching strings beginning with http (but not just "http")


In [39]:
# Build the cleaner used by the demo cells below; the constructor reads the
# acronym and contraction tables from their remote JSON files.
txt = TextCleaner()

IndentationError: unexpected indent (3723369933.py, line 2)

In [56]:
# Running example: contains an acronym, contractions, HTML tags, an emoji,
# punctuation and a bare "www." link.
input_sentence = "BTW the weather isn't ideal in <b>Toronto</b> 😊. isn't it?. Read this news for more info www.google.com/search?q=weather+in+toronto"

In [57]:

# Demo: lowercase the example sentence and show it before and after.
print(f"Input : {input_sentence}")
input_sentence = txt.convert_to_lowercase(input_sentence)
print(f"Output : {input_sentence}")


Input : BTW the weather isn't ideal in <b>Toronto</b> 😊. isn't it?. Read this news for more info www.google.com/search?q=weather+in+toronto
Output : btw the weather isn't ideal in <b>toronto</b> 😊. isn't it?. read this news for more info www.google.com/search?q=weather+in+toronto


In [58]:

# Demo: strip leading/trailing whitespace from the running example.
print(f"Input : {input_sentence}")
input_sentence = txt.remove_whitespace(input_sentence)
print(f"Output : {input_sentence}")

Input : btw the weather isn't ideal in <b>toronto</b> 😊. isn't it?. read this news for more info www.google.com/search?q=weather+in+toronto
Output : btw the weather isn't ideal in <b>toronto</b> 😊. isn't it?. read this news for more info www.google.com/search?q=weather+in+toronto


In [59]:

# Demo: remove HTML tags from the running example.
print(f"Input : {input_sentence}")
input_sentence = txt.remove_html(input_sentence)
print(f"Output : {input_sentence}")

Input : btw the weather isn't ideal in <b>toronto</b> 😊. isn't it?. read this news for more info www.google.com/search?q=weather+in+toronto
Output : btw the weather isn't ideal in toronto 😊. isn't it?. read this news for more info www.google.com/search?q=weather+in+toronto


In [60]:

# Demo: remove URLs from the running example (done before punctuation removal).
print(f"Input : {input_sentence}")
input_sentence = txt.remove_http(input_sentence)
print(f"Output : {input_sentence}")

Input : btw the weather isn't ideal in toronto 😊. isn't it?. read this news for more info www.google.com/search?q=weather+in+toronto
Output : btw the weather isn't ideal in toronto 😊. isn't it?. read this news for more info 


In [61]:

# Demo: remove punctuation (apostrophes are kept) from the running example.
print(f"Input : {input_sentence}")
input_sentence = txt.remove_punctuation(input_sentence)
print(f"Output : {input_sentence}")

Input : btw the weather isn't ideal in toronto 😊. isn't it?. read this news for more info 
Output : btw the weather isn't ideal in toronto 😊 isn't it read this news for more info 


In [62]:

# Demo: remove emoji from the running example.
print(f"Input : {input_sentence}")
input_sentence = txt.remove_emoji(input_sentence)
print(f"Output : {input_sentence}")

Input : btw the weather isn't ideal in toronto 😊 isn't it read this news for more info 
Output : btw the weather isn't ideal in toronto  isn't it read this news for more info 


In [63]:

# Demo: expand acronyms in the running example.
print(f"Input : {input_sentence}")
input_sentence = txt.convert_acronyms(input_sentence)
print(f"Output : {input_sentence}")

Input : btw the weather isn't ideal in toronto  isn't it read this news for more info 
Output : by the way the weather isn't ideal in toronto isn't it read this news for more info


In [64]:
# Demo: expand contractions in the running example.
print(f"Input : {input_sentence}")
input_sentence = txt.convert_contractions(input_sentence)
print(f"Output : {input_sentence}")

Input : by the way the weather isn't ideal in toronto isn't it read this news for more info
Output : by the way the weather is not ideal in toronto is not it read this news for more info


In [50]:
# Build the TextPreprocess object used by the demo cells below; the
# constructor loads the NLTK stop words and the spaCy "en_core_web_sm" model.
tp = TextPreprocess()

In [65]:
# Demo: remove stop words from the cleaned example sentence.
print(f"Input : {input_sentence}")
input_sentence = tp.remove_stopwords(input_sentence)
print(f"Output : {input_sentence}")

Input : by the way the weather is not ideal in toronto is not it read this news for more info
Output : way weather ideal toronto read news info


In [67]:
# Demo: start from a fresh example sentence (replacing the previous one) and
# apply Porter stemming to it.
input_sentence = "The temperature is more hotter than it used to be. How are you planning to deal with rising universal temperature."

print(f"Input : {input_sentence}")
input_sentence = tp.porter_stemmer(input_sentence)
print(f"Output : {input_sentence}")

Input : The temperature is more hotter than it used to be. How are you planning to deal with rising universal temperature.
Output : the temperatur is more hotter than it use to be how are you plan to deal with rise univers temperatur


In [68]:
# Demo: lemmatise the already-stemmed sentence from the previous cell.
print(f"Input : {input_sentence}")
input_sentence = tp.lemmatizer(input_sentence)
print(f"Output : {input_sentence}")

Input : the temperatur is more hotter than it use to be how are you plan to deal with rise univers temperatur
Output : the temperatur be more hot than it use to be how be you plan to deal with rise univer temperatur


In [69]:
# Demo: remove the additional preposition/conjunction stop words.
print(f"Input : {input_sentence}")
input_sentence = tp.remove_additional_stopwords(input_sentence)
print(f"Output : {input_sentence}")

Input : the temperatur be more hot than it use to be how be you plan to deal with rise univer temperatur
Output : the temperatur be more hot it use be how be you plan deal rise univer temperatur
