In [5]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch every NLTK resource the cells below rely on, not only the tokenizer
# model: stopwords.words('english') needs the 'stopwords' corpus and
# WordNetLemmatizer needs 'wordnet'. Without them a fresh environment fails
# with LookupError on Restart & Run All. nltk.download skips packages that
# are already up to date.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
              


[nltk_data] Downloading package punkt_tab to C:\Users\USER
[nltk_data]     HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
class TextPreprocessor:
    """Small NLTK-based text normalisation pipeline.

    ``preprocess`` chains the individual steps: lowercase, expand
    contractions, strip special characters, tokenize, drop English stop
    words, then either stem (default) or lemmatize. Each step is also
    available as its own method.
    """

    # Ordered (pattern, replacement) rules used by expand_contractions.
    # Irregular negations come first so the generic "n't" rule does not turn
    # "can't" into "ca not" or "won't" into "wo not". Every suffix rule must
    # follow a word character and end at a word boundary, so an opening
    # single quote such as in 'stop' or 'really' is left untouched.
    _CONTRACTION_RULES = (
        (re.compile(r"\bwon't\b", re.IGNORECASE), "will not"),
        (re.compile(r"\bcan't\b", re.IGNORECASE), "can not"),
        (re.compile(r"\bshan't\b", re.IGNORECASE), "shall not"),
        (re.compile(r"(?<=\w)n't\b", re.IGNORECASE), " not"),
        (re.compile(r"(?<=\w)'re\b", re.IGNORECASE), " are"),
        (re.compile(r"(?<=\w)'s\b", re.IGNORECASE), ""),
    )

    def __init__(self):
        """Load the English stop-word set and create the stemmer and lemmatizer."""
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def expand_contractions(self, text: str) -> str:
        """Expand "n't" and "'re" contractions and drop a trailing "'s".

        Matching is case-insensitive; the inserted expansions are lowercase.
        The typographic apostrophe (U+2019) is treated like a plain one.

        Args:
            text: Input string.

        Returns:
            The string with the handled contractions rewritten, e.g.
            "can't" -> "can not", "they're" -> "they are", "apple's" -> "apple".
        """
        # Normalise the curly apostrophe so one set of rules covers both forms.
        text = text.replace("\u2019", "'")
        for pattern, replacement in self._CONTRACTION_RULES:
            text = pattern.sub(replacement, text)
        return text

    def remove_special_chars(self, text: str) -> str:
        """Replace every character other than ASCII letters, digits, '.' and space
        with a single space.

        The period is kept so that decimals such as "2.5" stay intact; as a
        consequence sentence-final periods also survive this step.
        """
        return re.sub(r'[^a-zA-Z0-9\. ]', ' ', text)

    def tokenize(self, text: str) -> list:
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens) -> list:
        """Return the tokens that are not in ``self.stop_words``.

        The comparison is an exact, case-sensitive membership test.
        """
        return [t for t in tokens if t not in self.stop_words]

    def stem(self, tokens) -> list:
        """Return the Porter stem of every token."""
        return [self.stemmer.stem(t) for t in tokens]

    def lemmatize(self, tokens) -> list:
        """Return the WordNet lemma of every token.

        No part-of-speech tag is passed, so the lemmatizer's default is used.
        """
        return [self.lemmatizer.lemmatize(t) for t in tokens]

    def preprocess(self, text: str, use_lemmatization: bool = False) -> list:
        """Run the full pipeline on ``text`` and return the resulting tokens.

        Args:
            text: Raw input string.
            use_lemmatization: When true, lemmatize the tokens; otherwise
                (the default) stem them.

        Returns:
            List of processed token strings.
        """
        text = text.lower()
        text = self.expand_contractions(text)
        text = self.remove_special_chars(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stopwords(tokens)

        if use_lemmatization:
            tokens = self.lemmatize(tokens)
        else:
            tokens = self.stem(tokens)

        return tokens


In [7]:
# Demo: run the pipeline with lemmatization on a sample headline.
tp = TextPreprocessor()

sentence = "Apple Inc. shares fell 2.5% amid concerns about iPhone 15 demand."
lemmatized_tokens = tp.preprocess(sentence, use_lemmatization=True)
print(lemmatized_tokens)


['apple', 'inc.', 'share', 'fell', '2.5', 'amid', 'concern', 'iphone', '15', 'demand', '.']
