In [None]:
# Install NLTK into the running kernel's environment (%pip targets the kernel, unlike !pip).
%pip install nltk

In [None]:
# Fetch the NLTK data packages used by the cells below.
import nltk

# 'punkt_tab' is listed alongside 'punkt' because it was added to resolve a LookupError.
for resource_name in ("punkt", "stopwords", "wordnet", "punkt_tab"):
    nltk.download(resource_name)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

class TextPreprocessor:
    """Turn raw English text into a list of cleaned, normalised tokens.

    The pipeline implemented by :meth:`preprocess` is:

    1. expand contractions ("don't" -> "do not"),
    2. drop every character that is not an ASCII letter, digit, whitespace
       or a period,
    3. lowercase and tokenize with ``word_tokenize``,
    4. remove English stop words,
    5. stem (default) or lemmatize each token,
    6. strip trailing periods and discard tokens that end up empty.

    Attributes:
        stemmer: ``PorterStemmer`` instance used by :meth:`stem`.
        lemmatizer: ``WordNetLemmatizer`` instance used by :meth:`lemmatize`.
        stop_words: set of English stop words used by :meth:`remove_stopwords`.
        contractions_map: mapping of lowercase contraction -> expansion. It may
            be extended after construction; keys must be lowercase because
            lookups are done on the lowercased match.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # A basic dictionary for contractions, can be expanded
        self.contractions_map = {
            "don't": "do not",
            "won't": "will not",
            "can't": "cannot",
            "it's": "it is",
            "i'm": "i am",
            "he's": "he is",
            "she's": "she is",
            "we're": "we are",
            "they're": "they are",
            "you're": "you are",
            "i've": "i have",
            "we've": "we have",
            "you've": "you have",
            "they've": "they have",
            "i'd": "i would",
            "we'd": "we would",
            "you'd": "you would",
            "they'd": "they would",
            "isn't": "is not",
            "aren't": "are not",
            "wasn't": "was not",
            "weren't": "were not",
            "haven't": "have not",
            "hasn't": "has not",
            "hadn't": "had not",
            "wouldn't": "would not",
            "doesn't": "does not",
            "didn't": "did not",
            "shouldn't": "should not",
            "couldn't": "could not",
            "mustn't": "must not",
            "mightn't": "might not"
        }

    def expand_contractions(self, text: str) -> str:
        """Replace every known contraction in ``text`` with its expansion.

        Matching is case-insensitive and anchored on word boundaries. The
        replacement is always the lowercase expansion from
        ``self.contractions_map`` (e.g. "Don't" -> "do not").

        Args:
            text: input string.

        Returns:
            ``text`` with the contractions expanded; unchanged if the map is
            empty or nothing matches.
        """
        if not self.contractions_map:
            return text

        # The pattern is rebuilt on every call so that entries added to
        # ``contractions_map`` after construction are honoured.
        alternatives = "|".join(re.escape(key) for key in self.contractions_map)
        pattern = r"\b(?:" + alternatives + r")\b"

        def replace(match):
            found = match.group(0)
            return self.contractions_map.get(found.lower(), found)

        return re.sub(pattern, replace, text, flags=re.IGNORECASE)

    def remove_special_chars(self, text: str) -> str:
        """Delete everything except ASCII letters, digits, whitespace and periods.

        Periods are kept so that decimal numbers such as "2.5" survive.
        """
        return re.sub(r'[^a-zA-Z0-9\s.]', '', text)

    def tokenize(self, text: str) -> list:
        """Lowercase ``text`` and split it into tokens with ``word_tokenize``."""
        return word_tokenize(text.lower())

    def remove_stopwords(self, tokens) -> list:
        """Return the tokens that are not in ``self.stop_words``."""
        return [word for word in tokens if word not in self.stop_words]

    def stem(self, tokens) -> list:
        """Apply ``self.stemmer`` to every token."""
        return [self.stemmer.stem(word) for word in tokens]

    def lemmatize(self, tokens) -> list:
        """Apply ``self.lemmatizer`` to every token."""
        return [self.lemmatizer.lemmatize(word) for word in tokens]

    @staticmethod
    def _strip_trailing_periods(tokens) -> list:
        """Strip trailing periods from each token and drop empty/blank tokens.

        Builds a new list rather than removing items from a list that is being
        iterated over, which would skip the element following each removal and
        leave consecutive empty tokens behind.
        """
        stripped = (token.rstrip(".") for token in tokens)
        return [token for token in stripped if token.strip()]

    def preprocess(self, text: str, use_lemmatization: bool = False) -> list:
        """Run the full cleaning pipeline on ``text``.

        Args:
            text: raw input string.
            use_lemmatization: when True tokens are lemmatized; otherwise
                (default) they are stemmed.

        Returns:
            A list of non-empty token strings with stop words removed and
            trailing periods stripped.
        """
        # Typographic apostrophes (U+2019) would not match the contraction
        # table and would then be deleted by remove_special_chars, turning
        # "don’t" into the non-word "dont". Normalise them first.
        text = text.replace("\u2019", "'")
        text = self.expand_contractions(text)
        text = self.remove_special_chars(text)
        tokens = self.remove_stopwords(self.tokenize(text))
        if use_lemmatization:
            tokens = self.lemmatize(tokens)
        else:
            tokens = self.stem(tokens)
        return self._strip_trailing_periods(tokens)

In [None]:
# Demo: run the lemmatizing pipeline on one sample headline and show the tokens.
preprocessor = TextPreprocessor()
sample_headline = "Apple Inc. shares fell 2.5% amid concerns about iPhone 15 demand."
c = preprocessor.preprocess(sample_headline, use_lemmatization=True)
print(c)

['apple', 'inc', 'share', 'fell', '2.5', 'amid', 'concern', 'iphone', '15', 'demand']
