In [13]:
# Function to perform lemmatization and remove stopwords
def lemmatize_text(text_series):
    """
    Lower-case, tokenize, strip English stopwords from, and lemmatize each text.

    Every entry is lower-cased and split into runs of word characters (so
    punctuation is dropped), NLTK's English stopwords are removed, and each
    remaining token is passed through WordNetLemmatizer with its default
    settings. The surviving tokens are joined with single spaces, with no
    leading or trailing whitespace.

    Parameters:
    text_series (pandas.Series or iterable of str): The texts to process.
        Missing entries (None/NaN) are treated as empty text; other
        non-string entries are converted with str().

    Returns:
    pandas.Series: One cleaned string per input entry, in input order.
        When the input is a pandas Series its index is preserved, so the
        result lines up with the data it was computed from.
    """
    import pandas as pd
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize.regexp import RegexpTokenizer

    # NLTK corpora load lazily and raise LookupError on first use when they
    # are not installed, so probe each one and download only what is missing.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    try:
        lemmatizer.lemmatize('test')
    except LookupError:
        nltk.download('wordnet')

    tokenizer = RegexpTokenizer(r'[\w]+')

    lemmed_cells = []

    for cell in text_series:
        if not isinstance(cell, str):
            # Missing values would otherwise crash on .lower().
            is_missing = pd.api.types.is_scalar(cell) and pd.isna(cell)
            cell = '' if is_missing else str(cell)

        # Tokenize the lower-cased text, drop stopwords, lemmatize the rest.
        tokens = tokenizer.tokenize(cell.lower())
        kept = [lemmatizer.lemmatize(token)
                for token in tokens
                if token not in stop_words]

        # join() leaves no trailing separator, unlike building with "+= ' '".
        lemmed_cells.append(' '.join(kept))

    # Keep the caller's index so the result aligns with the source rows.
    index = text_series.index if isinstance(text_series, pd.Series) else None

    return pd.Series(lemmed_cells, index=index)
        



In [10]:
import nltk
# Fetch the WordNet corpus, which the WordNetLemmatizer used in lemmatize_text
# needs at run time. The call's return value is shown as the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Scratticus\AppData\Roaming\nltk_data...


True

In [16]:
import pandas as pd

# Smoke test: four short movie reviews keyed 0-3, run through lemmatize_text.
test1 = pd.Series(dict(enumerate([
    "My favourite thing about Harry Potter is hermione Granger 5 stars!",
    "I love lord of the rings, but this movie didn't do the books justice",
    "I wish I had never seen looper, what a load of shit",
    "I like this, good times",
])))

test1_return = lemmatize_text(test1)

print(test1_return)

0    favourite thing harry potter hermione granger ...
1                   love lord ring movie book justice 
2                    wish never seen looper load shit 
3                                      like good time 
dtype: object


In [15]:
# Edge cases taken from a DataFrame column: apostrophes, clock times,
# special characters, punctuation and a bare number.
test2_data = dict(test_text=[
    "My favorite book is 'Do android's dream of Electric Sheep'.",
    "I'll arrive between 1.00pm and 4.30pm.",
    "These are special characters #%^$@*&$#&^$",
    "Don't worry, be happy!",
    "This is just - an example, sentence with some; punctuation.",
    "12345 is a number.",
    "It's a beautiful day.",
])

test2 = pd.DataFrame(data=test2_data)

test2_return = lemmatize_text(test2['test_text'])

print(test2_return)

0    favorite book android dream electric sheep 
1                          arrive 1 00pm 4 30pm 
2                             special character 
3                                   worry happy 
4                  example sentence punctuation 
5                                  12345 number 
6                                 beautiful day 
dtype: object
