In [2]:
def column_stemmatizer(text_series):
    """
    Tokenize, lower-case, stop-word-filter and stem every sentence in a pandas Series,
    typically a dataframe column being prepared for classification/regression modelling.

    Each cell is lower-cased and split into runs of word characters (so
    punctuation is discarded and tokens are split around it, e.g. "1.00pm"
    becomes "1" and "00pm"). English stop words are removed, the remaining
    tokens are stemmed with NLTK's Snowball English stemmer, and the stems are
    joined back into a single space-separated string with no leading or
    trailing whitespace.

    Args:
    text_series (pd.Series): Input pandas Series containing sentences. Missing
        values (None/NaN) are treated as empty text; other non-string values
        are converted with str() before processing.

    Returns:
    pd.Series: One space-separated string of stems per input cell. When the
        input is a pandas Series its index is preserved, so the result can be
        assigned straight back onto the originating dataframe.

    Example:
    df['to_be_stemmed'] = pd.Series({0: 'I like this', 1: 'good times'})
    df['stems'] = column_stemmatizer(df['to_be_stemmed'])

    returns:
    df['stems'] = pd.Series({0: 'like', 1: 'good time'})
    """
    import pandas as pd
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem.snowball import EnglishStemmer
    from nltk.tokenize.regexp import RegexpTokenizer

    # Load the English stop-word list, downloading the corpus on first use.
    # A missing corpus surfaces as LookupError when it is first accessed, so
    # the download has to be triggered from the exception handler rather than
    # from a truthiness check. Only 'stopwords' is needed: tokenizing is done
    # with a regular expression, not the punkt models.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # Initialize stemmer and regex tokenizer
    stemmer = EnglishStemmer()
    tokenizer = RegexpTokenizer(r'[\w]+')

    # One processed string per input cell, in input order
    stemmed_cells = []

    for cell in text_series:
        if not isinstance(cell, str):
            # Missing values become empty text instead of crashing on .lower()
            cell = '' if pd.isna(cell) else str(cell)

        # Tokenize the lower-cased text, drop stop words, stem what is left
        tokens = tokenizer.tokenize(cell.lower())
        stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

        # join() leaves no trailing separator, so no strip() is required
        stemmed_cells.append(' '.join(stems))

    # Keep the caller's index so column assignment aligns row-for-row
    index = text_series.index if isinstance(text_series, pd.Series) else None

    return pd.Series(stemmed_cells, index=index)

In [3]:
import pandas as pd

# Smoke test: four review-style sentences, keyed by their position (0..3).
test1 = pd.Series(dict(enumerate([
    "My favourite thing about Harry Potter is hermione Granger 5 stars!",
    "I love lord of the rings, but this movie didn't do the books justice",
    "I wish I had never seen looper, what a load of shit",
    "I like this, good times",
])))

# Run the whole Series through the stemmer and show the processed text.
test1_return = column_stemmatizer(test1)
print(test1_return)

0    favourit thing harri potter hermion granger 5 ...
1                     love lord ring movi book justic 
2                    wish never seen looper load shit 
3                                      like good time 
dtype: object


In [7]:
# Edge-case sentences: quotes and apostrophes, times with dots, symbol runs,
# contractions, mixed punctuation and bare numbers.
test2_data = dict(
    test_text=[
        "My favorite book is 'Do android's dream of Electric Sheep'.",
        "I'll arrive between 1.00pm and 4.30pm.",
        "These are special characters #%^$@*&$#&^$",
        "Don't worry, be happy!",
        "This is just - an example, sentence with some; punctuation.",
        "12345 is a number.",
        "It's a beautiful day.",
    ],
)

# Exercise the function on a real dataframe column rather than a bare Series.
test2 = pd.DataFrame(test2_data)
test2_return = column_stemmatizer(test2['test_text'])
print(test2_return)



0    favorit book android dream electr sheep 
1                        arriv 1 00pm 4 30pm 
2                            special charact 
3                                worri happi 
4                    exampl sentenc punctuat 
5                               12345 number 
6                                 beauti day 
dtype: object
