In [1]:
import re
import numpy as np


In [2]:
text = """Natural Language Processing (NLP) is a subfield of linguistics, computer science,
and artificial intelligence concerned with the interactions between computers and human language.
It's used to analyze text, allowing machines to understand, interpret, and manipulate human language.
NLP has many real-world applications, including machine translation, sentiment analysis, and chatbots."""


In [5]:
tokens = np.array(re.findall(r"\b\w+\b", text))


In [6]:
tokens


array(['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'subfield',
       'of', 'linguistics', 'computer', 'science', 'and', 'artificial',
       'intelligence', 'concerned', 'with', 'the', 'interactions',
       'between', 'computers', 'and', 'human', 'language', 'It', 's',
       'used', 'to', 'analyze', 'text', 'allowing', 'machines', 'to',
       'understand', 'interpret', 'and', 'manipulate', 'human',
       'language', 'NLP', 'has', 'many', 'real', 'world', 'applications',
       'including', 'machine', 'translation', 'sentiment', 'analysis',
       'and', 'chatbots'], dtype='<U12')

In [7]:
tokens_lower = np.char.lower(tokens)


In [8]:
tokens_lower

array(['natural', 'language', 'processing', 'nlp', 'is', 'a', 'subfield',
       'of', 'linguistics', 'computer', 'science', 'and', 'artificial',
       'intelligence', 'concerned', 'with', 'the', 'interactions',
       'between', 'computers', 'and', 'human', 'language', 'it', 's',
       'used', 'to', 'analyze', 'text', 'allowing', 'machines', 'to',
       'understand', 'interpret', 'and', 'manipulate', 'human',
       'language', 'nlp', 'has', 'many', 'real', 'world', 'applications',
       'including', 'machine', 'translation', 'sentiment', 'analysis',
       'and', 'chatbots'], dtype='<U12')

In [9]:
stop_words = set([
    "a","an","the","is","of","and","or","in","on","with","to","for",
    "its","it's","has","many","between","including","used"
])

mask = np.array([word not in stop_words for word in tokens_lower])
tokens_nostop = tokens_lower[mask]

In [11]:
tokens_nostop

array(['natural', 'language', 'processing', 'nlp', 'subfield',
       'linguistics', 'computer', 'science', 'artificial', 'intelligence',
       'concerned', 'interactions', 'computers', 'human', 'language',
       'it', 's', 'analyze', 'text', 'allowing', 'machines', 'understand',
       'interpret', 'manipulate', 'human', 'language', 'nlp', 'real',
       'world', 'applications', 'machine', 'translation', 'sentiment',
       'analysis', 'chatbots'], dtype='<U12')

In [10]:
def simple_stem(word):
    suffixes = ["ing","ed","es","s"]
    for suf in suffixes:
        if word.endswith(suf) and len(word) > len(suf) + 2:
            return word[:-len(suf)]
    return word

tokens_stemmed = np.array([simple_stem(word) for word in tokens_nostop])


In [12]:
tokens_stemmed

array(['natural', 'language', 'process', 'nlp', 'subfield', 'linguistic',
       'computer', 'science', 'artificial', 'intelligence', 'concern',
       'interaction', 'computer', 'human', 'language', 'it', 's',
       'analyze', 'text', 'allow', 'machin', 'understand', 'interpret',
       'manipulate', 'human', 'language', 'nlp', 'real', 'world',
       'application', 'machine', 'translation', 'sentiment', 'analysi',
       'chatbot'], dtype='<U12')