In [1]:
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus of three short sentences used to illustrate bag-of-words.
docs = ['John has some cats', 'Cats, being cats, eat fish.', 'I ate a big fish.']

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/soelapyaehtun/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
def init_nltk_resources():
    """Download the NLTK data packages this notebook relies on.

    Requests, in order, the ``stopwords``, ``punkt``, ``wordnet`` and
    ``omw-1.4`` packages through ``nltk.download``.
    """
    for package in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
        nltk.download(package)

In [11]:
# Request the NLTK data packages (stop-words, tokenizer, WordNet) up front.
init_nltk_resources()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/soelapyaehtun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/soelapyaehtun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/soelapyaehtun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/soelapyaehtun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [12]:
def preprocess(docs):
    """Normalize raw documents for bag-of-words vectorization.

    Each document is lowercased, split on whitespace, stripped of
    punctuation and English stop-words, and every remaining word is
    lemmatized as a verb with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    docs : iterable of str
        The raw documents.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input document, in input
        order. A document made up only of stop-words/punctuation yields
        an empty string.
    """
    lemma = WordNetLemmatizer()

    # A set makes every membership test O(1) instead of scanning the list.
    stop_words = set(stopwords.words('english'))
    # Translation table that deletes every character in string.punctuation.
    punc = str.maketrans('', '', string.punctuation)

    clean_docs = []
    for doc in docs:
        kept = []
        for token in doc.lower().split():
            # Check the token with its inner apostrophe still in place:
            # NLTK's list contains contractions such as "don't", which
            # could never match once the apostrophe has been removed.
            if token.strip(string.punctuation) in stop_words:
                continue
            word = token.translate(punc)
            # Skip tokens that were pure punctuation, and words that only
            # become stop-words after punctuation removal (e.g. "it's").
            if not word or word in stop_words:
                continue
            # 'v' tells the lemmatizer to treat the word as a verb.
            kept.append(lemma.lemmatize(word, 'v'))
        # Join the surviving lemmas back into a single document string.
        clean_docs.append(' '.join(kept))

    return clean_docs

In [13]:
def BOW(docs):
    """Build bag-of-words count vectors for ``docs``.

    Fits a default ``CountVectorizer`` on the documents and returns a
    ``(feature_vectors, feature_names)`` pair: the dense count matrix and
    the vocabulary whose order matches the matrix columns.
    """
    vectorizer = CountVectorizer()
    sparse_counts = vectorizer.fit_transform(docs)
    vocabulary = vectorizer.get_feature_names_out()
    # fit_transform yields a sparse matrix; densify it for easy display.
    return sparse_counts.toarray(), vocabulary

In [14]:
def pretty_print(feat_vecs, feat_names):
    """Print a document-term count table.

    Parameters
    ----------
    feat_vecs : 2-D array-like
        One row of term counts per document.
    feat_names : sequence of str
        Column labels, aligned with the columns of ``feat_vecs``.

    Rows are labelled ``doc1``, ``doc2``, ... according to the number of
    rows in ``feat_vecs``, so the function works for any corpus size
    rather than exactly three documents.
    """
    row_labels = [f'doc{i}' for i in range(1, len(feat_vecs) + 1)]
    df = pd.DataFrame(data=feat_vecs,
                      index=row_labels,
                      columns=feat_names)
    print(df)

In [None]:
# Same call as in the earlier cell: (re)requests the NLTK data packages
# before the preprocessing below runs.
init_nltk_resources()

In [16]:
# Toy corpus: three short sentences with overlapping vocabulary.
docs = [
    'John has some cats.',
    'Cats, being cats, eat fish.',
    'I ate a big fish.'
]

# Clean every document with preprocess() and print the resulting strings.
clean_docs = preprocess(docs)
print(clean_docs)

['john cat', 'cat cat eat fish', 'eat big fish']


In [17]:
# Turn the cleaned documents into term-count vectors, then print them as a
# table with one row per document and one column per vocabulary term.
feat_vecs, feat_names = BOW(clean_docs)
pretty_print(feat_vecs, feat_names)

      big  cat  eat  fish  john
doc1    0    1    0     0     1
doc2    0    2    1     1     0
doc3    1    0    1     1     0
