In [None]:
import nltk
from collections import Counter

In [None]:
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# look it up under 'punkt_tab' while older ones use 'punkt', so fetch both to
# keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from nltk.corpus import stopwords
# Fetch the stopword lists read by stopwords.words('english') in is_no_stopword.
nltk.download('stopwords')

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
from nltk.corpus import wordnet
# WordNet data backs WordNetLemmatizer; the perceptron tagger backs nltk.pos_tag.
nltk.download('wordnet')
# Recent NLTK releases load the tagger from 'averaged_perceptron_tagger_eng',
# older ones from 'averaged_perceptron_tagger'; fetch both for compatibility.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
def tokenize(document):
    """Split ``document`` into tokens with NLTK's ``word_tokenize``.

    Args:
        document: The text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``document``.
    """
    tokens = word_tokenize(document)
    return tokens

def is_no_punctuation(word):
    """Tell whether ``word`` is made up only of alphanumeric characters.

    Args:
        word: A single token (string).

    Returns:
        The result of ``word.isalnum()``: ``True`` for a non-empty, purely
        alphanumeric token, ``False`` otherwise (including the empty string).
    """
    only_alphanumeric = word.isalnum()
    return only_alphanumeric

# Lazily-filled cache of the English stopword set (None until first use).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopword set, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Assigned only after a successful load, so a failed lookup (e.g. the
        # corpus is not downloaded yet) is retried on the next call.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def is_no_stopword(word):
    """Tell whether ``word`` is NOT an English stopword (case-insensitive).

    The stopword list is loaded on the first call and reused afterwards,
    instead of being re-read and re-hashed for every token.

    Args:
        word: A single token (string).

    Returns:
        ``True`` if the lower-cased token is absent from
        ``stopwords.words('english')``, ``False`` otherwise.
    """
    return word.lower() not in _english_stopwords()

def stem(words):
    """Stem every token in ``words`` with a ``PorterStemmer``.

    Args:
        words: Iterable of tokens.

    Returns:
        A list with the stem of each token, in input order.
    """
    porter = PorterStemmer()
    stems = []
    for token in words:
        stems.append(porter.stem(token))
    return stems

def categorize(words):
    """Return the part-of-speech tag of each token in ``words``.

    Args:
        words: Sequence of tokens passed to ``nltk.pos_tag``.

    Returns:
        A list holding only the tag of each ``(token, tag)`` pair produced by
        ``nltk.pos_tag``, in the same order.
    """
    labels = []
    for _token, label in nltk.pos_tag(words):
        labels.append(label)
    return labels

def lemmatize(words, tags):
    """Lemmatize ``words`` with WordNet, guided by their part-of-speech tags.

    Args:
        words: Iterable of tokens.
        tags: Iterable of POS tag strings (e.g. as returned by ``categorize``),
            paired positionally with ``words``.

    Returns:
        A list of lemmas. Pairing uses ``zip``, so the result is as long as
        the shorter of ``words`` and ``tags``.
    """
    wn_lemmatizer = WordNetLemmatizer()
    # Only the first letter of a tag picks the WordNet category.
    first_letter_to_pos = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # Tags with any other first letter fall back to noun.
    wordnet_pos = [
        first_letter_to_pos.get(tag[0].upper(), wordnet.NOUN) for tag in tags
    ]
    lemmas = []
    for token, token_pos in zip(words, wordnet_pos):
        lemmas.append(wn_lemmatizer.lemmatize(token, pos=token_pos))
    return lemmas

def count(words):
    """Tally how often each item occurs in ``words``.

    Args:
        words: Iterable of hashable items (tokens).

    Returns:
        A ``collections.Counter`` mapping each item to its number of
        occurrences.
    """
    frequencies = Counter()
    frequencies.update(words)
    return frequencies

def sklearn_count_vect(data, **kwargs):
    """Fit a ``CountVectorizer`` on ``data``, print the result, and return it.

    Args:
        data: Collection of documents handed to ``fit_transform``.
        **kwargs: Forwarded unchanged to the ``CountVectorizer`` constructor.

    Returns:
        The value returned by ``fit_transform``. Before returning, that value
        and the fitted ``vocabulary_`` are printed.
    """
    vectorizer = CountVectorizer(**kwargs)
    term_counts = vectorizer.fit_transform(data)
    print(term_counts)
    vocabulary = vectorizer.vocabulary_
    print(vocabulary)
    return term_counts

def sklearn_tfidf_vect(data, **kwargs):
    """Fit a ``TfidfVectorizer`` on ``data``, print the result, and return it.

    Args:
        data: Collection of documents handed to ``fit_transform``.
        **kwargs: Forwarded unchanged to the ``TfidfVectorizer`` constructor.

    Returns:
        The value returned by ``fit_transform``. Before returning, that value
        and the list of feature names are printed.
    """
    vec = TfidfVectorizer(**kwargs)
    X_tfidf = vec.fit_transform(data)
    print(X_tfidf)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vec, "get_feature_names_out"):
        # Convert to a plain list so the printout matches the legacy format.
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()
    print(feature_names)
    return X_tfidf

In [None]:
# End-to-end demo: run one sample document through every helper defined above,
# printing the intermediate result of each stage.
document = "Almost before we knew it, we had left the ground. The unknown holds its grounds."
print('Document:', document)

# Stage 1: split the raw text into tokens.
tokens = tokenize(document)
print('Tokenized:', tokens)

# Stage 2: keep only alphanumeric tokens that are not English stopwords.
# `words` is rebound at each later stage (filtered -> stemmed -> lemmatized).
words = [w for w in tokens if is_no_punctuation(w) and is_no_stopword(w)]
print('Stripped stopwords and punctuation:', words)

# Stage 3: stemming.
words = stem(words)
print('Stemming:', words)

# Stage 4: tag the full token list, then apply the same filter as in stage 2
# so the remaining tags line up one-to-one with `words`.
tokens_pos = categorize(tokens)
words_pos = [pos for w, pos in zip(tokens, tokens_pos) if is_no_punctuation(w) and is_no_stopword(w)]
print('Word categories:', words_pos)

# Stage 5: lemmatization.
# NOTE(review): `words` holds the stems from stage 3 here, so the lemmatizer
# receives stemmed tokens rather than the original words — confirm intended.
words = lemmatize(words, words_pos)
print('Lemmatization:', words)

# Stage 6: frequency count of the processed tokens.
bag_of_words = count(words)
print('Bag of words:', bag_of_words)

# Stage 7: scikit-learn vectorizers, first on the raw document, then on the
# processed tokens re-joined into one string. Each helper prints its output.
sklearn_count_vect([document])
sklearn_count_vect([" ".join(words)])
sklearn_tfidf_vect([" ".join(words)])
# Disabled variant: the same count vectorizer with ngram_range=(2,3).
#sklearn_count_vect([" ".join(words)], ngram_range=(2,3))

In [None]:
# Reference cell: download the 'tagsets' resource and show NLTK's help for the
# UPenn tagset. (nltk is already imported in the first cell; the import is
# repeated here.)
import nltk
nltk.download('tagsets')
nltk.help.upenn_tagset()