In [1]:
import nltk
from collections import Counter

In [2]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ckoerner/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ckoerner/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ckoerner/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ckoerner/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [7]:
def tokenize(document):
    """Split ``document`` into tokens with NLTK's ``word_tokenize``.

    Args:
        document: The raw text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``document``.
    """
    token_list = word_tokenize(document)
    return token_list

def is_no_punctuation(word):
    """Tell whether ``word`` consists only of alphanumeric characters.

    Delegates to ``word.isalnum()``, so a token containing any punctuation
    character (and, for ``str``, the empty string) yields ``False``.
    """
    only_alnum = word.isalnum()
    return only_alnum

# Lazily-filled cache of the English stop-word list (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # If loading raises (e.g. corpus not downloaded yet) the cache stays
        # empty, so a later call retries instead of caching a failure.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def is_no_stopword(word):
    """Tell whether ``word`` is NOT an English stop word.

    The comparison is case-insensitive: ``word`` is lower-cased before the
    lookup. The stop-word list is read from ``stopwords.words('english')`` on
    the first call and reused afterwards, instead of being rebuilt for every
    single token.

    Args:
        word: The token to check.

    Returns:
        ``True`` if the lower-cased token is not in the stop-word list,
        ``False`` otherwise.
    """
    return word.lower() not in _english_stopwords()

def stem(words):
    """Apply a ``PorterStemmer`` to every entry of ``words``.

    A new stemmer is created on each call. The result is a new list holding
    one stem per input word, in input order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, words))

def categorize(words):
    """Return the part-of-speech tag of each entry in ``words``.

    The tokens are tagged with ``nltk.pos_tag``; only the tag of each
    ``(word, tag)`` pair is kept, in the same order as the input.
    """
    labels = []
    for _token, label in nltk.pos_tag(words):
        labels.append(label)
    return labels

def lemmatize(words, tags):
    """Lemmatize ``words`` with ``WordNetLemmatizer``, guided by POS ``tags``.

    The upper-cased first letter of each tag selects the WordNet category:
    ``J`` -> adjective, ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb. Any
    other tag, including an empty string, falls back to noun.

    Args:
        words: The tokens to lemmatize.
        tags: One POS tag string per token, e.g. the output of ``categorize``.

    Returns:
        A list with one lemma per ``(word, tag)`` pair. Words and tags are
        paired with ``zip``, so surplus items of the longer input are ignored.
    """
    lemmatizer = WordNetLemmatizer()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    # Slice instead of indexing: an empty tag yields '' (-> noun default)
    # rather than raising IndexError.
    pos = [tag_dict.get(t[:1].upper(), wordnet.NOUN) for t in tags]
    return [lemmatizer.lemmatize(w, pos=p) for w, p in zip(words, pos)]

def count(words):
    """Build a bag of words: a ``Counter`` of how often each entry occurs."""
    tally = Counter()
    tally.update(words)
    return tally

def sklearn_count_vect(data, **kwargs):
    """Fit a ``CountVectorizer`` on ``data`` and return the transformed matrix.

    Args:
        data: The documents handed to ``fit_transform``.
        **kwargs: Forwarded unchanged to the ``CountVectorizer`` constructor.

    Returns:
        The result of ``fit_transform(data)``. Before returning, the matrix
        and the fitted ``vocabulary_`` are printed, one per line.
    """
    vectorizer = CountVectorizer(**kwargs)
    term_counts = vectorizer.fit_transform(data)
    print(term_counts, vectorizer.vocabulary_, sep="\n")
    return term_counts

def sklearn_tfidf_vect(data, **kwargs):
    """Fit a ``TfidfVectorizer`` on ``data`` and return the transformed matrix.

    Args:
        data: The documents handed to ``fit_transform``.
        **kwargs: Forwarded unchanged to the ``TfidfVectorizer`` constructor.

    Returns:
        The result of ``fit_transform(data)``. Before returning, the matrix
        and the list of feature names are printed.
    """
    vec = TfidfVectorizer(**kwargs)
    tfidf_matrix = vec.fit_transform(data)
    print(tfidf_matrix)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and only fall back on older releases.
    names_getter = getattr(vec, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vec.get_feature_names
    # Convert to plain str so the printed list looks the same whichever
    # method supplied the names (the new one returns an array, not a list).
    print([str(name) for name in names_getter()])
    return tfidf_matrix

In [8]:
# End-to-end demo: run one sentence pair through every preprocessing step
# defined above and print the intermediate result of each stage.
document = "Almost before we knew it, we had left the ground. The unknown holds its grounds."
print('Document:', document)

tokens = tokenize(document)
print('Tokenized:', tokens)

# Decide once per token whether it survives filtering; the same mask is then
# reused for the POS tags so words and tags stay aligned.
keep_mask = [is_no_punctuation(tok) and is_no_stopword(tok) for tok in tokens]
words = [tok for tok, keep in zip(tokens, keep_mask) if keep]
print('Stripped stopwords and punctuation:', words)

words = stem(words)
print('Stemming:', words)

# Tag the full token sequence, then keep only the tags of the surviving tokens.
tokens_pos = categorize(tokens)
words_pos = [pos for pos, keep in zip(tokens_pos, keep_mask) if keep]
print('Word categories:', words_pos)

# NOTE(review): `words` holds the *stemmed* tokens at this point, so the
# lemmatizer runs on stems rather than on the original words - confirm that
# chaining both steps is intended.
words = lemmatize(words, words_pos)
print('Lemmatization:', words)

bag_of_words = count(words)
print('Bag of words:', bag_of_words)

# Compare scikit-learn's vectorizers on the raw text and on the processed words.
joined_words = " ".join(words)
sklearn_count_vect([document])
sklearn_count_vect([joined_words])
sklearn_tfidf_vect([joined_words])
# Optional n-gram variant:
#sklearn_count_vect([" ".join(words)], ngram_range=(2,3))

Document: Almost before we knew it, we had left the ground. The unknown holds its grounds.
Tokenized: ['Almost', 'before', 'we', 'knew', 'it', ',', 'we', 'had', 'left', 'the', 'ground', '.', 'The', 'unknown', 'holds', 'its', 'grounds', '.']
Stripped stopwords and punctuation: ['Almost', 'knew', 'left', 'ground', 'unknown', 'holds', 'grounds']
Stemming: ['almost', 'knew', 'left', 'ground', 'unknown', 'hold', 'ground']
Word categories: ['RB', 'VBD', 'VBN', 'NN', 'JJ', 'VBZ', 'NNS']
Lemmatization: ['almost', 'know', 'leave', 'ground', 'unknown', 'hold', 'ground']
Bag of words: Counter({'ground': 2, 'almost': 1, 'know': 1, 'leave': 1, 'unknown': 1, 'hold': 1})
  (0, 0)	1
  (0, 1)	1
  (0, 12)	2
  (0, 8)	1
  (0, 6)	1
  (0, 4)	1
  (0, 9)	1
  (0, 10)	2
  (0, 2)	1
  (0, 11)	1
  (0, 5)	1
  (0, 7)	1
  (0, 3)	1
{'almost': 0, 'before': 1, 'we': 12, 'knew': 8, 'it': 6, 'had': 4, 'left': 9, 'the': 10, 'ground': 2, 'unknown': 11, 'holds': 5, 'its': 7, 'grounds': 3}
  (0, 0)	1
  (0, 3)	1
  (0, 4)	1
  (

<1x6 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [10]:
# Reference cell: show what the POS tags printed above stand for.
# (nltk is also imported in the first cell; this repeat is redundant but harmless.)
import nltk
# Download the 'tagsets' resource, then print NLTK's help text for the
# 'upenn' tag set (tag, description and example words, as in the output below).
nltk.download('tagsets')
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/ckoerner/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
