In [1]:
# Sample text that the tokenization and stop-word cells below operate on.
sample_text = "Does this thing really work? Lets see."

# Tokenization

In [1]:
# tokenize: split the text into sentences (sent_tokenize) or into words (word_tokenize)
from nltk.tokenize import sent_tokenize , word_tokenize

In [3]:
# sent_tokenize: break the given (lower-cased) text into sentences
sent_tokenize(sample_text.lower())

['does this thing really work?', 'lets see.']

In [4]:
# word_tokenize: break the given (lower-cased) text into word tokens;
# punctuation marks such as '?' and '.' come out as separate tokens.
words = word_tokenize(sample_text.lower())
words

['does', 'this', 'thing', 'really', 'work', '?', 'lets', 'see', '.']

In [5]:
# nltk ships stop-word lists for several languages; load the English one here.
# Stop words should be removed because they appear in all documents, irrespective of the document's topic.
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
# `words` still contains punctuation tokens such as '.', ',' and '?'.
# To get rid of them in the same filtering pass, combine every punctuation
# character with the list of stop words.
import string
punctuations = list(string.punctuation)
# Rebuild from the fresh English list rather than `stop = stop + punctuations`,
# so re-running this cell does not append the punctuation a second time.
stop = stopwords.words('english') + punctuations

In [7]:
# Keep only the tokens that are not in the combined stop-word/punctuation list.
clean_words = [token for token in words if token not in stop]

In [8]:
# Show the tokens that survived stop-word and punctuation filtering.
clean_words

['thing', 'really', 'work', 'lets', 'see']

# Stemming

In [9]:
# Stemming: reduce each word to its root (stem) form.
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [10]:
# Example words to run through the Porter stemmer.
stem_words = [
    'play',
    'played',
    'playing',
    'player',
    'happier',
    'happying',
    'breifly',
]
# Apply the stemmer to every example word, preserving order.
stemmed_words = list(map(ps.stem, stem_words))
stemmed_words

['play', 'play', 'play', 'player', 'happier', 'happi', 'breifli']

In [11]:
# As we can see, PorterStemmer does a decent job, but it is not very smart
# (e.g. it produces 'happi' and 'breifli'), so we need a different approach, i.e. lemmatization.

# Parts Of Speech

In [12]:
from nltk import pos_tag

In [13]:
# Load the raw text of the '2006-GWBush.txt' document from nltk's state_union corpus.
from nltk.corpus import state_union
text = state_union.raw('2006-GWBush.txt')

In [14]:
# Display the raw speech text that was just loaded.
text

'PRESIDENT GEORGE W. BUSH\'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King. (Applause.)\n\nPresident George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006. White House photo by Eric DraperEvery time I\'m invited to this rostrum, I\'m humbled by the privilege, and mindful of the history we\'ve seen together. We have gathered under this Capitol dome in moments of national mourning and national achievement. We have serv

In [15]:
# Tokenize the speech into words and tag every token with its part of speech;
# the result is a list of (token, tag) tuples.
pos = pos_tag(word_tokenize(text))
pos

[('PRESIDENT', 'NNP'),
 ('GEORGE', 'NNP'),
 ('W.', 'NNP'),
 ('BUSH', 'NNP'),
 ("'S", 'POS'),
 ('ADDRESS', 'NNP'),
 ('BEFORE', 'IN'),
 ('A', 'NNP'),
 ('JOINT', 'NNP'),
 ('SESSION', 'NNP'),
 ('OF', 'IN'),
 ('THE', 'NNP'),
 ('CONGRESS', 'NNP'),
 ('ON', 'NNP'),
 ('THE', 'NNP'),
 ('STATE', 'NNP'),
 ('OF', 'IN'),
 ('THE', 'NNP'),
 ('UNION', 'NNP'),
 ('January', 'NNP'),
 ('31', 'CD'),
 (',', ','),
 ('2006', 'CD'),
 ('THE', 'NNP'),
 ('PRESIDENT', 'NNP'),
 (':', ':'),
 ('Thank', 'NNP'),
 ('you', 'PRP'),
 ('all', 'DT'),
 ('.', '.'),
 ('Mr.', 'NNP'),
 ('Speaker', 'NNP'),
 (',', ','),
 ('Vice', 'NNP'),
 ('President', 'NNP'),
 ('Cheney', 'NNP'),
 (',', ','),
 ('members', 'NNS'),
 ('of', 'IN'),
 ('Congress', 'NNP'),
 (',', ','),
 ('members', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Supreme', 'NNP'),
 ('Court', 'NNP'),
 ('and', 'CC'),
 ('diplomatic', 'JJ'),
 ('corps', 'NN'),
 (',', ','),
 ('distinguished', 'JJ'),
 ('guests', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('fellow', 'JJ'),
 ('citizens', 'NNS'

# Lemmatization

In [16]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# The lemmatizer needs a POS tag so it can give a different output for a word depending on its context.
# The catch: pos_tag returns tags in one format (e.g. 'NNP', 'JJ'), whereas the lemmatizer expects another (e.g. 'n', 'a', 'v').

In [17]:
# Treated as an adjective (pos='a'), 'better' is reduced to its lemma 'good'.
lemmatizer.lemmatize('better' , pos = 'a')

'good'

In [18]:
# 'good' is already the lemma, so it comes back unchanged.
lemmatizer.lemmatize('good' , pos = 'a')

'good'

In [19]:
# Treated as a verb (pos='v'), 'painting' is reduced to 'paint'.
lemmatizer.lemmatize('painting' , pos = 'v')

'paint'

In [20]:
# Treated as a noun (pos='n'), 'painting' is already the lemma and comes back unchanged.
lemmatizer.lemmatize('painting' , pos = 'n')

'painting'

In [21]:
def get_simple_pos(tag):
    """Translate a ``pos_tag``-style tag into the matching ``wordnet`` POS constant.

    Args:
        tag: POS tag string such as ``'JJ'``, ``'VBD'``, ``'NNP'`` or ``'RB'``.
            Only its first character decides the result.

    Returns:
        ``wordnet.ADJ`` for tags starting with ``'J'``, ``wordnet.VERB`` for
        ``'V'``, ``wordnet.NOUN`` for ``'N'``, ``wordnet.ADV`` for ``'R'``,
        and ``wordnet.NOUN`` as the fallback for every other tag.
    """
    # NOTE(review): `wordnet` is not imported in any cell visible above this
    # definition -- confirm `from nltk.corpus import wordnet` runs before the
    # first call, otherwise this raises NameError.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_table:
        if tag.startswith(prefix):
            return simple_pos
    # Anything unrecognised is treated as a noun.
    return wordnet.NOUN