In [1]:
import nltk

# sentence tokenization

In [2]:
text = "Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
sentences = nltk.sent_tokenize(text)

In [3]:
sentences

['Backgammon is one of the oldest known board games.',
 'Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.',
 'It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.']

In [4]:
for sentence in sentences:
     print(sentence)

Backgammon is one of the oldest known board games.
Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.
It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.


# word tokenization

In [5]:
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']
['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']
['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']


In [6]:
from nltk.corpus import stopwords
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yamuna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
print(sentence)

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.


# stemming

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
ps = PorterStemmer() 
    
for w in words:
    print(w, " : ", ps.stem(w))

It  :  It
is  :  is
a  :  a
two  :  two
player  :  player
game  :  game
where  :  where
each  :  each
player  :  player
has  :  ha
fifteen  :  fifteen
checkers  :  checker
which  :  which
move  :  move
between  :  between
twenty-four  :  twenty-four
points  :  point
according  :  accord
to  :  to
the  :  the
roll  :  roll
of  :  of
two  :  two
dice  :  dice
.  :  .


# lematization & POS Tagging

In [10]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yamuna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
print(nltk.pos_tag(nltk.word_tokenize(sentence)))          


[('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('two', 'CD'), ('player', 'NN'), ('game', 'NN'), ('where', 'WRB'), ('each', 'DT'), ('player', 'NN'), ('has', 'VBZ'), ('fifteen', 'VBN'), ('checkers', 'NNS'), ('which', 'WDT'), ('move', 'VBP'), ('between', 'IN'), ('twenty-four', 'NN'), ('points', 'NNS'), ('according', 'VBG'), ('to', 'TO'), ('the', 'DT'), ('roll', 'NN'), ('of', 'IN'), ('two', 'CD'), ('dice', 'NNS'), ('.', '.')]


In [12]:
import nltk
from nltk.stem import WordNetLemmatizer 

In [13]:
sentence = "Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East."
word_list = nltk.word_tokenize(sentence)
print(word_list)
# Lemmatize list of words and join
lemmatized_output = '.join([lemmatizer.lemmatize(w) for w in word_list])'
print(lemmatized_output)

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']
.join([lemmatizer.lemmatize(w) for w in word_list])
