In [1]:
import nltk

In [2]:
# Sample text that the rest of the notebook tokenizes into sentences and
# turns into a bag-of-words matrix. Bracketed markers such as "[4]" are left
# in the raw text; the cleaning loops later replace every non-letter
# character with a space.
paragraph = """The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and 
statistical natural language processing (NLP) for English written in the Python programming language. It supports 
classification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities.[4] It was developed 
by Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.
[5] NLTK includes graphical demonstrations and sample data. It is accompanied by a book that explains the underlying 
concepts behind the language processing tasks supported by the toolkit,[6] plus a cookbook.[7]

NLTK is intended to support research and teaching in NLP or closely related areas, including empirical linguistics, 
cognitive science, artificial intelligence, information retrieval, and machine learning.[8] NLTK has been used 
successfully as a teaching tool, as an individual study tool, and as a platform for prototyping and building research 
systems. There are 32 universities in the US and 25 countries using NLTK in their courses.
"""

In [10]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [11]:
# Word normalizers: one stems, the other lemmatizes.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Split the raw paragraph into sentences; each sentence is treated as one document.
sentences = nltk.sent_tokenize(paragraph)

# Accumulator for the cleaned sentences.
corpus = list()

In [12]:
# Show the sentences produced by nltk.sent_tokenize(paragraph).
sentences

['The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and \nstatistical natural language processing (NLP) for English written in the Python programming language.',
 'It supports \nclassification, tokenization, stemming, tagging, parsing, and semantic reasoning functionalities.',
 '[4] It was developed \nby Steven Bird and Edward Loper in the Department of Computer and Information Science at the University of Pennsylvania.',
 '[5] NLTK includes graphical demonstrations and sample data.',
 'It is accompanied by a book that explains the underlying \nconcepts behind the language processing tasks supported by the toolkit,[6] plus a cookbook.',
 '[7]\n\nNLTK is intended to support research and teaching in NLP or closely related areas, including empirical linguistics, \ncognitive science, artificial intelligence, information retrieval, and machine learning.',
 '[8] NLTK has been used \nsuccessfully as a teaching tool, as an individual study t

In [13]:
# Build the stemmed corpus: one cleaned, space-joined string per sentence.
# The stop-word set is built once here rather than once per word, and
# `corpus` is reset in this cell so that re-running it does not append
# duplicate entries.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Replace every non-letter (digits, punctuation, "[4]"-style markers) with
    # a space, then lower-case and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [stemmer.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))

In [14]:
# Show the stemmed corpus (one cleaned string per sentence).
corpus

['natur languag toolkit commonli nltk suit librari program symbol statist natur languag process nlp english written python program languag',
 'support classif token stem tag pars semant reason function',
 'develop steven bird edward loper depart comput inform scienc univers pennsylvania',
 'nltk includ graphic demonstr sampl data',
 'accompani book explain underli concept behind languag process task support toolkit plu cookbook',
 'nltk intend support research teach nlp close relat area includ empir linguist cognit scienc artifici intellig inform retriev machin learn',
 'nltk use success teach tool individu studi tool platform prototyp build research system',
 'univers us countri use nltk cours']

In [15]:
# Rebuild the corpus using lemmatization instead of stemming: one cleaned,
# space-joined string per sentence. The stop-word set is built once here
# rather than once per word.
# NOTE(review): lemmatize() is called without a part-of-speech argument; in
# the recorded output verbs such as "includes" are left unchanged and "us"
# is reduced to "u". Pass a POS tag if that matters downstream.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentences:
    # Replace every non-letter with a space, then lower-case and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words and lemmatize what remains.
    lemmas = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [16]:
# Show the lemmatized corpus (one cleaned string per sentence).
corpus

['natural language toolkit commonly nltk suite library program symbolic statistical natural language processing nlp english written python programming language',
 'support classification tokenization stemming tagging parsing semantic reasoning functionality',
 'developed steven bird edward loper department computer information science university pennsylvania',
 'nltk includes graphical demonstration sample data',
 'accompanied book explains underlying concept behind language processing task supported toolkit plus cookbook',
 'nltk intended support research teaching nlp closely related area including empirical linguistics cognitive science artificial intelligence information retrieval machine learning',
 'nltk used successfully teaching tool individual study tool platform prototyping building research system',
 'university u country using nltk course']

In [19]:
# Bag-of-words model: turn each cleaned sentence in `corpus` into a row of token counts.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
counts_sparse = cv.fit_transform(corpus)  # learn the vocabulary and count tokens per sentence
X = counts_sparse.toarray()               # convert the result to a dense array

In [20]:
# Show the dense bag-of-words count matrix.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 2, 1, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0

In [21]:
# Number of sentences produced by the tokenizer.
len(sentences)

8

In [22]:
# Number of rows in the count matrix (compare with the sentence count above).
X.shape[0]

8

In [23]:
# Number of columns in the count matrix, i.e. the length of each sentence's count vector.
X.shape[1]

78