In [1]:
!pip install nltk



In [2]:
paragraph = """
A short story is a piece of prose fiction.
It can typically be read in a single sitting and focuses on a self-contained incident or series of linked incidents, with the intent of evoking a single effect or mood. The short story is one of the oldest types of literature and has existed in the form of legends, mythic tales, folk tales, fairy tales, tall tales, fables, and anecdotes in various ancient communities around the world. The modern short story developed in the early 19th century
"""

In [3]:
paragraph

'\nA short story is a piece of prose fiction.\nIt can typically be read in a single sitting and focuses on a self-contained incident or series of linked incidents, with the intent of evoking a single effect or mood. The short story is one of the oldest types of literature and has existed in the form of legends, mythic tales, folk tales, fairy tales, tall tales, fables, and anecdotes in various ancient communities around the world. The modern short story developed in the early 19th century\n'

In [4]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [5]:
# tokenization -- paragraph to sentence words
nltk.download('punkt')
sentences = nltk.sent_tokenize(paragraph)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
sentences

['\nA short story is a piece of prose fiction.',
 'It can typically be read in a single sitting and focuses on a self-contained incident or series of linked incidents, with the intent of evoking a single effect or mood.',
 'The short story is one of the oldest types of literature and has existed in the form of legends, mythic tales, folk tales, fairy tales, tall tales, fables, and anecdotes in various ancient communities around the world.',
 'The modern short story developed in the early 19th century']

In [7]:
stemmer = PorterStemmer() # object creation
stemmer.stem('Going')

'go'

In [8]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('going')

[nltk_data] Downloading package wordnet to /root/nltk_data...


'going'

In [9]:
# clean paragraph, remove special characters and lower them
import re
corpus=[]
for i in range(len(sentences)):
  review = re.sub('[^a-zA-Z]',' ',sentences[i])
  review = review.lower()
  corpus.append(review)

In [10]:
corpus

[' a short story is a piece of prose fiction ',
 'it can typically be read in a single sitting and focuses on a self contained incident or series of linked incidents  with the intent of evoking a single effect or mood ',
 'the short story is one of the oldest types of literature and has existed in the form of legends  mythic tales  folk tales  fairy tales  tall tales  fables  and anecdotes in various ancient communities around the world ',
 'the modern short story developed in the early   th century']

In [11]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
## stemming
for i in corpus:
  words = nltk.word_tokenize(i)
  for word in words:
    if word not in set(stopwords.words('english')):
      print(stemmer.stem(word))


short
stori
piec
prose
fiction
typic
read
singl
sit
focus
self
contain
incid
seri
link
incid
intent
evok
singl
effect
mood
short
stori
one
oldest
type
literatur
exist
form
legend
mythic
tale
folk
tale
fairi
tale
tall
tale
fabl
anecdot
variou
ancient
commun
around
world
modern
short
stori
develop
earli
th
centuri


In [13]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
## Lemmatization
for i in corpus:
  words = nltk.word_tokenize(i)
  for word in words:
    if word not in set(stopwords.words('english')):
      print(lemmatizer.lemmatize(word))


short
story
piece
prose
fiction
typically
read
single
sitting
focus
self
contained
incident
series
linked
incident
intent
evoking
single
effect
mood
short
story
one
oldest
type
literature
existed
form
legend
mythic
tale
folk
tale
fairy
tale
tall
tale
fable
anecdote
various
ancient
community
around
world
modern
short
story
developed
early
th
century


In [15]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [16]:
x = cv.fit_transform(corpus)

In [17]:
cv.vocabulary_

{'short': 43,
 'story': 46,
 'is': 25,
 'piece': 38,
 'of': 33,
 'prose': 39,
 'fiction': 16,
 'it': 26,
 'can': 5,
 'typically': 52,
 'be': 4,
 'read': 40,
 'in': 21,
 'single': 44,
 'sitting': 45,
 'and': 1,
 'focuses': 17,
 'on': 35,
 'self': 41,
 'contained': 8,
 'incident': 22,
 'or': 37,
 'series': 42,
 'linked': 28,
 'incidents': 23,
 'with': 54,
 'the': 50,
 'intent': 24,
 'evoking': 12,
 'effect': 11,
 'mood': 31,
 'one': 36,
 'oldest': 34,
 'types': 51,
 'literature': 29,
 'has': 20,
 'existed': 13,
 'form': 19,
 'legends': 27,
 'mythic': 32,
 'tales': 47,
 'folk': 18,
 'fairy': 15,
 'tall': 48,
 'fables': 14,
 'anecdotes': 2,
 'various': 53,
 'ancient': 0,
 'communities': 7,
 'around': 3,
 'world': 55,
 'modern': 30,
 'developed': 9,
 'early': 10,
 'th': 49,
 'century': 6}

In [18]:
corpus[0]

' a short story is a piece of prose fiction '

In [19]:
x[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [20]:
# Bag of words without stopwords
ans = []
for i in range(len(sentences)):
  review = re.sub('[^a-zA-Z]',' ',sentences[i])
  review = review.lower()
  review = review.split()
  review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
  ans.append(review)

In [21]:
ans

[['short', 'story', 'piece', 'prose', 'fiction'],
 ['typically',
  'read',
  'single',
  'sitting',
  'focus',
  'self',
  'contained',
  'incident',
  'series',
  'linked',
  'incident',
  'intent',
  'evoking',
  'single',
  'effect',
  'mood'],
 ['short',
  'story',
  'one',
  'oldest',
  'type',
  'literature',
  'existed',
  'form',
  'legend',
  'mythic',
  'tale',
  'folk',
  'tale',
  'fairy',
  'tale',
  'tall',
  'tale',
  'fable',
  'anecdote',
  'various',
  'ancient',
  'community',
  'around',
  'world'],
 ['modern', 'short', 'story', 'developed', 'early', 'th', 'century']]

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
x = cv.fit_transform(corpus)

In [27]:
corpus[0]

' a short story is a piece of prose fiction '

In [28]:
x[0].toarray() # vector creation

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.45436601, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.35822738, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.29001594, 0.        ,
        0.        , 0.        , 0.        , 0.45436601, 0.45436601,
        0.        , 0.        , 0.        , 0.29001594, 0.        ,
        0.        , 0.29001594, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [30]:
# Trigram or ngrams
cv2 = TfidfVectorizer(ngram_range = (3,3))
x2 = cv.fit_transform(corpus)

In [32]:
x2[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.45436601, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.35822738, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.29001594, 0.        ,
        0.        , 0.        , 0.        , 0.45436601, 0.45436601,
        0.        , 0.        , 0.        , 0.29001594, 0.        ,
        0.        , 0.29001594, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [34]:
# make tfidf for top 3 max features words
cv2 = TfidfVectorizer(ngram_range = (3,3),max_features=3)
x2 = cv.fit_transform(corpus)

In [36]:
x2[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.45436601, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.35822738, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.29001594, 0.        ,
        0.        , 0.        , 0.        , 0.45436601, 0.45436601,
        0.        , 0.        , 0.        , 0.29001594, 0.        ,
        0.        , 0.29001594, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])