In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(u"Heal the world, make it a better place for you and for me and the entire human race")
print(doc)

Heal the world, make it a better place for you and for me and the entire human race


In [3]:
import nltk
nltk.download("averaged_perceptron_tagger")
from nltk import pos_tag
from nltk import RegexpParser

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
a = "Heal the world, make it a better place for you and for me and the entire human race"
Pos = a.split()
b = pos_tag(Pos)
b

[('Heal', 'VB'),
 ('the', 'DT'),
 ('world,', 'NNS'),
 ('make', 'VBP'),
 ('it', 'PRP'),
 ('a', 'DT'),
 ('better', 'JJR'),
 ('place', 'NN'),
 ('for', 'IN'),
 ('you', 'PRP'),
 ('and', 'CC'),
 ('for', 'IN'),
 ('me', 'PRP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('entire', 'JJ'),
 ('human', 'NN'),
 ('race', 'NN')]

In [5]:
pattern = """chunk:{<NN.?>*<VB.?>*<PRP.?>}"""
chunke = RegexpParser(pattern)
print(chunke)

chunk.RegexpParser with 1 stages:
RegexpChunkParser with 1 rules:
       <ChunkRule: '<NN.?>*<VB.?>*<PRP.?>'>


In [6]:
print(chunke.parse(b))

(S
  Heal/VB
  the/DT
  (chunk world,/NNS make/VBP it/PRP)
  a/DT
  better/JJR
  place/NN
  for/IN
  (chunk you/PRP)
  and/CC
  for/IN
  (chunk me/PRP)
  and/CC
  the/DT
  entire/JJ
  human/NN
  race/NN)


In [7]:
help(nltk.snowball)

Help on module nltk.stem.snowball in nltk.stem:

NAME
    nltk.stem.snowball - Snowball stemmers

DESCRIPTION
    This module provides a port of the Snowball stemmers
    developed by Martin Porter.
    
    There is also a demo function: `snowball.demo()`.

CLASSES
    nltk.stem.api.StemmerI(builtins.object)
        SnowballStemmer
    nltk.stem.porter.PorterStemmer(nltk.stem.api.StemmerI)
        PorterStemmer(_LanguageSpecificStemmer, nltk.stem.porter.PorterStemmer)
    _LanguageSpecificStemmer(nltk.stem.api.StemmerI)
        HungarianStemmer
        PorterStemmer(_LanguageSpecificStemmer, nltk.stem.porter.PorterStemmer)
        RussianStemmer
    _ScandinavianStemmer(_LanguageSpecificStemmer)
        DanishStemmer
        NorwegianStemmer
        SwedishStemmer
    _StandardStemmer(_LanguageSpecificStemmer)
        ArabicStemmer
        DutchStemmer
        EnglishStemmer
        FinnishStemmer
        FrenchStemmer
        GermanStemmer
        ItalianStemmer
        PortugueseSte

In [8]:
nltk.download('chat80')

[nltk_data] Downloading package chat80 to /root/nltk_data...
[nltk_data]   Unzipping corpora/chat80.zip.


True

In [9]:
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('reuters')
stop_words = stopwords.words('english')

stop_words1 = stop_words + list(punctuation)

def tokenize(text):
  words  = word_tokenize(text)
  # print(words)
  words = [w.lower() for w in words]
  return[w for w in words if w not in stop_words1 and not w.isdigit()]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


In [10]:
from nltk.corpus import reuters
print(reuters.raw('test/15000'))

TOWN AND COUNTRY JEWELRY MANUFACTURING &lt;TCJC>
  4thh qtr Feb 28
      Shr 46 cts vs 22 cts
      Net 2,139,034 vs 854,182
      Sales 30.8 mln vs 20.6 mln
      Avg shrs 5,280,854 vs 4,559,646
      Year
      Shr 1.34 dlrs vs 1.15 dlrs
      Net 5,935,117 vs 4,156,171
      Sales 107.2 mln vs 71.6 mln
      Avg shrs 5,281,387 vs 3,616,183
      NOTE: Town and Country Jewelry Manufacturing Corp.
  




In [11]:
import random
random.seed(9)
vocab = set()
for field_id in reuters.fileids():
  words = tokenize(reuters.raw(field_id))
  vocab.update(words)

vocab = list(vocab)
word_index = {w:index for index,w in enumerate(vocab)}
# print("word_index:", word_index)

VOCAB_SIZE = len(vocab)
DOC_size = len(reuters.fileids())

print(VOCAB_SIZE, DOC_size)
print("word_index:", word_index["outokumpu's"])

51516 10788
word_index: 21440


In [12]:
import numpy as np

word_idf = np.zeros(VOCAB_SIZE)
for file_id in reuters.fileids():
  words = set(tokenize(reuters.raw(file_id)))
  indexes = [word_index[word] for word in words]
  # print("indexes:", indexes)
  # print("word_idf[indexes]:", word_idf[indexes])
  word_idf[indexes] += 1.0
  print("word_idf:", word_idf)

word_idf = np.log(DOC_size/ (1 + word_idf).astype(float))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1.  3.  4. ... 24.  4.  0.]
word_idf: [ 1. 

In [13]:
print(word_idf[word_index['agreeeement']])

8.593042503699674


In [14]:
print(word_idf)

[8.5930425  7.34027954 7.08896511 ... 5.33494597 7.20674814 8.5930425 ]


In [15]:
a = np.array([1,2,3,4])

In [16]:
abc = np.log(1500/a)

In [17]:
print(abc)

[7.31322039 6.62007321 6.2146081  5.92692603]


In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [19]:
text = "Make it a better better better place, for you and for me and the entire human race"
t = Tokenizer()
t.fit_on_texts(text)
print(t)
text1 = "for you and for me and the entire human race"
seq = t.texts_to_sequences(text1)

print("sequences : ",seq,'\n')

print("word_index : ",t.word_index)

<keras_preprocessing.text.Tokenizer object at 0x7fc5cde624d0>
sequences :  [[11], [8], [4], [], [18], [8], [12], [], [3], [5], [13], [], [11], [8], [4], [], [6], [1], [], [3], [5], [13], [], [2], [14], [1], [], [1], [5], [2], [9], [4], [1], [], [14], [12], [6], [3], [5], [], [4], [3], [10], [1]] 

word_index :  {'e': 1, 't': 2, 'a': 3, 'r': 4, 'n': 5, 'm': 6, 'b': 7, 'o': 8, 'i': 9, 'c': 10, 'f': 11, 'u': 12, 'd': 13, 'h': 14, 'k': 15, 'p': 16, 'l': 17, 'y': 18}


In [22]:
a = 1500
b = 1400
c = np.log (a/b)

In [23]:
c

0.06899287148695142