Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited

In [12]:
import nltk
nltk.download('reuters') # Downloading corpus
nltk.download('stopwords') # Downloading stopwords
nltk.download('punkt') # Downloading tokenizer

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk.corpus import reuters

In [14]:
print(reuters.raw('test/15000')) # Example

TOWN AND COUNTRY JEWELRY MANUFACTURING &lt;TCJC>
  4thh qtr Feb 28
      Shr 46 cts vs 22 cts
      Net 2,139,034 vs 854,182
      Sales 30.8 mln vs 20.6 mln
      Avg shrs 5,280,854 vs 4,559,646
      Year
      Shr 1.34 dlrs vs 1.15 dlrs
      Net 5,935,117 vs 4,156,171
      Sales 107.2 mln vs 71.6 mln
      Avg shrs 5,281,387 vs 3,616,183
      NOTE: Town and Country Jewelry Manufacturing Corp.
  




In [0]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
 
stop_words = stopwords.words('english') + list(punctuation)
 
def tokenize(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words and not w.isdigit()]

In [16]:
# build the vocabulary in one pass
vocabulary = set()
for file_id in reuters.fileids():
    words = tokenize(reuters.raw(file_id))
    vocabulary.update(words)
 
vocabulary = list(vocabulary)
word_index = {w: index for index, w in enumerate(vocabulary)}
 
VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(reuters.fileids())
 
print(VOCABULARY_SIZE, DOCUMENTS_COUNT)

51558 10788


In [17]:
import numpy as np

word_doc_count = np.zeros(VOCABULARY_SIZE)
for file_id in reuters.fileids():
    words = set(tokenize(reuters.raw(file_id)))
    indexes = [word_index[word] for word in words]
    word_doc_count[indexes] += 1.0
 
word_idf = np.log(DOCUMENTS_COUNT / (1 + word_doc_count).astype(float))

print(word_idf[word_index['town']])
print(word_idf[word_index['jewelry']])

6.88829441146125
7.494430215031565


In [18]:
print(word_idf[word_index['sales']])

2.1481166502152735


In [0]:
from six import string_types
 
def word_tf(word, document): 
    return float(document.count(word)) / len(document)
 
def tf_idf(word, document):
    document = tokenize(document)
 
    if word not in word_index:
        return .0
 
    return word_tf(word, document) * word_idf[word_index[word]]

In [20]:
print(tf_idf('jewelry', reuters.raw('test/15000')))

0.26765822196541306


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
tfidf = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, vocabulary=vocabulary)
 
# Fit the TfIdf model
tfidf.fit([reuters.raw(file_id) for file_id in reuters.fileids()])
 
# Transform a document into TfIdf coordinates
X = tfidf.transform([reuters.raw('test/15000')])

  'stop_words.' % sorted(inconsistent))


In [22]:
print(tf_idf('jewelry', reuters.raw('test/15000')))

0.26765822196541306
