In [2]:
from nltk.corpus import reuters
from nltk.tokenize import sent_tokenize, word_tokenize

## The NLTK Reuters corpus

In [6]:
# The reuters corpus includes over 10,000 news articles, many of which are about financial markets.
# These articles are tagged by topic, or category.
import nltk
# Fetch the 'reuters' package into the local nltk_data directory so the
# `reuters` corpus reader imported above has data to read.
nltk.download('reuters')

# Print the list of category labels used to tag the articles.
print(reuters.categories())

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\TribThapa\AppData\Roaming\nltk_data...


['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [19]:
# We'll find the first article about crude oil.
# fileids(categories='crude') is restricted to the 'crude' category;
# index 0 takes the first id it returns.
oil_article1 = reuters.fileids(categories='crude')[0]

# Display the selected file id.
oil_article1

'test/14829'

In [17]:
# Likewise, take the second file id in the 'crude' category (index 1).
oil_article2 = reuters.fileids(categories='crude')[1]

# Display the selected file id.
oil_article2

'test/15063'

In [20]:
# Load the raw text of the first crude-oil article; later cells treat
# `article1` as a plain string (e.g. calling .split on it).
article1 = reuters.raw(oil_article1)

# Print the article so we can see the text we will be tokenizing.
print(article1)

JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's electric power
  in the fiscal year ended March

In [21]:
# Load the raw text of the second crude-oil article.
article2 = reuters.raw(oil_article2)

# Print it for comparison with the first article.
print(article2)

ENERGY/U.S. PETROCHEMICAL INDUSTRY
  Cheap oil feedstocks, the weakened U.S.
  dollar and a plant utilization rate approaching 90 pct will
  propel the streamlined U.S. petrochemical industry to record
  profits this year, with growth expected through at least 1990,
  major company executives predicted.
      This bullish outlook for chemical manufacturing and an
  industrywide move to shed unrelated businesses has prompted GAF
  Corp &lt;GAF>, privately-held Cain Chemical Inc, and other firms
  to aggressively seek acquisitions of petrochemical plants.
      Oil companies such as Ashland Oil Inc &lt;ASH>, the
  Kentucky-based oil refiner and marketer, are also shopping for
  money-making petrochemical businesses to buy.
      "I see us poised at the threshold of a golden period," said
  Paul Oreffice, chairman of giant Dow Chemical Co &lt;DOW>, adding,
  "There's no major plant capacity being added around the world
  now. The whole game is bringing out new products and improving
  the

## Tokenizing with string splits

In [22]:
# Simple sentence tokenizing with string split.
# str.split('.') cuts at every period: the periods themselves are dropped and
# each piece after the first keeps the newline and indentation that followed
# the period (visible in the output below).

article1.split('.')

['JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS\n  The Ministry of International Trade and\n  Industry (MITI) will revise its long-term energy supply/demand\n  outlook by August to meet a forecast downtrend in Japanese\n  energy demand, ministry officials said',
 '\n      MITI is expected to lower the projection for primary energy\n  supplies in the year 2000 to 550 mln kilolitres (kl) from 600\n  mln, they said',
 '\n      The decision follows the emergence of structural changes in\n  Japanese industry following the rise in the value of the yen\n  and a decline in domestic electric power demand',
 '\n      MITI is planning to work out a revised energy supply/demand\n  outlook through deliberations of committee meetings of the\n  Agency of Natural Resources and Energy, the officials said',
 '\n      They said MITI will also review the breakdown of energy\n  supply sources, including oil, nuclear, coal and natural gas',
 "\n      Nuclear energy provided the bulk of Japan's electric 

In [25]:
# Word tokenizing with string split.

# Use the text before the first '.' as our example "sentence".
sent = article1.split('.')[0]

# Splitting on a single space leaves '\n' attached to words, produces ''
# entries where two spaces are adjacent, and keeps punctuation glued to
# tokens (e.g. '(MITI)' and 'demand,').
print(sent.split(' '))

['JAPAN', 'TO', 'REVISE', 'LONG-TERM', 'ENERGY', 'DEMAND', 'DOWNWARDS\n', '', 'The', 'Ministry', 'of', 'International', 'Trade', 'and\n', '', 'Industry', '(MITI)', 'will', 'revise', 'its', 'long-term', 'energy', 'supply/demand\n', '', 'outlook', 'by', 'August', 'to', 'meet', 'a', 'forecast', 'downtrend', 'in', 'Japanese\n', '', 'energy', 'demand,', 'ministry', 'officials', 'said']


In [24]:
# Repeat the naive word split on the second '.'-delimited piece.
# Note: this rebinds `sent`, so when the notebook is run top to bottom any
# later cell that reads `sent` sees this second piece, not the first one.
sent = article1.split('.')[1]

# The leading newline and indentation of the piece show up as '\n' and ''
# entries at the start of the list.
print(sent.split(' '))

['\n', '', '', '', '', '', 'MITI', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy\n', '', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres', '(kl)', 'from', '600\n', '', 'mln,', 'they', 'said']


## NLTK tokenization

In [27]:
# Now using NLTK's sent_tokenize function.
# sent_tokenize (and word_tokenize, used in a later cell) rely on the
# pre-trained Punkt sentence model. Older NLTK releases load it from the
# 'punkt' package, while newer releases (3.9+) look for 'punkt_tab' instead
# and raise LookupError if it is missing, so fetch both to keep this cell
# working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Unlike the '.'-split above, each sentence keeps its final period and the
# leading newline/indentation is stripped.
sent_tokenize(article1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TribThapa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS\n  The Ministry of International Trade and\n  Industry (MITI) will revise its long-term energy supply/demand\n  outlook by August to meet a forecast downtrend in Japanese\n  energy demand, ministry officials said.',
 'MITI is expected to lower the projection for primary energy\n  supplies in the year 2000 to 550 mln kilolitres (kl) from 600\n  mln, they said.',
 'The decision follows the emergence of structural changes in\n  Japanese industry following the rise in the value of the yen\n  and a decline in domestic electric power demand.',
 'MITI is planning to work out a revised energy supply/demand\n  outlook through deliberations of committee meetings of the\n  Agency of Natural Resources and Energy, the officials said.',
 'They said MITI will also review the breakdown of energy\n  supply sources, including oil, nuclear, coal and natural gas.',
 "Nuclear energy provided the bulk of Japan's electric power\n  in the fiscal year ended M

In [28]:
# ...and the word_tokenize function.
# Tokenize the first '.'-delimited sentence explicitly rather than reading
# `sent`: an earlier cell rebinds `sent` to the second piece, so on a
# top-to-bottom run (Restart & Run All) `sent` would no longer hold the
# sentence shown in the output below.
# Compared with the naive space split, punctuation such as '(' , ')' and ','
# becomes separate tokens and the newline/empty-string artifacts disappear.
print(word_tokenize(article1.split('.')[0]))

['JAPAN', 'TO', 'REVISE', 'LONG-TERM', 'ENERGY', 'DEMAND', 'DOWNWARDS', 'The', 'Ministry', 'of', 'International', 'Trade', 'and', 'Industry', '(', 'MITI', ')', 'will', 'revise', 'its', 'long-term', 'energy', 'supply/demand', 'outlook', 'by', 'August', 'to', 'meet', 'a', 'forecast', 'downtrend', 'in', 'Japanese', 'energy', 'demand', ',', 'ministry', 'officials', 'said']
