### Tokenize text into sentences

In [1]:
para = "Hi! My name is Jane. What's your name?"
from nltk.tokenize import sent_tokenize
sent_tokenize(para)

['Hi!', 'My name is Jane.', "What's your name?"]

### Tokenize sentences into words

In [2]:
from nltk.tokenize import word_tokenize #选择这个方法
word_tokenize('Hello world.')

['Hello', 'world', '.']

In [3]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('hello world?')

['hello', 'world', '?']

In [5]:
import nltk
text = "Hello, my friend."
pattern = r"\w+|[^\w\s]+"

In [8]:
print (nltk.tokenize.regexp_tokenize(text,pattern))

['Hello', ',', 'my', 'friend', '.']


### Tokenize sentences using regular expressions

In [10]:
from nltk.tokenize import RegexpTokenizer 
tokenizer = RegexpTokenizer("[\w']+")
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction']

In [11]:
from nltk.tokenize import RegexpTokenizer 
tokenizer = RegexpTokenizer("[\w]+") 
#这里没有了"’"，所以can't分开了
tokenizer.tokenize("Can't is a contraction.")

['Can', 't', 'is', 'a', 'contraction']

In [14]:
from nltk.tokenize import regexp_tokenize
regexp_tokenize("I won't be there!","[\w']") #少了+

['I', 'w', 'o', 'n', "'", 't', 'b', 'e', 't', 'h', 'e', 'r', 'e']

In [15]:
# 这个方法比较简单
from nltk.tokenize import regexp_tokenize
regexp_tokenize("I won't be there!","[\w']+")

['I', "won't", 'be', 'there']

### Training a sentence tokenizer

In [28]:
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import sent_tokenize
#from nltk.corpus import webtext
text ="""White guy: So, do you have any plans for this evening?
Girl: But you already have a Big Mac...
Hobo: Oh, this is all!
"""
sent_tokenizer=PunktSentenceTokenizer(text)
sent1=sent_tokenizer.tokenize(text)

sent2=sent_tokenize(text)

In [29]:
sent1[1]

'Girl: But you already have a Big Mac...\nHobo: Oh, this is all!\n'

In [30]:
sent2[1]

'Girl: But you already have a Big Mac...\nHobo: Oh, this is all!\n'

In [31]:
sent1[0]

'White guy: So, do you have any plans for this evening?'

In [32]:
sent2[0]

'White guy: So, do you have any plans for this evening?'

In [33]:
sent1[2]

IndexError: list index out of range

### Filtering stopwords in a tokenized sentence

In [34]:
from nltk.corpus import stopwords
english_stops=set(stopwords.words('english'))
words = ['I',"Won't",'Be','There']
[word for word in words if word not in english_stops]

['I', "Won't", 'Be', 'There']

In [35]:
from nltk.corpus import stopwords
english_stops=set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops]

["Can't", 'contraction']

In [36]:
from nltk.corpus import stopwords
english_stops=set(stopwords.words('english'))
words = ['I',"have",'$3']
[word for word in words if word not in english_stops]

['I', '$3']

### Look up synsets 同义词集

In [37]:
from nltk.corpus import wordnet
syn=wordnet.synsets('cookbook')[0]
syn.name()

'cookbook.n.01'

In [39]:
syn.definition()

'a book of recipes and cooking directions'

In [42]:
from nltk.corpus import wordnet
syn=wordnet.synsets('motorcar')[0]
syn.name()

'car.n.01'

In [43]:
# wordnet 层级结构
from nltk.corpus import wordnet as wn
motorcar = wn.synset('car.n.01')
types_of_motorcar=motorcar.hyponyms()
types_of_motorcar

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [47]:
from nltk.corpus import wordnet as wn
wn.synset('motorcar.n.01').hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [44]:
# 部分整体
from nltk.corpus import wordnet as wn
wn.synset('tree.n.01').part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [53]:
# 反义词
from nltk.corpus import wordnet as wn
wn.lemma('short.a.01.short').antonyms()

[Lemma('long.a.01.long')]

In [3]:
from nltk.corpus import wordnet as wn
dir(wn.synset('beautiful.a.01'))

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_hypernyms',
 '_definition',
 '_examples',
 '_frame_ids',
 '_hypernyms',
 '_instance_hypernyms',
 '_iter_hypernym_lists',
 '_lemma_names',
 '_lemma_pointers',
 '_lemmas',
 '_lexname',
 '_max_depth',
 '_min_depth',
 '_name',
 '_needs_root',
 '_offset',
 '_pointers',
 '_pos',
 '_related',
 '_shortest_hypernym_paths',
 '_wordnet_corpus_reader',
 'also_sees',
 'attributes',
 'causes',
 'closure',
 'common_hypernyms',
 'definition',
 'entailments',
 'examples',
 'frame_ids',
 'hypernym_distances',
 'hypernym_paths',
 'hypernyms',
 'hyponyms',
 'instance_hypernyms',
 'instance_hyponyms',
 'jcn_similarity',
 'lch_si

In [6]:
from nltk.corpus import wordnet
syn=wordnet.synsets('motorcar')[0]
syn.pos()

'n'

In [7]:
from nltk.corpus import wordnet as wn
wn.synset('brave.a.01').lemma_names()

['brave', 'courageous']

In [9]:
from nltk.corpus import wordnet as wn
wn.synset('brave').lemma_names()

ValueError: not enough values to unpack (expected 3, got 1)

In [8]:
wn.synset('brave.a.01').definition()

'possessing or displaying courage; able to face and deal with danger or fear without flinching'

In [14]:
dir(wn.synset)

['__call__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__func__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__self__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [15]:
dir(wn.synset('brave.a.01'))

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_hypernyms',
 '_definition',
 '_examples',
 '_frame_ids',
 '_hypernyms',
 '_instance_hypernyms',
 '_iter_hypernym_lists',
 '_lemma_names',
 '_lemma_pointers',
 '_lemmas',
 '_lexname',
 '_max_depth',
 '_min_depth',
 '_name',
 '_needs_root',
 '_offset',
 '_pointers',
 '_pos',
 '_related',
 '_shortest_hypernym_paths',
 '_wordnet_corpus_reader',
 'also_sees',
 'attributes',
 'causes',
 'closure',
 'common_hypernyms',
 'definition',
 'entailments',
 'examples',
 'frame_ids',
 'hypernym_distances',
 'hypernym_paths',
 'hypernyms',
 'hyponyms',
 'instance_hypernyms',
 'instance_hyponyms',
 'jcn_similarity',
 'lch_si

### calculating wordnet synset similarity

In [16]:
from nltk.corpus import wordnet as wn
wn.synset('right_whale.n.01').definition

<bound method Synset.definition of Synset('right_whale.n.01')>

In [17]:
from nltk.corpus import wordnet as wn
wn.synset('right_whale.n.01').definition()

"large Arctic whalebone whale; allegedly the `right' whale to hunt because of its valuable whalebone and oil"

In [18]:
wn.synset('minke_whale.n.01').definition()

'small finback of coastal waters of Atlantic and Pacific'

In [19]:
wn.synset('whale.n.01').lemmas()

[Lemma('giant.n.04.giant'),
 Lemma('giant.n.04.hulk'),
 Lemma('giant.n.04.heavyweight'),
 Lemma('giant.n.04.whale')]

In [25]:
help(wn.synset.lemmas)

AttributeError: 'function' object has no attribute 'lemmas'

In [26]:
help(wn.synset('whale.n.01').lemmas) #lemma词根、词元

Help on method lemmas in module nltk.corpus.reader.wordnet:

lemmas(lang='eng') method of nltk.corpus.reader.wordnet.Synset instance
    Return all the lemma objects associated with the synset



In [27]:
wn.synset('verify.v.01').hyponyms()

[Synset('check.v.06'), Synset('check.v.22'), Synset('see.v.10')]

In [28]:
wn.synset('verify.v.01').definition()

'confirm the truth of'

In [29]:
wn.synset('verify.v.02').definition()

'check or regulate (a scientific experiment) by conducting a parallel experiment or comparing with another standard'

In [30]:
wn.synset('verify').definition()

ValueError: not enough values to unpack (expected 3, got 1)

In [33]:
right = wn.synset('right_whale.n.01')
minke = wn.synset('minke_whale.n.01')
right.path_similarity(minke)

0.25

In [36]:
g=wn.synset('big.a.01')
h=wn.synset('large.a.01')
print (g.path_similarity(h))

1.0


In [37]:
a=wn.synset('apple.n.01')
b=wn.synset('bear.n.01')
a.path_similarity(b)

0.058823529411764705

In [38]:
a=wn.synset('apple.n.01')
b=wn.synset('bear.n.01')
a.wup_similarity(b)

0.3333333333333333

In [39]:
f=wn.synset('fruit.n.01')
a.path_similarity(f)

0.3333333333333333

In [40]:
f=wn.synset('fruit.n.01')
a.wup_similarity(f)

0.9

In [41]:
nb=wn.synset('notebook.n.01')
ib=wn.synset('instruction_book.n.01')
nb.wup_similarity(ib)

0.6666666666666666

### discovering word collocations

In [42]:
from nltk import bigrams
a="Beijing Language and Culture University"
tokens=a.split()
bigrams(tokens)

<generator object bigrams at 0x0000029CA4615B48>

In [2]:
from nltk import bigrams
a="Beijing Language and Culture University"
tokens=a.split()
tokens

['Beijing', 'Language', 'and', 'Culture', 'University']

In [3]:
bigrams(tokens)

<generator object bigrams at 0x000001F70B6D5888>

In [4]:
list(bigrams(tokens)) #这里必须要转化成list

[('Beijing', 'Language'),
 ('Language', 'and'),
 ('and', 'Culture'),
 ('Culture', 'University')]

### 词频统计

In [5]:
from nltk.book import text1

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [6]:
from nltk import FreqDist

In [7]:
fdist1=FreqDist(text1)

In [8]:
print (fdist1)

<FreqDist with 19317 samples and 260819 outcomes>


In [9]:
fdist1

FreqDist({'palmy': 3,
          'ditto': 1,
          'trace': 5,
          'shrunk': 1,
          'swamped': 1,
          '1840': 1,
          'relied': 1,
          'Socratic': 1,
          'insanity': 6,
          'crackled': 1,
          'tonic': 1,
          'chronically': 1,
          'Knights': 2,
          'terse': 1,
          'BACK': 5,
          'folds': 6,
          'tidiness': 1,
          'needs': 20,
          'journeyman': 1,
          'bay': 4,
          'accounted': 9,
          'listened': 1,
          'Gods': 3,
          'compassion': 1,
          'frontier': 1,
          'cleats': 1,
          'enlisting': 1,
          'ruddy': 3,
          'temperate': 3,
          'laughing': 4,
          'floated': 18,
          'jointed': 1,
          'playful': 3,
          'confabulations': 1,
          'religionists': 1,
          'expressions': 2,
          'eider': 1,
          'muttered': 17,
          'fiery': 23,
          'woody': 1,
          'peering': 7,
          

In [10]:
fdist1.most_common(50)

[(',', 18713),
 ('the', 13721),
 ('.', 6862),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 (';', 4072),
 ('in', 3916),
 ('that', 2982),
 ("'", 2684),
 ('-', 2552),
 ('his', 2459),
 ('it', 2209),
 ('I', 2124),
 ('s', 1739),
 ('is', 1695),
 ('he', 1661),
 ('with', 1659),
 ('was', 1632),
 ('as', 1620),
 ('"', 1478),
 ('all', 1462),
 ('for', 1414),
 ('this', 1280),
 ('!', 1269),
 ('at', 1231),
 ('by', 1137),
 ('but', 1113),
 ('not', 1103),
 ('--', 1070),
 ('him', 1058),
 ('from', 1052),
 ('be', 1030),
 ('on', 1005),
 ('so', 918),
 ('whale', 906),
 ('one', 889),
 ('you', 841),
 ('had', 767),
 ('have', 760),
 ('there', 715),
 ('But', 705),
 ('or', 697),
 ('were', 680),
 ('now', 646),
 ('which', 640),
 ('?', 637),
 ('me', 627),
 ('like', 624)]

In [11]:
dir(fdist1) #频率排名前五十

['B',
 'N',
 'Nr',
 '__add__',
 '__and__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__init__',
 '__ior__',
 '__isub__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__missing__',
 '__module__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_cumulative_frequencies',
 '_keep_positive',
 'clear',
 'copy',
 'elements',
 'freq',
 'fromkeys',
 'get',
 'hapaxes',
 'items',
 'keys',
 'max',
 'most_common',
 'pformat',
 'plot',
 'pop',
 'popitem',
 'pprint',
 'r_Nr',
 'setdefault',
 'subtract',
 'tabulate',
 'unicode_repr',
 'update',
 'values']

In [12]:
import matplotlib

In [13]:
fdist1.plot(50,cumulative=True) #这里会打出图像

### 词干提取

In [14]:
from nltk.stem import PorterStemmer #词干提取

In [15]:
stemmer=PorterStemmer()

In [16]:
stemmer.stem('cooking')

'cook'

In [18]:
stemmer.stem('cookery')

'cookeri'

In [19]:
stemmer.stem('antonym')

'antonym'

In [21]:
stemmer.stem('idealistic')

'idealist'

In [22]:
import nltk

In [23]:
dir(nltk)

['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGraph',
 'DependencyProduction',
 'DictionaryConditionalProbDist',
 'Dict

In [24]:
stemmer=nltk.PorterStemmer()
verbs=['ran','run','running','walk','walker','ride']
stems=[]
for verb in verbs:
    stemmed_verb=stemmer.stem(verb)
    stems.append(stemmed_verb)

In [25]:
sorted(set(stems))

['ran', 'ride', 'run', 'walk', 'walker']

In [26]:
from nltk.stem import PorterStemmer #两种导入方法都可以提取单个、多个词的词干
stemmer=PorterStemmer()
verbs=['ran','run','running','walk','walker','ride']
stems=[]
for verb in verbs:
    stemmed_verb=stemmer.stem(verb)
    stems.append(stemmed_verb)

In [27]:
sorted(set(stems))

['ran', 'ride', 'run', 'walk', 'walker']

### 单词变位原型

In [28]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
lemmatizer.lemmatize('cooking')

'cooking'

In [29]:
lemmatizer.lemmatize('cooking',pos='v')

'cook'

In [30]:
lemmatizer.lemmatize('cooks')

'cook'

In [40]:
from replacers import RegexpReplacer  #为什么！！！
replacer = RegexpReplacer()
replacer.replace("You can't do this, can you?")

AttributeError: 'RegexpReplacer' object has no attribute 'replace'

### accessing corpora

In [44]:
from nltk.corpus import gutenberg
for filename in gutenberg.fileids():
    r=gutenberg.raw(filename)
    w=gutenberg.words(filename)
    s=gutenberg.sents(filename)
    v=set(w)
    print (filename,len(r)/len(w),len(w)/len(s),len(w)/len(v))

austen-emma.txt 4.609909212324673 24.822884416924666 24.63538599411087
austen-persuasion.txt 4.749793727271801 26.19989324793168 16.00962165688193
austen-sense.txt 4.753785952421314 28.32086417283457 20.719449729255086
bible-kjv.txt 4.286881563819072 33.57319868451649 73.40068269300603
blake-poems.txt 4.567033756284415 19.073059360730593 4.59010989010989
bryant-stories.txt 4.489300433741879 19.40726510653161 12.57081447963801
burgess-busterbrown.txt 4.464641670621737 17.99146110056926 10.75
carroll-alice.txt 4.233216065669891 20.029359953024077 11.309681697612731
chesterton-ball.txt 4.716173862839705 20.296296296296298 10.841175813121717
chesterton-brown.txt 4.724783007796614 22.61245401996847 10.37028557657549
chesterton-thursday.txt 4.63099417739442 18.496258685195084 10.167915381225209
edgeworth-parents.txt 4.4391184023772565 20.59266862170088 21.960075054727405
melville-moby_dick.txt 4.76571875515204 25.928919375683467 13.502044830977896
milton-paradise.txt 4.835734572682675 52.309

In [45]:
from nltk.book import *

In [46]:
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


### 外部文档操作

In [48]:
f=open('E:/大三下学期/高级编程（一）/第一周 20180306/corpara/1.txt')

In [49]:
f.read()

'The next financial crisis may be triggered by central banks.\n\nAs with London buses, don鈥檛 worry if you miss a financial crisis; another will be along shortly.\n\nThe latest study on long-term asset returns from Deutsche Bank shows that crises in developed markets have become much more common in recent decades.\n\nThat does not bode well.\n\nDeutsche defines a crisis as a period when a country suffers one of the following: a 15% annual decline in equities; a 10% fall in its currency or its government bonds; a default on its national debt; or a period of double-digit inflation.\n\nDuring the 19th century, only occasionally did more than half of countries for which there are data suffer such a shock in a single year.\n\nBut since the 1980s, in numerous years more than half of them have been in a financial crisis of some kind.\n\nThe main reason for this, argues Deutsche, is the monetary system.\n\nUnder the gold standard and its successor, the Bretton Woods system of fixed exchange rat

In [57]:
#建立自己的语料库
from nltk.corpus import PlaintextCorpusReader
corpus_root = "E:/大三下学期/高级编程（一）/第一周 20180306/corpara"
wordlists=PlaintextCorpusReader(corpus_root,'.*\.txt')
wordlists.fileids()

['1.txt', '2.txt', '3.txt', '4.txt', '5.txt']

In [60]:
#进行检索
n=nltk.word_tokenize(wordlists.raw(fileids="1.txt"))
complete1=nltk.Text(n)
complete1.concordance("data")

Displaying 1 of 1 matches:
alf of countries for which there are data suffer such a shock in a single year


In [62]:
#获取网络文本
from urllib.request import urlopen
url="https://www.cnblogs.com/ArrozZhu/p/8463882.html"
html=urlopen(url).read()
html[:60]

b'\r\n<!DOCTYPE html>\r\n<html lang="zh-cn">\r\n<head>\r\n<meta charse'

In [63]:
print (html)

b'\r\n<!DOCTYPE html>\r\n<html lang="zh-cn">\r\n<head>\r\n<meta charset="utf-8"/>\r\n<meta name="viewport" content="width=device-width, initial-scale=1" />\r\n<title>&lt;Python Text Processing with NLTK 2.0 Cookbook&gt;\xe4\xbb\xa3\xe7\xa0\x81\xe7\xac\x94\xe8\xae\xb0 - Arroz - \xe5\x8d\x9a\xe5\xae\xa2\xe5\x9b\xad</title>\r\n<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=-hy83QNg62d4qYibixJzxMJkbf1P9fTBlqv7SK5zVL01"/>\n<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/gray/bundle-gray.css?v=HL2WWD0LJEIKd_qaZrrRKuZpscDuS7PCV4RyAclJuYE1"/>\n<link id="mobile-style" media="only screen and (max-width: 767px)" type="text/css" rel="stylesheet" href="/skins/gray/bundle-gray-mobile.css?v=Owzi85UhDEb1CZicvxyFsKox3GEEaJ6PdwmXJEqO7dc1"/>\r\n<link title="RSS" type="application/rss+xml" rel="alternate" href="http://www.cnblogs.com/ArrozZhu/rss"/>\r\n<link title="RSD" type="application/rsd+xml" rel="EditURI" href="http://www.cnblogs.com/ArrozZhu/rsd.xml"/>\n<lin

In [67]:
#获取文本
import urllib.request
from bs4 import BeautifulSoup
url="https://www.cnblogs.com/ArrozZhu/p/8463882.html"

In [66]:

html=urllib.request.urlopen(url).read()
soup=BeautifulSoup(html)
print (soup.get_text())





<Python Text Processing with NLTK 2.0 Cookbook>代码笔记 - Arroz - 博客园







var currentBlogApp = 'ArrozZhu', cb_enable_mathjax=false;var isLogined=false;









Arroz













随笔- 109 
文章- 24 
评论- 0 





博客园  首页  新随笔  新文章  联系  管理  订阅 





<Python Text Processing with NLTK 2.0 Cookbook>代码笔记

如下是<Python Text Processing with NLTK 2.0 Cookbook>一书部分章节的代码笔记. 





Tokenizing text into sentences




>>> para = "Hello World. It's good to see you. Thanks for buying this book." 
>>> from nltk.tokenize import sent_tokenize 
>>> sent_tokenize(para) # "sent_tokenize"是一个函数，下文很多中间带下划线的标识符都指的是函数。
['Hello World.', "It's good to see you.", 'Thanks for buying this 
book.'] 




Tokenizing sentences into words




>>> from nltk.tokenize import word_tokenize 
>>> word_tokenize('Hello World.') 
['Hello', 'World', '.'] 
 
# 等同于
>>> from nltk.tokenize import TreebankWordTokenizer 
>>> tokenizer = TreebankWordTokenizer() 
>>> tokenizer.tokenize('Hello World.') 
['Hello', 'World', '.'] 
 
# 等同于
>>



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [68]:
raw=BeautifulSoup.get_text(soup)
from nltk.tokenize import word_tokenize
token=nltk.word_tokenize(raw)
token

['<',
 'Python',
 'Text',
 'Processing',
 'with',
 'NLTK',
 '2.0',
 'Cookbook',
 '>',
 '代码笔记',
 '-',
 'Arroz',
 '-',
 '博客园',
 'var',
 'currentBlogApp',
 '=',
 "'ArrozZhu",
 "'",
 ',',
 'cb_enable_mathjax=false',
 ';',
 'var',
 'isLogined=false',
 ';',
 'Arroz',
 '随笔-',
 '109',
 '文章-',
 '24',
 '评论-',
 '0',
 '博客园',
 '首页',
 '新随笔',
 '新文章',
 '联系',
 '管理',
 '订阅',
 '<',
 'Python',
 'Text',
 'Processing',
 'with',
 'NLTK',
 '2.0',
 'Cookbook',
 '>',
 '代码笔记',
 '如下是',
 '<',
 'Python',
 'Text',
 'Processing',
 'with',
 'NLTK',
 '2.0',
 'Cookbook',
 '>',
 '一书部分章节的代码笔记',
 '.',
 'Tokenizing',
 'text',
 'into',
 'sentences',
 '>',
 '>',
 '>',
 'para',
 '=',
 '``',
 'Hello',
 'World',
 '.',
 'It',
 "'s",
 'good',
 'to',
 'see',
 'you',
 '.',
 'Thanks',
 'for',
 'buying',
 'this',
 'book',
 '.',
 "''",
 '>',
 '>',
 '>',
 'from',
 'nltk.tokenize',
 'import',
 'sent_tokenize',
 '>',
 '>',
 '>',
 'sent_tokenize',
 '(',
 'para',
 ')',
 '#',
 '``',
 'sent_tokenize',
 "''",
 '是一个函数，下文很多中间带下划线的标识符都指的是函数。',
 '[

###  default tagging

In [73]:
import nltk

text=nltk.word_tokenize("We are going out.Just you and me.")
text


['We', 'are', 'going', 'out.Just', 'you', 'and', 'me', '.']

In [74]:
nltk.pos_tag(text)

[('We', 'PRP'),
 ('are', 'VBP'),
 ('going', 'VBG'),
 ('out.Just', 'IN'),
 ('you', 'PRP'),
 ('and', 'CC'),
 ('me', 'PRP'),
 ('.', '.')]

In [75]:
nltk.pos_tag(text)[0:4]

[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('out.Just', 'IN')]