In [2]:
import nltk

from nltk.corpus import brown
from nltk.corpus import names
from nltk.corpus import inaugural
from nltk.parse import CoreNLPParser
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

### nltk.corpus提供了充足的文本作为语料库，将文本处理的基本单位划分为词语、句子和段落, 并对其中某些文本做了分类。

In [28]:
# 返回语料库中所有标注词性的单词
brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [31]:
# 返回语料库中某一篇文本的所有句子
inaugural.sents(inaugural.fileids()[0])

[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ...]

In [33]:
# 返回语料库中某一篇文本的所有段落
brown.paras(brown.fileids()[1])

[[['Austin', ',', 'Texas'], ['--', 'Committee', 'approval', 'of', 'Gov.', 'Price', "Daniel's", '``', 'abandoned', 'property', "''", 'act', 'seemed', 'certain', 'Thursday', 'despite', 'the', 'adamant', 'protests', 'of', 'Texas', 'bankers', '.']], [['Daniel', 'personally', 'led', 'the', 'fight', 'for', 'the', 'measure', ',', 'which', 'he', 'had', 'watered', 'down', 'considerably', 'since', 'its', 'rejection', 'by', 'two', 'previous', 'Legislatures', ',', 'in', 'a', 'public', 'hearing', 'before', 'the', 'House', 'Committee', 'on', 'Revenue', 'and', 'Taxation', '.']], ...]

In [25]:
# 语料库中的分类
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [35]:
# 指定分类中的句子
brown.sents(categories=['science_fiction'])

[['Now', 'that', 'he', 'knew', 'himself', 'to', 'be', 'self', 'he', 'was', 'free', 'to', 'grok', 'ever', 'closer', 'to', 'his', 'brothers', ',', 'merge', 'without', 'let', '.'], ["Self's", 'integrity', 'was', 'and', 'is', 'and', 'ever', 'had', 'been', '.'], ...]

### nltk.tokenize 用于分词和分句

In [37]:
# 分句
para = '''He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish. In the first forty days a boy had been with him.
But after forty days without a fish the boy's parents had told him that the old man was 
now definitely and finally salao, which is the worst form of unlucky, and the boy had gone 
at their orders in another boat which caught three good fish the first week. '''

nltk.tokenize.sent_tokenize(para)

['He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.',
 'In the first forty days a boy had been with him.',
 "But after forty days without a fish the boy's parents had told him that the old man was \nnow definitely and finally salao, which is the worst form of unlucky, and the boy had gone \nat their orders in another boat which caught three good fish the first week."]

In [38]:
# 含语言参数的分词
sent = '''Notre Dame de Paris est ce que on doit chérir.'''
nltk.tokenize.word_tokenize(sent,language='french')

['Notre',
 'Dame',
 'de',
 'Paris',
 'est',
 'ce',
 'que',
 'on',
 'doit',
 'chérir',
 '.']

### nltk.tag用于词性标注、提供了Stanford的标注及命名实体识别工具。此外，子库中还提供了条件随机场、隐马尔可夫模型、感知机等机器学习方法，用于训练自建的词性标注模型。

In [58]:
# nltk.tag提供了对句子中的单词进行词性标注的api
nltk.tag.pos_tag(brown.words(categories=['adventure']),lang='eng')

[('Dan', 'NNP'),
 ('Morgan', 'NNP'),
 ('told', 'VBD'),
 ('himself', 'PRP'),
 ('he', 'PRP'),
 ('would', 'MD'),
 ('forget', 'VB'),
 ('Ann', 'NNP'),
 ('Turner', 'NNP'),
 ('.', '.'),
 ('He', 'PRP'),
 ('was', 'VBD'),
 ('well', 'RB'),
 ('rid', 'JJ'),
 ('of', 'IN'),
 ('her', 'PRP$'),
 ('.', '.'),
 ('He', 'PRP'),
 ('certainly', 'RB'),
 ("didn't", 'VBZ'),
 ('want', 'VB'),
 ('a', 'DT'),
 ('wife', 'NN'),
 ('who', 'WP'),
 ('was', 'VBD'),
 ('fickle', 'VBN'),
 ('as', 'IN'),
 ('Ann', 'NNP'),
 ('.', '.'),
 ('If', 'IN'),
 ('he', 'PRP'),
 ('had', 'VBD'),
 ('married', 'VBN'),
 ('her', 'PRP'),
 (',', ','),
 ("he'd", 'NN'),
 ('have', 'VBP'),
 ('been', 'VBN'),
 ('asking', 'VBG'),
 ('for', 'IN'),
 ('trouble', 'NN'),
 ('.', '.'),
 ('But', 'CC'),
 ('all', 'DT'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('was', 'VBD'),
 ('rationalization', 'NN'),
 ('.', '.'),
 ('Sometimes', 'RB'),
 ('he', 'PRP'),
 ('woke', 'VBD'),
 ('up', 'RP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('middle', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('night',

### nltk.classify用于文本分类任务。主要用于：根据文章的主题分类，多义词的语义分类，按作者对句子进行归类。classify库可以直接处理语料库中含有标记的文本，也可以自己根据需要构建标记类别。在子模块的api中，提供了使用SVM，朴素贝叶斯等方法训练模型的方法。


In [84]:
# 特征提取器：名字的最后一个字母
def gender_features(word):
    return {'last_letter':word[-1]}

# 导入数据集
names_Dataset=([(name,'male') for name in names.words('male.txt')]+[(name,'female') for name in names.words('female.txt')])

# 处理数据集
featuresets=[(gender_features(n),g) for (n,g) in names_Dataset]

# 划分数据集
train_set,test_set=featuresets[800:],featuresets[:400]

# 朴素贝叶斯分类器
classfier=nltk.NaiveBayesClassifier.train(train_set)

In [85]:
# 分类器准确度
nltk.classify.accuracy(classfier,test_set)

0.4525

In [86]:
# 分类器提取到的最重要的特征
classfier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     35.5 : 1.0
             last_letter = 'k'              male : female =     30.9 : 1.0
             last_letter = 'f'              male : female =     18.1 : 1.0
             last_letter = 'p'              male : female =     15.3 : 1.0
             last_letter = 'v'              male : female =     13.5 : 1.0


参考资料：https://www.cnblogs.com/zhanghongfeng/p/8875478.html

### nltk.parse用于表示文本内部的树状结构，称为文本解析。通常情况下，文本解析是指将一个句子内部的句法结构表示为树的形式；此外，文本解析器有时候可以用于词汇，推导构成一个词的语素形态结构，或者用于篇章/段落，推导语篇的话语结构。nltk.parse.stanford提供了依存句法分析的模型。

In [7]:
# 创建parser
# parser = CoreNLPParser(url='http://localhost:9000')
# next(parser.raw_parse('The quick brown fox jumps over the lazy dog.')).pretty_print()  

### nltk.stem是用于去除单词中的词缀，只留下词干的接口。去除掉语法，时态，词性变换创造出的单词派生形态，起到词干提取的作用。然而有些时候基于规则的词干提取并不准确，例如ceil不是ceiling的词干，stem库还提供了lemmatizer方法来实现词形还原，确保生成的词语是真实存在的词。

In [4]:
ps = PorterStemmer()
ps.stem('ceiling')

'ceil'

### nltk.sentiment是nltk的情感分析库，生成的polarity_scores()方法中，compound表示复杂程度,neu表示中性，neg表示负面情绪，pos表示正面情绪。除了使用vader自带的情感分类器，还可以利用sentiment自带的库来添加标签，改进vader的性能。

In [5]:
wnl = WordNetLemmatizer()
wnl.lemmatize('running','v')

'run'

In [6]:
wnl.lemmatize('ceiling')

'ceiling'