In [None]:
'''
    自然语言的应用:
    文本分类(Text Classification)、信息检索(Information Retrieval)、文字校对(Text Proofing)、自然语言生成(Natural Language Generation)、问答系统(Question Answering)、机器翻译(Machine Translation)、自动摘要(Automatic Summarization)、
    情绪分析(Sentiment Analysis)、语音识别(Speech Recognition)
'''
'''
    词袋(Bag of Words,BOW):
        1. 分词(Tokenization): 将文章中每个词汇切开,整理成生字表或者字典(Vocabulary)。中文比较复杂.
        2. 前置处理(Preprocessing): 将词汇做词形还原,转换成小写。词形还原是动词转为原形,复数转单数,避免词态不同,词汇统计出现分歧。
        3. 去除停用词(Stop Word): be动词、助动词、代名词、介词、系动词、冠词等。否则统计结果都是这些词出现频率最高
        4. 词汇出现次数统计: 计算每个词汇在文章出现的次数,并由高到低排列。
'''

In [6]:
'''
    Automatic summarization with a bag-of-words (BOW) word-frequency count.

    Reads ./asset/attetion.txt line by line, lowercases and tokenizes each
    line, drops stop words, and prints the 20 most frequent remaining words.
'''
import collections

# 1. Stop words to exclude from the frequency count.
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d','most','the','of','in','to']
# Set copy of the list so the per-token membership test is O(1).
_stop_word_set = frozenset(stop_words)

# NOTE(review): maxlen starts at 100000, so it only changes when a line has
# more than 100000 tokens. It presumably was meant to track the longest line
# (which would start at 0); left unchanged in case later cells read it.
maxlen = 100000

word_freqs = collections.Counter()
# Read-only mode is sufficient; 'r+' would also require write permission.
with open("./asset/attetion.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # Lowercase, then split on any run of whitespace. Unlike split(' '),
        # this does not leave the trailing newline attached to the last word
        # and does not produce empty-string tokens for consecutive spaces.
        words = line.lower().split()
        # Track the token count of the longest line seen (see note above).
        if len(words) > maxlen:
            maxlen = len(words)
        # Count every token that is not a stop word.
        word_freqs.update(w for w in words if w not in _stop_word_set)

print(word_freqs.most_common(20))

[('input', 9), ('sequence', 8), ('recurrent', 6), ('attention', 6), ('neural', 5), ('modeling', 5), ('models', 5), ('model', 5), ('on', 5), ('representations', 5), ('transduction', 4), ('computation', 4), ('positions', 4), ('this', 4), ('output', 4), ('memory', 3), ('state', 3), ('such', 3), ('language', 3), ('sequential', 3)]


In [5]:
'''
    TF-IDF: words that are not stop words can still occur very often
    (e.g. "only"); TF-IDF lowers the weight of such words.
'''
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np

# 1. Input sentences. (Original note: the last sentence is meant as the
#    question and the others as answers; this cell does not use that split.)
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# 2. Build the term-count matrix: one row per sentence, one column per word.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# 3. Vocabulary learned by the vectorizer.
word = vectorizer.get_feature_names_out()
print("Vocabulary: ", word)

# 4. Show the bag-of-words counts for the four sentences.
print("BOW: ", X.toarray())

# 5. Convert the counts to TF-IDF weights (fit on X, then transform X).
transformer = TfidfTransformer()
tfidf = transformer.fit(X).transform(X)
print("TF-IDF: ", tfidf.toarray())

Vocabulary:  ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
BOW:  [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
TF-IDF:  [[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]
