# 文本数据处理

**清洗 → tokenize → vocab → corpus**

## 1. 读取并清洗文本

In [1]:
import collections
import re

# Path of the input text file; passed to read_txt_file() further down in this cell.
file_path = 'novel.txt'

def read_txt_file(file_path):
    """Read a text file and return its lines reduced to lowercase ASCII words.

    For every line, each run of characters that are not ASCII letters
    (digits, punctuation, whitespace, the trailing newline, non-ASCII
    characters) is replaced by a single space; the result is stripped and
    lowercased. Lines that contain no letters are kept as empty strings, so
    the returned list has exactly one entry per line of the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of cleaned lines (``list[str]``).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly instead of relying on the platform default encoding
    # (e.g. cp1252 on Windows), which can raise or mis-decode on UTF-8 text.
    # errors='replace' is lossless here: any byte that fails to decode becomes
    # U+FFFD, which the regex below throws away like every other non-letter.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        return [
            re.sub(r'[^A-Za-z]+', ' ', line).strip().lower()
            for line in f
        ]

# Load and clean the text, then show how many lines were read and what the
# first cleaned line looks like.
lines = read_txt_file(file_path)
print(len(lines))
print(lines[0])

6779
the project gutenberg ebook of the great gatsby


## 2. Tokenize

In [2]:
def tokenize(lines, token='word'):
    """Split every line into a list of tokens.

    Args:
        lines: Iterable of text lines.
        token: Granularity of the split. ``'word'`` splits each line on
            whitespace; ``'char'`` turns each line into a list of its
            characters.

    Returns:
        A list with one token list per input line.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``; the
            offending value is the exception argument.
    """
    if token == 'char':
        return list(map(list, lines))
    if token == 'word':
        per_line = []
        for text in lines:
            per_line.append(text.split())
        return per_line
    raise ValueError(token)

# Word-level tokenization of the cleaned lines; print the first line's tokens.
tokens = tokenize(lines, 'word')
print(tokens[0])

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'great', 'gatsby']


## 3. 构建词表 Vocab

In [6]:
import collections

def count_corpus(corpus):
    """Count how many times each token occurs in a corpus.

    Each element of ``corpus`` is handled according to its type:

    * a ``str`` is treated as a line of text and split on whitespace, so
      both a list of raw lines (``['a b', 'c d']``) and a flat list of
      whitespace-free tokens (``['a', 'b', 'c']``) are accepted;
    * any other iterable is treated as an already tokenized line
      (``[['a', 'b'], ['c', 'd']]``) and its items are counted as-is.

    Args:
        corpus: Iterable of text lines and/or token sequences.

    Returns:
        ``collections.Counter`` mapping token -> frequency, e.g.
        ``Counter({'a': 3, 'b': 2, ...})``. Tokens are inserted in order of
        first appearance.
    """
    counter = collections.Counter()
    for line in corpus:
        # Only strings need splitting; calling .split() on a token list
        # (the output of a tokenizer) would raise AttributeError.
        counter.update(line.split() if isinstance(line, str) else line)
    return counter

class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Indices 0, 1 and 2 are reserved for the special tokens ``<unk>``,
    ``<bos>`` and ``<eos>``. All other tokens follow in the order produced
    by ``Counter.most_common()``, i.e. from most to least frequent.
    """

    def __init__(self, tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Corpus handed unchanged to ``count_corpus`` for frequency
                counting. ``None`` is treated as an empty corpus.
        """
        counter = count_corpus([] if tokens is None else tokens)

        # Special symbols always occupy the first slots.
        self.idx_to_token = ['<unk>', '<bos>', '<eos>']
        self.token_to_idx = {
            special: position
            for position, special in enumerate(self.idx_to_token)
        }

        # Append ordinary tokens, most frequent first; a corpus token that
        # collides with a special symbol keeps the reserved index.
        for token, _ in counter.most_common():
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of entries, special tokens included."""
        size = len(self.idx_to_token)
        return size

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens missing from the vocabulary map to the index of ``<unk>``.
        A list or tuple yields a list of the same shape.
        """
        if isinstance(tokens, (list, tuple)):
            return [self[item] for item in tokens]
        return self.token_to_idx.get(tokens, self.token_to_idx['<unk>'])

    def print_vocab(self, n=10):
        """Print a header followed by the first ``n`` ``index -> token`` pairs."""
        print("===== Vocabulary Preview =====")
        print("index -> token")
        limit = min(n, len(self.idx_to_token))
        for i in range(limit):
            print(f"{i:>3} -> {self.idx_to_token[i]}")


## 4. 构建字符级 corpus

In [8]:
# Build the vocabulary from the cleaned text lines (not from `tokens`);
# count_corpus() splits each line on whitespace itself.
vocab = Vocab(lines)

# Preview the first 100 index -> token entries.
vocab.print_vocab(n=100)

# Look up a known word, an unknown word (falls back to the <unk> index),
# and a list of tokens including the special markers.
print("now ->", vocab['now'])
print("unknown ->", vocab['xyz'])
print("sentence ->", vocab[['<bos>', 'interest', 'man', '<eos>']])

===== Vocabulary Preview =====
index -> token
  0 -> <unk>
  1 -> <bos>
  2 -> <eos>
  3 -> the
  4 -> and
  5 -> a
  6 -> i
  7 -> of
  8 -> to
  9 -> in
 10 -> he
 11 -> was
 12 -> that
 13 -> it
 14 -> you
 15 -> his
 16 -> s
 17 -> with
 18 -> at
 19 -> t
 20 -> she
 21 -> her
 22 -> had
 23 -> on
 24 -> for
 25 -> me
 26 -> as
 27 -> him
 28 -> gatsby
 29 -> but
 30 -> from
 31 -> my
 32 -> we
 33 -> all
 34 -> said
 35 -> there
 36 -> out
 37 -> this
 38 -> up
 39 -> an
 40 -> tom
 41 -> daisy
 42 -> or
 43 -> were
 44 -> they
 45 -> if
 46 -> into
 47 -> about
 48 -> one
 49 -> by
 50 -> when
 51 -> what
 52 -> have
 53 -> then
 54 -> over
 55 -> be
 56 -> so
 57 -> is
 58 -> like
 59 -> down
 60 -> who
 61 -> man
 62 -> no
 63 -> back
 64 -> came
 65 -> been
 66 -> any
 67 -> d
 68 -> some
 69 -> do
 70 -> just
 71 -> little
 72 -> not
 73 -> now
 74 -> know
 75 -> gutenberg
 76 -> don
 77 -> house
 78 -> before
 79 -> went
 80 -> project
 81 -> after
 82 -> eyes
 83 -> old
 84