In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date    : Sep-13-21 21:11
# @Author  : Kan HUANG

import os
import csv
import codecs
from utils.file_utils import print_lines, load_lines, load_conversations, extract_sentence_pairs

### 查看原始文本数据

In [2]:
# Locate the corpus directory under the local "data" folder.
corpus_name = "cornell movie-dialogs corpus"
corpus_path = os.path.join("data", corpus_name)

# Path of the original (unprocessed) movie lines file.
movie_lines_path = os.path.join(corpus_path, "movie_lines.txt")

# Print a sample of lines from the raw file.
print_lines(movie_lines_path)


b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


### 查看使用 file_utils 中的函数处理好文本后的结果

In [3]:
# Destination of the reformatted corpus: one row per sentence pair.
datafile_path = os.path.join(corpus_path, "formatted_movie_lines.txt")

# Field separator written between the two sentences of a pair.
delimiter = '\t'
# Unescape the delimiter so that an escaped spelling such as "\\t" would also
# turn into a real tab character; a literal tab passes through unchanged.
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Field names handed to the file_utils loaders for each raw corpus file.
MOVIE_LINES_FIELDS = ["lineID", "characterID",
                      "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID",
                              "movieID", "utteranceIDs"]

# Load the raw lines, then the conversations (the conversation loader is
# given the loaded lines as its second argument).
print("\nProcessing corpus...")
lines = load_lines(movie_lines_path, MOVIE_LINES_FIELDS)
print("\nLoading conversations...")
conversations = load_conversations(
    os.path.join(corpus_path, "movie_conversations.txt"),
    lines,
    MOVIE_CONVERSATIONS_FIELDS,
)

# Write the sentence pairs to the new delimiter-separated file.
print("\nWriting newly formatted file...")
# newline='' disables text-mode newline translation, as the csv module
# requires. Without it the requested '\n' terminator is rewritten to '\r\n'
# on platforms whose native line ending is CRLF.
with open(datafile_path, 'w', encoding='utf-8', newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter,
                        lineterminator='\n')
    writer.writerows(extract_sentence_pairs(conversations))

# Print a sample of lines
print("\nSample lines from file:")
print_lines(datafile_path, n=10)



Processing corpus...

Loading conversations...

Writing newly formatted file...

Sample lines from file:
b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister. 

### 解释

`file_utils.py` 模块中的函数比较复杂，我们直接使用。最终我们需要的是提取出来的对话文本对，它存放在`datafile_path`变量定义的路径文件中，即`formatted_movie_lines.txt`，我们可以在`data/cornell movie-dialogs corpus`中找到它。

`formatted_movie_lines.txt`文件的每一行是一个对话文本对，两句话由制表符`\t`做间隔（delimiter），后面的读取数据也会用到这个间隔符。