In [1]:
# NOTE: re-enable the next import before running the GPT cells below; they reference `openai`.
# import openai
# from pdfminer.high_level import extract_text
import collections
import os
import re

import pdfplumber

# TARGET

### 1. 抽取命名实体

- 用 GPT3.5 抽取
- 用 spaCy 抽取 (**prefer**)
- 自定义规则抽取

### 2. 抽取关系

1. 参考知识图谱书本的结构，关系分为`目录`, `前置`, `句子共现`, `段落共现`, `频繁项集`  
   以上关系**自定义规则**抽取即可
2. 问题在于，以上关系仅考虑位置，未考虑**语义信息**  
   需要对关系列表进行扩张，考虑实体与实体间的语义信息，如{`'LSTM'`, `'isA'`, `'RNN'`}  
   需要先观察抽取出的命名实体，再进一步计划需要扩张哪些关系

### 3. 实现知识图谱

- 将数据储存为 json 格式，用 java 制作知识图谱 (pending to explore)

# Work In Progress

1. 制作**目录架构**，包括**名称与序号**的对应关系，以及**序号与页码**的对应关系，可以用来抽取`目录`和`前置`关系
2. 制作**内容架构**，包括**段落表**和**句子表**，段落表目前基于页数划分，一页视为一个段落，句子表目前基于'`.`'划分，一个句号是一个句子
3. 用 GPT3.5 尝试抽取命名实体

# TO-DO List

1. 探索 spaCy package，查看抽取出的实体的质量
2. 自定义规则，实现`目录`, `前置`, `句子共现`, `段落共现`, `频繁项集`这 5 种关系
3. 扩张关系列表，考虑语义信息
4. 探索 java 生成多层级知识图谱的方式

# Question

1. 段落与句子如何才能划分得更加细致？目前段落的问题是太长（一页视为一个段落）；句子的问题是句号'`.`'并不仅出现在句尾，还会出现在人名、公式等位置
2. 新增的、考虑语义信息的关系应该如何抽取
3. 如何设计 json 的架构，以更加方便地用 java 制作知识图谱
4. etc...


# 设置 api


In [2]:
# Endpoint used for all OpenAI calls in this notebook (a third-party proxy).
openai.api_base = "https://api.chatanywhere.com.cn/v1"
# openai.api_base = 'https://api.chatanywhere.cn/v1'
# Credentials must not be stored in the notebook: read the key from the
# environment instead. This raises KeyError right away if OPENAI_API_KEY is
# unset, which is easier to diagnose than an authentication failure later on.
# The key that used to be hard-coded here should be treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# 读取数据


In [3]:
# 不选择用这种方式读取，用pdfplumber更好
# text = extract_text('./Deep Learning (Ian Goodfellow, Yoshua Bengio, Aaron Courville).pdf')

## 生成目录架构

- 生成**章节名称**与**章节序号**的对应：`name_dict`
- 以及**章节序号**与**章节页码**范围的对应：`index_dict`


In [2]:
with pdfplumber.open("../data/Deep Learning.pdf") as f:
    # Build the table-of-contents structure from the first 7 pages of the PDF.
    #   c: leading tokens that look like section numbers ("2.1", "10.12", ...)
    #   p: last token of those same lines (presumably the page number - verify)
    #   n: (leading number token, word) pairs, one per token that starts with
    #      an ASCII letter; joined back into titles when name_dict is built
    c, p, n = [], [], []
    for page_idx in range(7):
        page = f.pages[page_idx]
        # Fall back to "" so a page without extractable text is simply skipped.
        text = page.extract_text() or ""
        for line in text.split("\n"):
            tokens = line.split(" ")
            head = tokens[0]
            # Raw strings: "\." in a normal string literal is an invalid escape.
            if re.match(r"[0-9]+\.[0-9]+", head):
                c.append(head)
                p.append(tokens[-1])
            # Matches chapter lines ("2 ...") as well as section lines ("2.1 ...").
            if re.match(r"[0-9]+", head):
                n.extend((head, tok) for tok in tokens if re.match(r"[A-Za-z]+", tok))

### name_dict


In [3]:
# Re-assemble each heading's title: the words collected in `n` for the same
# number token are joined with single spaces, in the order they were found.
name_dict = {}
for number, word in n:
    name_dict[number] = f"{name_dict[number]} {word}" if number in name_dict else word

In [6]:
# Display the three lists built while parsing the table-of-contents pages.
c, p, n

(['1.1',
  '1.2',
  '2.1',
  '2.2',
  '2.3',
  '2.4',
  '2.5',
  '2.6',
  '2.7',
  '2.8',
  '2.9',
  '2.10',
  '2.11',
  '2.12',
  '3.1',
  '3.2',
  '3.3',
  '3.4',
  '3.5',
  '3.6',
  '3.7',
  '3.8',
  '3.9',
  '3.10',
  '3.11',
  '3.12',
  '3.13',
  '3.14',
  '4.1',
  '4.2',
  '4.3',
  '4.4',
  '4.5',
  '5.1',
  '5.2',
  '5.3',
  '5.4',
  '5.5',
  '5.6',
  '5.7',
  '5.8',
  '5.9',
  '5.10',
  '5.11',
  '6.1',
  '6.2',
  '6.3',
  '6.4',
  '6.5',
  '6.6',
  '7.1',
  '7.2',
  '7.3',
  '7.4',
  '7.5',
  '7.6',
  '7.7',
  '7.8',
  '7.9',
  '7.10',
  '7.11',
  '7.12',
  '7.13',
  '7.14',
  '8.1',
  '8.2',
  '8.3',
  '8.4',
  '8.5',
  '8.6',
  '8.7',
  '9.1',
  '9.2',
  '9.3',
  '9.4',
  '9.5',
  '9.6',
  '9.7',
  '9.8',
  '9.9',
  '9.10',
  '9.11',
  '10.1',
  '10.2',
  '10.3',
  '10.4',
  '10.5',
  '10.6',
  '10.7',
  '10.8',
  '10.9',
  '10.10',
  '10.11',
  '10.12',
  '11.1',
  '11.2',
  '11.3',
  '11.4',
  '11.5',
  '11.6',
  '12.1',
  '12.2',
  '12.3',
  '12.4',
  '12.5',
  '13.1',
  

### index_dict


In [88]:
# A section is taken to span from its own TOC page token to the next section's.
p_range = list(zip(p, p[1:]))
# The last section has no successor, so its range is supplied by hand.
# It is stored as strings so every range has the same type as the values in `p`.
# NOTE(review): 720-800 is a hard-coded guess for the final section; confirm it.
p_range.append(("720", "800"))
c_p_range = list(zip(c, p_range))
# Group the (section number, page range) pairs by chapter, i.e. by the part of
# the section number before the first dot ("2.10" -> "2").
index_dict = collections.defaultdict(list)
for section, pages in c_p_range:
    index_dict[section.split(".")[0]].append((section, pages))

In [139]:
# Display the (section number, page range) entries grouped under chapter "2".
index_dict["2"]

[('2.1', ('31', '34')),
 ('2.2', ('34', '36')),
 ('2.3', ('36', '37')),
 ('2.4', ('37', '39')),
 ('2.5', ('39', '40')),
 ('2.6', ('40', '42')),
 ('2.7', ('42', '44')),
 ('2.8', ('44', '45')),
 ('2.9', ('45', '46')),
 ('2.10', ('46', '47')),
 ('2.11', ('47', '48')),
 ('2.12', ('48', '54'))]

## 生成内容表

- 段落内容表：`content_dict`
- 句子内容表：`sentence_dict`


In [118]:
# NOTE(review): this path differs from the one used for the TOC pages
# ("../data/Deep Learning.pdf"); confirm both point at the same file.
with pdfplumber.open(
    "./Deep Learning (Ian Goodfellow, Yoshua Bengio, Aaron Courville).pdf"
) as f:
    # content_dict:  section number -> list of page texts (one entry per page)
    # sentence_dict: section number -> list of fragments from splitting on "."
    content_dict = collections.defaultdict(list)
    sentence_dict = collections.defaultdict(list)

    for sections in index_dict.values():
        for section_id, (first_page, last_page) in sections:
            # NOTE(review): the TOC page tokens are used directly as 0-based
            # indices into f.pages; confirm no front-matter offset is needed.
            for page_no in range(int(first_page), int(last_page)):
                page_text = f.pages[page_no].extract_text().replace("\n", " ")
                content_dict[section_id].append(page_text)
                sentence_dict[section_id].extend(page_text.split("."))
        # NOTE(review): only the first chapter group is processed; remove this
        # `break` to cover the whole book.
        break

In [127]:
# Display the third page text stored for section "1.1".
content_dict["1.1"][2]

'CONTENTS Chapter 15, Representation Learning: Kunal Ghosh. • Chapter 16, Structured Probabilistic Models for Deep Learning: Minh Lê • and Anton Varfolom. Chapter 18, Confronting the Partition Function: Sam Bowman. • Chapter 19, Approximate Inference: Yujia Bao. • Chapter 20, Deep Generative Models: Nicolas Chapados, Daniel Galvez, • Wenming Ma, Fady Medhat, Shakir Mohamed and Grégoire Montavon. Bibliography: Lukas Michelbacher and Leslie N. Smith. • We also want to thank those who allowed us to reproduce images, figures or data from their publications. We indicate their contributions in the figure captions throughout the text. We would like to thank Lu Wang for writing pdf2htmlEX, which we used to make the web version of the book, and for offering support to improve the quality of the resulting HTML. We would like to thank Ian’s wife Daniela Flori Goodfellow for patiently supporting Ian during the writing of the book as well as for help with proofreading. We would like to thank the Go

In [141]:
# Display the first "."-delimited fragment stored for section "1.1".
sentence_dict["1.1"][0]

'Acknowledgments This book would not have been possible without the contributions of many people'

## 一些其他尝试

N - Gram


In [142]:
# 暂时没有探索结果
# all_text = re.sub('[^A-Za-z0-9\.]+', ' ', text).lower().split(' ')
# ng1 = collections.defaultdict(int)
# ng2 = collections.defaultdict(int)
# ng3 = collections.defaultdict(int)
# ng4 = collections.defaultdict(int)
# for i, j in enumerate(all_text):
#     ng1[j] += 1
#     if i > 0: ng2[(all_text[i-1], j)] += 1
#     if i > 1: ng3[(all_text[i-2], all_text[i-1], j)] += 1
#     if i > 2: ng4[(all_text[i-3], all_text[i-2], all_text[i-1], j)] += 1

In [9]:
def sort_feq(dic):
    """Return the mapping's (key, value) pairs ordered by value, largest first.

    Pairs with equal values keep the order in which the mapping yields them.
    """
    pairs = list(dic.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

# 定义 Chat 类


In [6]:
def total_counts(response):
    """Print the token usage and estimated cost of one response; return the cost.

    Reads ``response["usage"]["total_tokens"]``, multiplies it by a unit price
    of 0.002 per 1000 tokens and by a factor of 7.5 (presumably a USD price and
    a USD-to-CNY rate - confirm), and rounds the result to 5 decimal places.

    Returns:
        The rounded cost as a float.
    """
    used_tokens = int(response["usage"]["total_tokens"])
    unit_price = 0.002 / 1000
    cost_text = f"{unit_price * used_tokens * 7.5:.5f}"
    print(f"tokens: {used_tokens}, cost: {cost_text}")

    return float(cost_text)

In [151]:
class Chat:
    """Small stateful wrapper around ``openai.ChatCompletion``.

    Attributes:
        conversation_list: message dicts (``{"role": ..., "content": ...}``)
            sent with every request; each call to ``ask`` appends the user
            prompt and the assistant's answer.
        costs_list: one cost value (as returned by ``total_counts``) per call
            to ``ask``.
    """

    def __init__(self, conversation_list=None):
        """Start a conversation, optionally seeded with existing messages.

        Args:
            conversation_list: initial messages. A list passed in is stored by
                reference, not copied, so the caller sees the messages that
                ``ask`` appends. When omitted, a new empty list is created.
        """
        # A `[]` default would be created once and shared by every Chat()
        # built without arguments, leaking history between conversations.
        self.conversation_list = [] if conversation_list is None else conversation_list
        self.costs_list = []

    def show_conversation(self, msg_list):
        """Print the non-user messages among the last two entries of ``msg_list``."""
        for msg in msg_list[-2:]:
            if msg["role"] != "user":
                message = msg["content"]
                print(f"\U0001f47D: {message}\n")
            # A blank line is printed for every inspected message, user or not.
            print()

    def ask(self, prompt):
        """Send ``prompt`` as the next user message and return the model's answer.

        The answer is appended to the conversation and printed, and the cost of
        the request is appended to ``costs_list``.

        The API key is no longer set here: credentials are expected to be
        configured once on the ``openai`` module, not embedded in the class.
        """
        self.conversation_list.append({"role": "user", "content": prompt})
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=self.conversation_list
        )
        answer = response.choices[0].message["content"]

        self.conversation_list.append({"role": "assistant", "content": answer})
        self.show_conversation(self.conversation_list)

        cost = total_counts(response)
        self.costs_list.append(cost)
        return answer

# 测试


In [10]:
# The five position-based relation types planned so far: table of contents,
# prerequisite, sentence co-occurrence, paragraph co-occurrence, frequent itemset.
relations = ["目录", "前置", "句子共现", "段落共现", "频繁项集"]  # pending

In [11]:
# Seed conversation: a single system message asking the model to extract only
# the named entities from the text, formatted as [{entity_A}, {entity_B}, ...],
# and to return nothing else.
conversation_list = [
    {
        "role": "system",
        "content": "请只提取文本中的命名实体，格式为[{entity_A}, {entity_B}, ...], 不要返回任何其他内容",
    }
]

In [152]:
# Chat keeps a reference to `conversation_list`, so each ask() also appends the
# prompt and the answer to that module-level list.
bot = Chat(conversation_list)
# Send the second stored page of section "1.1" as the text to extract from.
answer = bot.ask(content_dict["1.1"][1])


👽: [{"Charlie Gorichanaz"}, {"Brendan Loudermilk"}, {"Eric Morris"}, {"Cosmin Pârvulescu"}, {"Alfredo Solano"}, {"Chapter 2"}, {"Amjad Almahairi"}, {"Nikola Banić"}, {"Kevin Bennett"}, {"Philippe Castonguay"}, {"Oscar Chang"}, {"Eric Fosler-Lussier"}, {"Andrey Khalyavin"}, {"Sergey Oreshkov"}, {"István Petrás"}, {"Dennis Prangle"}, {"Thomas Rohée"}, {"Gitanjali Gulve Sehgal"}, {"Colby Toland"}, {"Alessandro Vitale"}, {"Bob Welland"}, {"Chapter 3"}, {"John Philip Anderson"}, {"Kai Arulkumaran"}, {"Vincent Dumoulin"}, {"Rui Fa"}, {"Stephan Gouws"}, {"Artem Oboturov"}, {"Antti Rasmus"}, {"Alexey Surkov"}, {"Volker Tresp"}, {"Chapter 4"}, {"Tran Lam AnIan Fischer"}, {"Hu Yuhuang"}, {"Chapter 5"}, {"Dzmitry Bahdanau"}, {"Justin Domingue"}, {"Nikhil Garg"}, {"Makoto Otsuka"}, {"Bob Pepin"}, {"Philip Popien"}, {"Emmanuel Rayner"}, {"Peter Shepard"}, {"Kee-Bong Song"}, {"Zheng Sun"}, {"Andy Wu"}, {"Chapter 6"}, {"Uriel Berdugo"}, {"Fabrizio Bottarel"}, {"Elizabeth Burl"}, {"Ishan Durugkar"}, 

### 测试结果

可以看到效果还不错，但是 GPT3.5 有一些比较致命的问题，它抽出的命名实体甚至有可能都**不在原文中**


In [158]:
# Strip the quote, brace and bracket decoration from the model's reply, then
# split on commas to get the bare entity strings.
# A raw-string character class replaces the old alternation, which contained an
# accidental empty alternative ("||") ahead of the bracket alternatives and used
# invalid escapes ("\[") in a non-raw string.
res_tiny = [i.strip() for i in re.sub(r'["{}\[\]]', "", answer).split(",")]
res_tiny

['Charlie Gorichanaz',
 'Brendan Loudermilk',
 'Eric Morris',
 'Cosmin Pârvulescu',
 'Alfredo Solano',
 'Chapter 2',
 'Amjad Almahairi',
 'Nikola Banić',
 'Kevin Bennett',
 'Philippe Castonguay',
 'Oscar Chang',
 'Eric Fosler-Lussier',
 'Andrey Khalyavin',
 'Sergey Oreshkov',
 'István Petrás',
 'Dennis Prangle',
 'Thomas Rohée',
 'Gitanjali Gulve Sehgal',
 'Colby Toland',
 'Alessandro Vitale',
 'Bob Welland',
 'Chapter 3',
 'John Philip Anderson',
 'Kai Arulkumaran',
 'Vincent Dumoulin',
 'Rui Fa',
 'Stephan Gouws',
 'Artem Oboturov',
 'Antti Rasmus',
 'Alexey Surkov',
 'Volker Tresp',
 'Chapter 4',
 'Tran Lam AnIan Fischer',
 'Hu Yuhuang',
 'Chapter 5',
 'Dzmitry Bahdanau',
 'Justin Domingue',
 'Nikhil Garg',
 'Makoto Otsuka',
 'Bob Pepin',
 'Philip Popien',
 'Emmanuel Rayner',
 'Peter Shepard',
 'Kee-Bong Song',
 'Zheng Sun',
 'Andy Wu',
 'Chapter 6',
 'Uriel Berdugo',
 'Fabrizio Bottarel',
 'Elizabeth Burl',
 'Ishan Durugkar',
 'Jeff Hlywa',
 'Jong Wook Kim',
 'David Krueger',
 'Adi

In [157]:
# 一些基于N-gram的探索，会输出命名实体在文章中出现的次数，没啥用
# temp = []
# for i in res_tiny:
#     ng = len(i.split(' '))
#     if ng == 1:
#         temp.append((i, ng1[i]))
#     elif ng == 2:
#         temp.append((i, ng2[tuple(i.split(' '))]))
#     elif ng == 3:
#         temp.append((i, ng3[tuple(i.split(' '))]))
#     elif ng == 4:
#         temp.append((i, ng4[tuple(i.split(' '))]))
#     else:
#         temp.append((i, 0))
# entity_frq_dic = dict(temp)
# entity_frq = sort_feq(entity_frq_dic)

In [144]:
# 尝试批量抽取，一次抽取1000个词，失败，请求数过多
# i = 0
# span = 1000
# res = []
# while i * span < len(all_text):
#     if (i + 1) * span > len(all_text):
#         prompt = all_text[i * span:]
#     else:
#         prompt = all_text[i * span : (i + 1) * span]
#     conversation_list = [
#         {'role': 'system', 'content':'请只提取文本中的命名实体，格式为[{entity_A}, {entity_B}, ...], 不要返回任何其他内容'},
#         {"role": "user", "content": ' '.join(prompt)}]
#     response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=conversation_list)
#     tmp = response.choices[0].message['content']
#     res += [i.strip() for i in re.sub('\"|}|{||\[|\]', '', tmp).split(',')]