# Office Title Extraction
Ding Zhang

This ipynb file extracts office titles from the given source file (original text) and the target file (translation file)

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import opencc
import os

In [2]:
directory = []
for dic in os.listdir('../ngram/'):
    if dic != '.ipynb_checkpoints':
        directory.append(dic)
directory

['N-gram-beiqi',
 'N-gram-beishi',
 'N-gram-wei',
 'N-gram-yuan',
 'N-gram-nanshi',
 'N-gram-xinwudai',
 'N-gram-nanqi',
 'N-gram-shiji',
 'N-gram-houhan',
 'N-gram-zhoushu',
 'N-gram-taiping',
 'N-gram-songshi',
 'N-gram-xuxia',
 'N-gram-xintang',
 'N-gram-jiuwudai',
 'N-gram-jiutang',
 'N-gram-ming',
 'N-gram-jin',
 'N-gram-liang',
 'N-gram-han',
 'N-gram-zizhi',
 'N-gram-liao',
 'N-gram-jinshi',
 'N-gram-chen',
 'N-gram-suishu',
 'N-gram-shuijing']

In [3]:
source = 'shuijing'

In [4]:
# Read in N-grams
ngram = []
n10 = pd.read_csv('../ngram/N-gram-%s/n_10.csv' % source).rename(columns={'原文':'source', '翻译':'target'})
n9 = pd.read_csv('../ngram/N-gram-%s/n_9.csv' % source).rename(columns={'原文':'source', '翻译':'target'})
n8 = pd.read_csv('../ngram/N-gram-%s/n_8.csv' % source).rename(columns={'原文':'source', '翻译':'target'})
n7 = pd.read_csv('../ngram/N-gram-%s/n_7.csv' % source).rename(columns={'原文':'source', '翻译':'target'})
n6 = pd.read_csv('../ngram/N-gram-%s/n_6.csv' % source).rename(columns={'原文':'source', '翻译':'target'})
n5 = pd.read_csv('../ngram/N-gram-%s/n_5.csv' % source).rename(columns={'原文':'source', '翻译':'target'})
ngram.append(n10)
ngram.append(n9)
ngram.append(n8)
ngram.append(n7)
ngram.append(n6)
ngram.append(n5)

In [5]:
# Filter out rows that are 0 
for i in range(len(ngram)):
    ngram[i] = ngram[i][(ngram[i]['source'] != 0)]
    ngram[i] = ngram[i][(ngram[i]['target'] != 0)]

In [6]:
# Read in list of numbers
f = open('../assets/numbers.txt', 'r')
num_list = f.readlines()
for i in range(len(num_list)):
    num_list[i] = num_list[i].strip()

In [7]:
# Add to dictionary
office_title = {}
for i in range(len(ngram)):
    for index, row in ngram[i].iterrows():
        office_title[row['N-gram']] = row['source']

In [8]:
titles = list(office_title.keys())
freq = list(office_title.values())

In [9]:
# Extract strings with at least two numbers (meaning the string is a year index not an office title)
for i in range(len(titles)):
    count = 0
    for num in num_list:
        if num in titles[i]:
            count += 1
    if count > 1:
        titles[i] = ''
        freq[i] = ''

# Extract empty strings
titles = [item for item in titles if item != '']
freq = [f for f in freq if f != '']
len(titles)

3863

In [10]:
# Brute force to increase accuracy
for i in range(len(titles)):
    if titles[i][-1] == "人":
        titles[i] = ''
        freq[i] = ''

titles = [item for item in titles if item != '']
freq = [f for f in freq if f != '']
len(titles) == len(freq)

True

In [11]:
len(titles)

3846

In [12]:
# Keep an copy of the original titles and frequencies for sanity check
f = open('../test_files/orgtitle_freq.txt', 'w')
for i in range(len(titles)):
    f.write(titles[i] + ":" + str(freq[i]) + "\n")
f.close()

In [13]:
# merge similar duo-grams into a larger one   同中书门下平，中书门下平章 ==> 同中书门下平章事
def merge_duogram(title):
    for i in tqdm(range(len(title))):
        if title[i] != '':
            pivot = 1
            for j in range(i+1, len(title)):
                if (title[j][0] == title[i][pivot]) and (title[j][-2] == title[i][-1]):
                    title[i] += title[j][-1]
                    title[j] = ''
                    freq[j] = ''
                    pivot += 1
                else:
                    i += j
                    pivot = 0
                    break

In [14]:
merge_duogram(titles)

100%|██████████| 3846/3846 [00:00<00:00, 1629094.44it/s]


In [15]:
# drop all empty strings
titles = [item for item in titles if item != '']
freq = [f for f in freq if f != '']
len(titles) == len(freq)

True

In [16]:
len(titles)

2434

In [17]:
len(titles) == len(freq)

True

In [18]:
# sanity check
s1 = []
s2 = []
s3 = []
s4 = []
s5 = []

In [19]:
# Apply the algorithm
# for k in range(4):
for i in tqdm(range(len(titles) - 1)):
    if titles[i] != '':
        for j in range(i+1, len(titles)):
            if titles[j] != '':
                fratio = freq[i] / freq[j]
                if fratio == 1:
                    # must be substring
                    if len(titles[i]) > len(titles[j]):
                        if titles[j] in titles[i] and (titles[i].find(titles[j])) == 0:
                            s1.append((titles[j], titles[i])) 
                            titles[j] = ''
                            freq[j] = ''
                            
for i in tqdm(range(len(titles) - 1)):
    if titles[i] != '':
        for j in range(i+1, len(titles)):
            if titles[j] != '':
                fratio = freq[i] / freq[j]
                if fratio < 1:
                    if len(titles[i]) > len(titles[j]):
                        if (titles[j] in titles[i]): 
                            if (titles[i].find(titles[j])) != 0: #南转运判官，淮南转运判官，湖南转运判官
                                if (freq[j] - freq[i]) <= 1.5*freq[i]:
                                    s2.append((titles[j], titles[i]))
                                    titles[j] = ''
                                    freq[j] = ''
                                    break
                            else:
                                if (freq[j] - freq[i]) <= 1.5*freq[i]:
                                    s3.append((titles[j], titles[i]))
                                    titles[j] = ''
                                    freq[j] = ''
                                    break
                                else:
                                    s4.append((titles[i], titles[j]))
                                    titles[i] = ''
                                    freq[i] = ''
                                    break

100%|██████████| 2433/2433 [00:01<00:00, 2130.59it/s]
100%|██████████| 2433/2433 [00:00<00:00, 4081.41it/s]


In [20]:
titles = [item for item in titles if item != '']
freq = [f for f in freq if f != '']
len(titles) == len(freq)

True

In [21]:
len(titles)

2346

In [22]:
# Extract words with frequency only 1, mostly likely meaningless string came from merging duograms
for i in range(len(freq)):
    if freq[i] == 1:
        freq[i] = ''
        titles[i] = ''
titles = [item for item in titles if item != '']
freq = [f for f in freq if f != '']
len(titles) == len(freq)

True

In [23]:
len(titles)

136

In [24]:
# sanity check
freq_extract = {key : value for key, value in zip(titles, freq)} 
f = open('../test_files/freq_extract.txt', 'w')
for key in freq_extract:
    f.write(key + ":" + str(freq_extract[key]) + "\n")
f.close()

In [25]:
# sanity check
f = open('../test_files/s1.txt', 'w')
for i in range(len(s1)):
    f.write(str(s1[i]))
    f.write('\n')
f.close()

f = open('../test_files/s2.txt', 'w')
for i in range(len(s2)):
    f.write(str(s2[i]))
    f.write('\n')
f.close()

f = open('../test_files/s3.txt', 'w')
for i in range(len(s3)):
    f.write(str(s3[i]))
    f.write('\n')
f.close()

f = open('../test_files/s4.txt', 'w')
for i in range(len(s4)):
    f.write(str(s4[i]))
    f.write('\n')
f.close()

f = open('../test_files/s5.txt', 'w')
for i in range(len(s5)):
    f.write(str(s5[i]))
    f.write('\n')
f.close()

len(titles)

136

In [26]:
# brute force to increase accuracy
for i in range(len(titles)):
    if "帝" in titles[i]:
        titles[i] = ''

for i in range(len(titles)):
    for num in num_list:
        if titles[i] != '' and titles[i][-1] == num:
            titles[i] = ''
            break
    
titles = [item for item in titles if item != '']
len(titles)

103

In [27]:
# Remerge substrings to meaningful larger string: 宁军节度使 ==> 广宁军节度使 ...
merged = []
for i in range(len(titles)-1):
    for j in range(1, len(titles)):
        if (titles[i] != '' and titles[j] != ''):
            if (titles[i] in titles[j]):
                if len(titles[j]) - len(titles[i]) == 1:
                    merged.append((titles[i], titles[j]))
                    titles[i] = ''
            elif (titles[j] in titles[i]):
                if (len(titles[i]) - len(titles[j])) == 1:
                    merged.append((titles[j], titles[i]))
                    titles[j] = ''
titles = [item for item in titles if item != '']

len(titles)

100

In [28]:
# Brute force to increase accuracy : if the title comes with a verb, get rid of that verb
verb_list = ['号', '赠', '进', '封', '领', '除', '置', '提举', '令', '命', '请', '议', '试', '迁', '诏', '赐', '加', '率', '遣', '时']
verb_extract = []
for i in range(len(titles)):
    for v in verb_list:
        if titles[i][0:len(v)] == v:
            verb_extract.append(titles[i])
            titles[i] = titles[i][len(v):]

In [29]:
titles = list(set(titles))

# sort the words starting with the same character

for i in range(len(titles)-1):
    cur = i+1
    for j in range(i+1, len(titles)):
        if titles[j][0] == titles[i][0]:
            replaced = titles[cur]
            titles[cur] = titles[j]
            titles[j] = replaced
            cur += 1
    i = cur           

titles = sorted(titles, reverse=True)

len(titles)

84

In [30]:
f = open('../test_files/sorted.txt', 'w')
for title in titles:
    f.write(title + "\n")
f.close()

f = open('../test_files/verb.txt', 'w')
for title in verb_extract:
    f.write(title + "\n")
f.close()

f = open('../test_files/merged.txt', 'w')
for item in merged:
    f.write(str(item) + "\n")
f.close()

In [32]:
# The list contains lots of n=4 useful office titles. 
# Therefore, set threshold to be 4 representing 4 grams. 
n4gram = dict()
for i in range(len(titles)):
    if len(titles[i]) >= 4:
        n4 = titles[i][0:4]
        if n4 not in n4gram:
            n4gram[n4] = 1
        else:
            n4gram[n4] += 1

n4gram = {key:val for key, val in n4gram.items() if val >= 8}
n4gram

{}

In [33]:
for i in range(len(titles)):
    for key in n4gram:
        if titles[i][0:4] == key:
            titles[i] = key

In [34]:
titles = list(set(titles))

# sort same few words

for i in range(len(titles)-1):
    cur = i+1
    for j in range(i+1, len(titles)):
        if titles[j][0] == titles[i][0]:
            replaced = titles[cur]
            titles[cur] = titles[j]
            titles[j] = replaced
            cur += 1
    i = cur           

titles = sorted(titles, reverse=True)

len(titles)

84

In [35]:
# Saving file
f = open('../titles_extracted/%s_titles.txt' % source, 'w')
for title in titles:
    f.write(title + "\n")
f.close()

In [36]:
address = pd.read_csv('../cbdb_named_entities/cbdb_entity_addresses.csv')

In [37]:
address.sample(3)

Unnamed: 0,addr_chn
10102,灤縣
3486,砷農架林區
6607,臨山衛


In [38]:
offices = pd.read_csv('../cbdb_named_entities/cbdb_entity_offices.csv')

In [39]:
offices.sample(3)

Unnamed: 0,office_name,dynasty_index
26628,貢院搜檢,20.0
5630,遙郡節度觀察留後,15.0
29465,詳定官,15.0


In [40]:
# Convert simplified chinese to traditional chinese
for i in tqdm(range(len(titles))):
    converter = opencc.OpenCC('s2t.json')
    titles[i] = converter.convert(titles[i])  

100%|██████████| 84/84 [00:01<00:00, 48.03it/s]


In [41]:
titles[0]

'黃水北流注'

In [42]:
df = pd.DataFrame(titles)

In [43]:
df['faddress_first2'] = 0       # first two words of the title match the first two words of sample address
df['laddress_first2'] = 0       # last two words of the title match the first two words of sample address
df['faddress_last2'] = 0         
df['laddress_last2'] = 0
df['foffice_first2'] = 0
df['loffice_first2'] = 0
df['foffice_last2'] = 0
df['loffice_last2'] = 0

In [44]:
# Compare with CBDB based on first or last 2 words in order to get rid of office titles that contain names at the end of the strings
for i in tqdm(range(len(titles))):
    for j in address.index:
        # tries to match the first two words of the title 
        if titles[i][:2] == address['addr_chn'][j][:2]:      
            df['faddress_first2'][i] = 1
            break
        if titles[i][-2:] == address['addr_chn'][j][:2]:
            df['laddress_first2'][i] = 1
            break
        if titles[i][:2] == address['addr_chn'][j][-2:]:
            df['faddress_last2'][i] = 1
            break
        if titles[i][-2:] == address['addr_chn'][j][-2:]:
            df['laddress_last2'][i] = 1
            break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['faddress_first2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['laddress_first2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['faddress_last2'][i] = 1
100%|██████████| 84/84 [00:15<00:00,  5.27it/s]


In [45]:
for i in tqdm(range(len(titles))):
    for j in offices.index:
        if titles[i][:2] == offices['office_name'][j][:2]:
            df['foffice_first2'][i] = 1
            break
        if titles[i][-2:] == offices['office_name'][j][:2]:
            df['loffice_first2'][i] = 1
            break
        if titles[i][:2] == offices['office_name'][j][-2:]:
            df['foffice_last2'][i] = 1
            break
        if titles[i][-2:] == offices['office_name'][j][-2:]:
            df['loffice_last2'][i] = 1
            break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['loffice_first2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['foffice_first2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['foffice_last2'][i] = 1
100%|██████████| 84/84 [00:35<00:00,  2.38it/s]


In [46]:
df.sample(3)

Unnamed: 0,0,faddress_first2,laddress_first2,faddress_last2,laddress_last2,foffice_first2,loffice_first2,foffice_last2,loffice_last2
38,沁水縣西北,0,1,0,0,0,1,0,0
6,雍州刺史諸葛緒,1,0,0,0,0,0,0,0
74,仙人王子喬,0,0,0,0,0,0,0,0


In [47]:
df.to_csv('../test_files/compare_office_original.csv')

In [48]:
df = df[(df.sum(axis=1) != 0)]

  df = df[(df.sum(axis=1) != 0)]


In [49]:
df = df[(df['laddress_last2'] != 0) | (df['loffice_last2'] != 0)]

In [50]:
df.to_csv('../test_files/compare_office_nonzero.csv')

In [51]:
title_CBDB = pd.read_csv('../test_files/compare_office_nonzero.csv')

In [52]:
title_CBDB

Unnamed: 0.1,Unnamed: 0,0,faddress_first2,laddress_first2,faddress_last2,laddress_last2,foffice_first2,loffice_first2,foffice_last2,loffice_last2


In [53]:
titles = title_CBDB.iloc[:,1].tolist()

In [54]:
# Saving file
f = open('../titles_extracted/%s_titles_cbdb.txt' % source, 'w')
for title in titles:
    f.write(title + "\n")
f.close()