In [2]:
import os
import json
import pickle
import numpy as np
from tqdm import tqdm
from collections import defaultdict

## Read file

In [3]:
# read dblp_whole_data
data = pickle.load(open('books_filtered.pkl','rb'))

In [3]:
data[0]

{'similar_books': ['8709549',
  '17074050',
  '28937',
  '158816',
  '228563',
  '11296581',
  '1073987',
  '7298465',
  '1274862',
  '18290554',
  '86382',
  '343067',
  '2774907',
  '663892',
  '1233981',
  '298912',
  '401091',
  '307575'],
 'description': 'Anita Diamant\'s international bestseller "The Red Tent" brilliantly re-created the ancient world of womanhood. Diamant brings her remarkable storytelling skills to "Good Harbor" -- offering insight to the precarious balance of marriage and career, motherhood and friendship in the world of modern women. The seaside town of Gloucester, Massachusetts is a place where the smell of the ocean lingers in the air and the rocky coast glistens in the Atlantic sunshine. When longtime Gloucester-resident Kathleen Levine is diagnosed with breast cancer, her life is thrown into turmoil. Frightened and burdened by secrets, she meets Joyce Tabachnik -- a freelance writer with literary aspirations -- and a once-in-a-lifetime friendship is born. 

In [4]:
def remove_next_line(text):
    t = ' '.join(text.strip().split('\n'))
    
    return ' '.join(t.split('\t'))

In [5]:
# collect all genres

genres_set = set()

for d in tqdm(data):
    for g in d['genres']:
        genres_set.add(g)
        
print(len(genres_set))
print(genres_set)

100%|███████████████████████████████████████████████████████████████████████████| 1098617/1098617 [00:05<00:00, 195608.06it/s]

10
{'children', 'fiction', 'poetry', 'young-adult', 'history, historical fiction, biography', 'fantasy, paranormal', 'non-fiction', 'mystery, thriller, crime', 'comics, graphic', 'romance'}





In [6]:
# generate title_fos_dict: key title, value fos_list

title_fos_dict = {}
cnt = 0

for p in tqdm(data):
    book_text = remove_next_line(p['title']+p['description'])
    if book_text in title_fos_dict:
        cnt += 1
    if 'genres' in p:
        title_fos_dict[book_text] = p['genres']
    else:
        title_fos_dict[book_text] = []
    
print(cnt)
print(len(title_fos_dict))

100%|███████████████████████████████████████████████████████████████████████████| 1098617/1098617 [00:07<00:00, 140382.65it/s]

159010
939607





In [7]:
title_fos_dict

{'Good HarborAnita Diamant\'s international bestseller "The Red Tent" brilliantly re-created the ancient world of womanhood. Diamant brings her remarkable storytelling skills to "Good Harbor" -- offering insight to the precarious balance of marriage and career, motherhood and friendship in the world of modern women. The seaside town of Gloucester, Massachusetts is a place where the smell of the ocean lingers in the air and the rocky coast glistens in the Atlantic sunshine. When longtime Gloucester-resident Kathleen Levine is diagnosed with breast cancer, her life is thrown into turmoil. Frightened and burdened by secrets, she meets Joyce Tabachnik -- a freelance writer with literary aspirations -- and a once-in-a-lifetime friendship is born. Joyce has just bought a small house in Gloucester, where she hopes to write as well as vacation with her family. Like Kathleen, Joyce is at a fragile place in her life. A mutual love for books, humor, and the beauty of the natural world brings the 

## generation from link prediction train set

In [8]:
## obtain fos_list for the query in each line

# read train data
train_texts = []
with open('book_text/train_text.tsv') as f:
    read_data = f.readlines()
    for line in tqdm(read_data):
        query, _ = line.strip().split('\$\$')
        query = query.split('\t')
        assert len(query) == 16
        train_texts.append(query[0])

100%|███████████████████████████████████████████████████████████| 768172/768172 [00:16<00:00, 46404.89it/s]


In [10]:
# generate fos_list for texts_list

fos_list = []
fos_cnt_dict = defaultdict(int)
#cnt = 0

for text in tqdm(train_texts):
    #if text not in title_fos_dict:
    #    cnt += 1
    #    continue
    
    fos_list.append(title_fos_dict[text])
    for f in title_fos_dict[text]:
        fos_cnt_dict[f] += 1

print(len(fos_cnt_dict))
#print(cnt)

100%|██████████████████████████████████████████████████████████| 768172/768172 [00:03<00:00, 211979.39it/s]

10





In [11]:
len(fos_list)

768172

In [12]:
# generate fos & cnt pair
fos_cnt_list = []

for f in tqdm(fos_cnt_dict):
    fos_cnt_list.append((f,fos_cnt_dict[f]))

fos_cnt_list = sorted(fos_cnt_list, key=lambda x:-x[1])
print(fos_cnt_list)

100%|██████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 134003.32it/s]

[('fiction', 617895), ('romance', 314878), ('history, historical fiction, biography', 307926), ('mystery, thriller, crime', 269127), ('fantasy, paranormal', 262237), ('young-adult', 197843), ('non-fiction', 192150), ('children', 98487), ('comics, graphic', 66731), ('poetry', 32513)]





In [14]:
# select topk for as classification types

type_list = []
k = 10

for i in range(k):
    type_list.append(fos_cnt_list[i][0])

print(type_list)

['fiction', 'romance', 'history, historical fiction, biography', 'mystery, thriller, crime', 'fantasy, paranormal', 'young-adult', 'non-fiction', 'children', 'comics, graphic', 'poetry']


In [15]:
# see how many paper do not have any fos

cnt = 0

labels = []

for fos in tqdm(fos_list):
    tmp_fos = [f for f in fos]
    if len(set(tmp_fos) & set(type_list)) == 0:
        cnt += 1
        
print(f'{cnt} paper do not have label')

100%|██████████████████████████████████████████████████████████| 768172/768172 [00:01<00:00, 413368.27it/s]

7015 paper do not have label





In [16]:
# generate label

labels = []

for fos in tqdm(fos_list):
    tmp_label = []
    tmp_fos = set([f for f in fos])
    for i in range(len(type_list)):
        if type_list[i] in tmp_fos:
            tmp_label.append(1)
        else:
            tmp_label.append(0)
    labels.append(tmp_label)
    assert len(tmp_label) == len(type_list)
assert len(labels) == len(fos_list)

print(len(labels))

np.save('/shared/data2/bowenj4/textnet/HeterformerClassifier/book_embed/transductive_label.npy', np.array(labels))

100%|███████████████████████████████████████████████████████████| 768172/768172 [00:29<00:00, 26430.21it/s]


768172


In [17]:
# save type_list

with open('/shared/data2/bowenj4/textnet/HeterformerClassifier/book_embed/class_name.txt','w') as fout:
    for i, t in enumerate(type_list):
        fout.write(str(i)+'\t'+t+'\n')

## generation from link prediction test set

In [18]:
## obtain fos_list for the query in each line

# read test data
test_texts = []
with open('book_text/test_text.tsv') as f:
    read_data = f.readlines()
    for line in tqdm(read_data):
        query, _ = line.strip().split('\$\$')
        query = query.split('\t')
        assert len(query) == 16
        test_texts.append(query[0])

100%|███████████████████████████████████████████████████████████| 219502/219502 [00:06<00:00, 31860.78it/s]


In [19]:
# generate fos_list for texts_list

fos_list = []
fos_cnt_dict = defaultdict(int)

for text in tqdm(test_texts):
    fos_list.append(title_fos_dict[text])
    for f in title_fos_dict[text]:
        fos_cnt_dict[f] += 1

print(len(fos_cnt_dict))

100%|██████████████████████████████████████████████████████████| 219502/219502 [00:01<00:00, 128162.00it/s]

10





In [20]:
# generate fos & cnt pair
fos_cnt_list = []

for f in tqdm(fos_cnt_dict):
    fos_cnt_list.append((f,fos_cnt_dict[f]))

fos_cnt_list = sorted(fos_cnt_list, key=lambda x:-x[1])
print(fos_cnt_list)

100%|██████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 108379.95it/s]

[('fiction', 176506), ('romance', 89484), ('history, historical fiction, biography', 88251), ('mystery, thriller, crime', 76974), ('fantasy, paranormal', 75269), ('young-adult', 56462), ('non-fiction', 54886), ('children', 27782), ('comics, graphic', 18961), ('poetry', 9341)]





In [21]:
# see how many paper do not have any fos

cnt = 0

labels = []

for fos in tqdm(fos_list):
    tmp_fos = [f for f in fos]
    if len(set(tmp_fos) & set(type_list)) == 0:
        cnt += 1
        
print(f'{cnt} paper do not have label')

100%|██████████████████████████████████████████████████████████| 219502/219502 [00:00<00:00, 285778.18it/s]

2036 paper do not have label





In [22]:
# generate label

labels = []

for fos in tqdm(fos_list):
    tmp_label = []
    tmp_fos = set([f for f in fos])
    for i in range(len(type_list)):
        if type_list[i] in tmp_fos:
            tmp_label.append(1)
        else:
            tmp_label.append(0)
    labels.append(tmp_label)
    assert len(tmp_label) == len(type_list)
assert len(labels) == len(fos_list)

np.save('/shared/data2/bowenj4/textnet/HeterformerClassifier/book_embed/inductive_label.npy', np.array(labels))

100%|███████████████████████████████████████████████████████████| 219502/219502 [00:02<00:00, 95099.93it/s]


## save papers (train & test set together)

In [23]:
print(len(train_texts), len(test_texts))

768172 219502


In [24]:
# save train_texts concate with test_texts
with open('/shared/data2/bowenj4/textnet/HeterformerClassifier/book_embed/train_test_text.tsv','w') as fout:
    for train_text in tqdm(train_texts):
        fout.write(train_text+'\n')
    for test_text in tqdm(test_texts):
        fout.write(test_text+'\n')

100%|██████████████████████████████████████████████████████████| 768172/768172 [00:02<00:00, 289038.89it/s]
100%|██████████████████████████████████████████████████████████| 219502/219502 [00:00<00:00, 468947.61it/s]


## Generate data for author classification

In [7]:
train_authors_id2idx = pickle.load(open('neighbour/author_id2idx_dict.pkl','rb'))
author_neighbours = pickle.load(open('neighbour/author_neighbour.pkl','rb'))

In [8]:
train_authors_idx2id = {train_authors_id2idx[a]:a for a in tqdm(train_authors_id2idx)}

100%|█████████████████████████████████████████████████████████████████████████████| 205891/205891 [00:00<00:00, 901525.47it/s]


In [9]:
print(len(train_authors_idx2id))

205891


In [10]:
type_list = ['fiction', 'romance', 'history, historical fiction, biography', 'mystery, thriller, crime', 'fantasy, paranormal', 'young-adult', 'non-fiction', 'children', 'comics, graphic', 'poetry']

In [12]:
## construct author fos dict
author_fos_dict = {} # key: author, value: fos count dict

for a in tqdm(author_neighbours):
    assert a not in author_fos_dict
    author_fos_dict[a] = {f:0 for f in type_list}
    for p in author_neighbours[a]:
        if p not in title_fos_dict:
            cnt += 1
            continue
        for fos in title_fos_dict[p]:
            if fos in type_list:
                author_fos_dict[a][fos] += 1
                
print(cnt)

100%|██████████████████████████████████████████████████████████████████████████████| 205891/205891 [00:05<00:00, 35217.76it/s]

159015





In [14]:
author_fos_dict

{'5293': {'fiction': 267,
  'romance': 0,
  'history, historical fiction, biography': 22,
  'mystery, thriller, crime': 267,
  'fantasy, paranormal': 2,
  'young-adult': 0,
  'non-fiction': 0,
  'children': 0,
  'comics, graphic': 0,
  'poetry': 0},
 '7010240': {'fiction': 1,
  'romance': 11,
  'history, historical fiction, biography': 0,
  'mystery, thriller, crime': 2,
  'fantasy, paranormal': 1,
  'young-adult': 2,
  'non-fiction': 0,
  'children': 2,
  'comics, graphic': 0,
  'poetry': 0},
 '7816': {'fiction': 82,
  'romance': 13,
  'history, historical fiction, biography': 48,
  'mystery, thriller, crime': 1,
  'fantasy, paranormal': 1,
  'young-adult': 10,
  'non-fiction': 69,
  'children': 0,
  'comics, graphic': 3,
  'poetry': 77},
 '3388560': {'fiction': 1,
  'romance': 1,
  'history, historical fiction, biography': 0,
  'mystery, thriller, crime': 0,
  'fantasy, paranormal': 0,
  'young-adult': 0,
  'non-fiction': 0,
  'children': 0,
  'comics, graphic': 0,
  'poetry': 1},
 '

In [21]:
# generate label

labels = []

for aidx in tqdm(range(len(train_authors_idx2id))):
    aid = train_authors_idx2id[aidx]
    fos = author_fos_dict[aid]
    fos_sum = 0
    
    tmp_label = []
    for i in range(len(type_list)):
        if fos[type_list[i]] >= 1:
            tmp_label.append(1)
        else:
            tmp_label.append(0)
    labels.append(tmp_label)
    assert len(tmp_label) == len(type_list)
assert len(labels) == len(train_authors_idx2id)

np.save('/shared/data2/bowenj4/textnet/HeterformerClassifier/book_embed/author_label.npy', np.array(labels))

100%|█████████████████████████████████████████████████████████████████████████████| 205891/205891 [00:01<00:00, 125960.52it/s]


In [22]:
labels

[[0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
 [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [1, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 1, 0, 1, 0, 0, 0, 0],
 [0, 0, 1, 1, 0, 0, 1, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
 [1, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
 [1, 0, 1, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 1, 1, 1, 0, 1, 0, 0],
 [1, 0, 1, 1, 1, 1, 0, 1, 1, 0],
 [1, 1, 1, 1, 1, 1, 1, 0, 1, 0],
 [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [1, 1, 0, 1, 1, 1, 0, 0, 0, 0],
 [0, 1, 0,