# Data Preparation

## Download Goodreads Dataset

Download the UCSD Book Graph Dataset and save to the data directory.

### Dataset Information
* 2.36M books with meta-data
* 15M reviews

### Resource Link
[UCSD Book Graph](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home)


In [None]:
%pip install gdown
!mkdir data/goodreads
!gdown --fuzzy 'https://drive.google.com/uc?id=1LXpK1UfqtP89H1tYy0pBGHjYk8IhigUK' -O data/goodreads/
!wget https://drive.google.com/uc?id=19cdwyXwfXx_HDIgxXaHzH0mrx8nMyLvC -O data/goodreads/goodreads_book_authors.json.gz
!gdown --fuzzy 'https://drive.google.com/uc?id=1op8D4e5BaxU2JcPUgxM3ZqrodajryFBb' -O data/goodreads/
!gzip -d data/goodreads/goodreads_book_authors.json.gz
!gzip -d data/goodreads/goodreads_books.json.gz
!gzip -d data/goodreads/goodreads_book_series.json.gz

## 1. Reverse Index Data

Use book title, description, and book id to build the index.

In [9]:
import json
from tqdm import tqdm
import codecs

In [None]:
with codecs.open('data/goodreads/goodreads_books.json', 'r', encoding='utf-8') as fin:
    text = fin.readlines()
print(len(text))

In [None]:
with codecs.open('data/goodreads/goodreads_book_authors.json', 'r', encoding='utf-8') as fin:
    authors = fin.readlines()
print(len(authors))

In [None]:
authors[0]

In [None]:
authormap = dict()
for author_info in tqdm(authors):
    author_info = json.loads(author_info)
    authormap[int(author_info['author_id'])] = author_info['name']

In [None]:
with codecs.open('data/goodreads/goodreads_book_series.json', 'r', encoding='utf-8') as fin:
    series = fin.readlines()
print(len(series))

In [None]:
seriemap = dict()
for serie in tqdm(series):
    serie = json.loads(serie)
    seriemap[int(serie['series_id'])] = (serie['title'], serie['description'])

In [None]:
with codecs.open('data/goodreads/book_index_data.json', 'w', encoding='utf-8') as fout:
    for book in tqdm(text):
        meta = json.loads(book)
        book_info = dict()
        description = meta['description']
        book_info['book_id'] = int(meta['book_id'])
        book_info['title'] = meta['title']
        book_info['description'] = description
        author_list = []
        for author in meta['authors']:
            author_list.append(authormap[int(author['author_id'])])
        book_info['author_list'] = author_list
        fout.write(json.dumps(book_info, ensure_ascii=False) + '\n')

## 2. Database Data

In [None]:
with codecs.open('data/goodreads/book_database_data.json', 'w', encoding='utf-8') as fout:
    for book in tqdm(text):
        meta = json.loads(book)
        book_info = dict()
        description = meta['description']
        author_list = []
        for author in meta['authors']:
            author_list.append(authormap[int(author['author_id'])])
        meta['author_list'] = author_list
        series_list = []
        for series in meta['series']:
            series_list.append(seriemap[int(series)])
        meta['series_list'] = series_list
        fout.write(json.dumps(meta, ensure_ascii=False) + '\n')

## Prepare book data with reviews

### 1. Load book data

In [2]:
from index_server.cache_dict.cache_dict import CacheDict
import json
import codecs
from collections import deque

In [2]:
book_data = CacheDict(100000, 'data/goodreads/book_database_with_review/')

In [3]:
with codecs.open('data/goodreads/book_database_data.json', 'r', encoding='utf-8') as fin:
    num = 0
    errors = 0
    while True:
        num += 1
        if num % 100000 == 0: print(num)
        line = fin.readline()
        if not line:
            break
        try:
            doc = json.loads(line)
            book_id = doc['book_id']
            doc['comments'] = deque()
            book_data[book_id] = doc
        except:
            print(line)
            errors += 1

        

100000
200000
300000
400000
{"isbn": "8807173174", "text_reviews_count": "6", "series": [], "country_code": "US", "language_code": "ita", "popular_shelves": [{"count": "22", "name": "to-read"}, {"count": "2", "name": "currently-reading"}, {"count": "1", "name": "tentative-to-read-list"}, {"count": "1", "name": "sexualized-violence"}, {"count": "1", "name": "to-read-also-published-in-italian"}, {"count": "1", "name": "to-read-maybe"}], "asin": "", "is_ebook": "false", "average_rating": "3.78", "kindle_asin": "", "similar_books": [], "description": "Emiliano Fittipaldi e stato processato in Vaticano nel 2016 per un libro, \"Avarizia. Le carte che svelano ricchezza, scandali e segreti della Chiesa di Francesco\", di cui si e parlato in tutto il mondo. Nel 2017 torna con una nuova inchiesta. \"Da qualche tempo sto scartabellando nuovi documenti riservati, intercettazioni delle procure italiane e straniere, atti di commissioni internazionali. Sto incontrando preti e monsignori che mi raccon

In [11]:
errors

6

In [14]:
with codecs.open('data/goodreads/goodreads_reviews_dedup.json', 'r', encoding='utf-8') as fin:
    num = 0
    errors = 0
    while True:
        num += 1
        if num % 100000 == 0: print(num)
        line = fin.readline()
        if not line:
            break
        try:
            review = json.loads(line)
            book_id = review['book_id']
            user_id = review['user_id']
            review_text = review['review_text']
            score = review['rating']
            
            book_data[book_id]['comments'].append((user_id, review_text, score))
        except:
            print(line)
            errors += 1
    

{"user_id": "e9651bbea324fe1c77cd0756d7ff1370", "book_id": "27259867", "review_id": "e36faa8cc46f0381e09af833dece3d77", "rating": 5, "review_text": "Cormac or \"Mac\" as he likes to be called is set and happy with the way his life is. He is successful in his business and the way everything is going in his life. \n Andi is having a rough time - her dad beat her and she ended up in the hospital. She meets Stormy Harrison and he is gonna help her. When she meets Mac and he tells her she is his mate she tries to run because what she has heard about mates scares her to death and she wants nothing to do with it. \n Can Mac convince her that she has it all wrong and she stay with him? This is a definite must read new series by Kathi S. Barton and would recommend to everyone.", "date_added": "Mon Feb 08 05:09:59 -0800 2016", "date_updated": "Sun Jul 31 20:53:32 -0700 2016", "read_at": "Sun Feb 07 00:00:00 -0800 2016", "started_at": "Thu Feb 04 00:00:00 -0800 2016", "n_votes": 0, "n_comments": 

In [3]:
with codecs.open('data/goodreads/goodreads_reviews_dedup.json', 'r', encoding='utf-8') as fin:
    num = 0
    errors = 0
    while True:
        num += 1
        if num % 10000 == 0: print(num)
        line = fin.readline()
        print(line)
        break

{"user_id": "8842281e1d1347389f2ab93d60773d4d", "book_id": "24375664", "review_id": "5cd416f3efc3f944fce4ce2db2290d5e", "rating": 5, "review_text": "Mind blowingly cool. Best science fiction I've read in some time. I just loved all the descriptions of the society of the future - how they lived in trees, the notion of owning property or even getting married was gone. How every surface was a screen. \n The undulations of how society responds to the Trisolaran threat seem surprising to me. Maybe its more the Chinese perspective, but I wouldn't have thought the ETO would exist in book 1, and I wouldn't have thought people would get so over-confident in our primitive fleet's chances given you have to think that with superior science they would have weapons - and defenses - that would just be as rifles to arrows once were. \n But the moment when Luo Ji won as a wallfacer was just too cool. I may have actually done a fist pump. Though by the way, if the Dark Forest theory is right - and I see

In [9]:
max(ids)

36530431

In [10]:
min(ids)

1

### 2. Generate the file for index building

In [2]:
import json
import codecs

In [2]:
total = 0
with codecs.open('data2/data/book_database_with_review.json', 'r') as fin, codecs.open('data2/data/book_index_with_review.json', 'w') as fout:
    while True:
        line = fin.readline()
        total += 1
        if not line:
            break
        if total % 100000 == 0:
            print(total)
        doc = json.loads(line)
        o_doc = dict()
        o_doc['book_id'] = int(doc['book_id'])
        o_doc['title'] = doc['title']
        o_doc['description'] = doc['description']
        o_doc['author_list'] = doc['author_list']
        reviews = list(map(lambda x: x[1], doc['comments']))
        o_doc['comments'] = reviews
        o = json.dumps(o_doc, ensure_ascii=False)
        fout.write(o + '\n')

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000


In [5]:
line

''

In [8]:
samples = 0
with codecs.open('data2/data/book_database_with_review.json', 'r') as fin, codecs.open('data2/data/book_index_with_review_samples.json', 'w') as fout:
    while True:
        line = fin.readline()
        samples += 1
        if not line:
            break
        if samples % 100000 == 0 and samples > 0:
            break
        doc = json.loads(line)
        o_doc = dict()
        o_doc['book_id'] = int(doc['book_id'])
        o_doc['title'] = doc['title']
        o_doc['description'] = doc['description']
        o_doc['author_list'] = doc['author_list']
        reviews = list(map(lambda x: x[1], doc['comments']))
        o_doc['comments'] = reviews
        o = json.dumps(o_doc, ensure_ascii=False)
        fout.write(o + '\n')

### 3. Prepare bookid2score mapping

In [1]:
import json
import codecs

In [4]:
mapping = dict()
total = 0
errors = 0
with codecs.open('data2/data/book_database_with_review.json', 'r') as fin, codecs.open('data2/data/book_id2score_mapping.txt', 'w') as fout:
    while True:
        line = fin.readline()
        total += 1
        if not line:
            break
        if total % 100000 == 0:
            print(total)
        doc = json.loads(line)
        bookid = doc['book_id']
        score = doc['average_rating']
        try:
            score_value = float(score)
        except:
            print(bookid, score)
            score = '0.0'
            errors += 1
        fout.write(' '.join([bookid, score]) + '\n')

23699819 
2597774 
18521522 
28253116 
17796597 
17837584 
25232185 
25763989 
31950676 
6715476 
25751414 
36324095 
23591342 
32078957 
18682291 
31128368 
23247803 
31754188 
18982051 
100000
35623864 
18683818 
18683811 
18458333 
30309361 
17565405 
18297314 
18688848 
23690565 
18284915 
36402694 
28116322 
12400154 
18398266 
34037641 
25382888 
18665565 
23783294 
36348666 
200000
36355252 
36408306 
17997211 
36285213 
22728283 
17786900 
18683854 
28431756 
29558474 
25222333 
22059000 
32721297 
18217981 
18488895 
30067868 
23920475 
22873397 
24990927 
20813106 
17796601 
11157038 
34115997 
34193593 
21474080 
18746353 
300000
18753867 
35422475 
1473309 
22738842 
20661873 
23451913 
36258875 
33804656 
35795730 
16687061 
15724994 
32050496 
26040038 
13604715 
23151596 
29539380 
34812406 
17838351 
17974554 
24033758 
35534345 
23015177 
18301045 
18301047 
18301042 
400000
36435770 
27846601 
25358484 
36136268 
35531955 
18685432 
23593961 
32933014 
6566555 
259657

In [3]:
print(errors)

524
