# Debug Links

Trying to debug why GoodReads links aren't working.

In [1]:
import pandas as pd
from bookdata.db import db_url

## Compare Book IDs

In [5]:
# Read the GoodReads book identifiers from the database, indexed by book ID.
# gr_isbn is renamed to gr_isbn10 so every database column is the Parquet
# column name with a gr_ prefix (isbn10, isbn13, asin).
db_ids = (
    pd.read_sql_table('book_ids', db_url(), schema='gr')
    .set_index('gr_book_id')
    .rename(columns={'gr_isbn': 'gr_isbn10'})
)
db_ids.head()

Unnamed: 0_level_0,gr_book_rid,gr_work_id,gr_asin,gr_isbn10,gr_isbn13
gr_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5333265,1,5400751.0,,312853122.0,9780312853129.0
1333909,2,1323437.0,,743509986.0,9780743509985.0
7327624,3,8948723.0,B00071IKUY,,
6066819,4,6243154.0,,743294297.0,9780743294294.0
287140,5,278577.0,,850308712.0,9780850308716.0


In [3]:
# Read the book identifiers from the Parquet file, indexed by book ID.
pq_ids = pd.read_parquet('goodreads/gr-book-ids.parquet').set_index('book_id')
pq_ids.head()

Unnamed: 0_level_0,work_id,isbn10,isbn13,asin
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5333265,5400751.0,312853122.0,9780312853129.0,
1333909,1323437.0,743509986.0,9780743509985.0,
7327624,8948723.0,,,B00071IKUY
6066819,6243154.0,743294297.0,9780743294294.0,
287140,278577.0,850308712.0,9780850308716.0,


In [6]:
# Join on the shared book-ID index. DataFrame.join defaults to a left join, so
# every database row is kept and gains the Parquet columns for the same book.
both_ids = db_ids.join(pq_ids)
both_ids.head()

Unnamed: 0_level_0,gr_book_rid,gr_work_id,gr_asin,gr_isbn10,gr_isbn13,work_id,isbn10,isbn13,asin
gr_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5333265,1,5400751.0,,312853122.0,9780312853129.0,5400751.0,312853122.0,9780312853129.0,
1333909,2,1323437.0,,743509986.0,9780743509985.0,1323437.0,743509986.0,9780743509985.0,
7327624,3,8948723.0,B00071IKUY,,,8948723.0,,,B00071IKUY
6066819,4,6243154.0,,743294297.0,9780743294294.0,6243154.0,743294297.0,9780743294294.0,
287140,5,278577.0,,850308712.0,9780850308716.0,278577.0,850308712.0,9780850308716.0,


Look for books whose ISBN-10, ISBN-13, or ASIN differs between the two sources:

In [13]:
def isbn_diff(field, df=None):
    """Return the rows whose database and Parquet values for ``field`` disagree.

    Compares column ``gr_{field}`` (database side) against column ``field``
    (Parquet side). A row counts as matching when both values are null or
    when they compare equal; every other row is returned.

    Parameters
    ----------
    field : str
        Unprefixed column name, e.g. ``'isbn10'``, ``'isbn13'`` or ``'asin'``.
    df : pandas.DataFrame, optional
        Frame holding both columns. Defaults to the notebook's ``both_ids``,
        looked up when the function is called.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` where the two columns differ.
    """
    if df is None:
        # Resolve at call time: a `df=both_ids` default would be bound once,
        # when this cell runs, and go stale if `both_ids` is rebuilt later.
        df = both_ids
    db_col = df[f'gr_{field}']
    pq_col = df[field]
    # NaN != NaN, so rows missing on both sides must be matched explicitly.
    matches = (db_col.isnull() & pq_col.isnull()) | (db_col == pq_col)
    return df[~matches]

In [14]:
# Books whose ISBN-10 differs between the database and the Parquet file.
isbn_diff('isbn10')

Unnamed: 0_level_0,gr_book_rid,gr_work_id,gr_asin,gr_isbn10,gr_isbn13,work_id,isbn10,isbn13,asin
gr_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
24717704,1178,23589581.0,,B00F8LQRDU,,23589581.0,008,,
8013976,1704,12568197.0,,951230922x,,12568197.0,951230922X,,
28099039,3118,48103788.0,,B0006PGPH6,,48103788.0,00066,,
22743798,3378,42286486.0,,B00LMDSA92,,42286486.0,0092,,
23592581,3699,23909277.0,,B00B7TG2DC,,23909277.0,0072,,
...,...,...,...,...,...,...,...,...,...
8882157,2358548,485998.0,,ISBN043590,,485998.0,043590,,
25597821,2358916,42786177.0,,B00Y2I8DB4,,42786177.0,00284,,
15851621,2358951,21598945.0,,Published:,,21598945.0,,,
2059836,2359302,2065019.0,,B000HT2KQG,,2065019.0,0002,,


In [15]:
# Books whose ISBN-13 differs between the database and the Parquet file.
isbn_diff('isbn13')

Unnamed: 0_level_0,gr_book_rid,gr_work_id,gr_asin,gr_isbn10,gr_isbn13,work_id,isbn10,isbn13,asin
gr_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
16081442,201,21880573.0,,,B009N841IM,21880573.0,,009841,
17859771,1185,25002827.0,,,B00CJOCEPO,25002827.0,,00,
18622540,1286,26408939.0,,,B00FIPTFYO,26408939.0,,00,
7672457,1428,10288208.0,,9654824795,B007V62UAY,10288208.0,9654824795,00762,
22994886,2113,45759196.0,,,B00IRNKY8K,45759196.0,,008,
...,...,...,...,...,...,...,...,...,...
20518296,2358575,36374558.0,,9788415336,B00F58LO02,36374558.0,9788415336,005802,
18631369,2358880,26424641.0,,9780992029,B00FI1S9Y0,26424641.0,9780992029,00190,
13091693,2359744,4781036.0,,,B002BU5QYW,4781036.0,,0025,
9670901,2359761,478697.0,,,OT4110110001,478697.0,,4110110001,


In [16]:
# Books whose ASIN differs between the database and the Parquet file.
isbn_diff('asin')

Unnamed: 0_level_0,gr_book_rid,gr_work_id,gr_asin,gr_isbn10,gr_isbn13,work_id,isbn10,isbn13,asin
gr_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
31346587,3069,50116650.0,RayJayPerr,,,50116650.0,,,RAYJAYPERR
13115749,3914,18289742.0,BNID:29400,,,18289742.0,,,BNID29400
25798264,9049,44673889.0,UnderACorn,,,44673889.0,,,UNDERACORN
11853906,26169,16810092.0,01988/5812,,,16810092.0,,,019885812
3140368,26722,3171815.0,LCCBT1375.,,,3171815.0,,,LCCBT1375
...,...,...,...,...,...,...,...,...,...
22838579,2317329,3652165.0,Simon&amp;Schu,,,3652165.0,,,SIMONAMPSCHU
29350161,2317396,49589581.0,ASIN:B0192,,,49589581.0,,,ASINB0192
32316418,2318956,45666384.0,/B01D27EX3,,,45666384.0,,,B01D27EX3
26152890,2348518,46109423.0,BO14B4YT9i,,,46109423.0,,,BO14B4YT9I


Conclusion: we don't have that many books with different ISBNs.

## Look at Book Clustering

Let's scan the book clustering table in the DB:

In [28]:
# Database cluster assignment for each GoodReads book, ordered by book ID.
db_clusters = (
    pd.read_sql_table('book_cluster', db_url(), schema='gr')
    .set_index('gr_book_id')
    .sort_index()
)
db_clusters

Unnamed: 0_level_0,cluster
gr_book_id,Unnamed: 1_level_1
1,167476
2,2044243
3,108089
4,108089
5,108089
...,...
36519806,6039792
36525806,5733872
36525824,5864550
36526448,5634754


And load from Parquet:

In [27]:
# Parquet book-to-cluster links, indexed and ordered by book ID.
pq_links = (
    pd.read_parquet('goodreads/gr-book-link.parquet')
    .set_index('book_id')
    .sort_index()
)
pq_links

Unnamed: 0_level_0,work_id,cluster
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,41335427.0,100594612
2,2809203.0,103432670
3,4640799.0,100620223
4,6231171.0,100620223
5,2402163.0,100620223
...,...,...
36525824,57882612.0,457882612
36526448,58250451.0,458250451
36529314,58253323.0,458253323
36529772,58253871.0,458253871


In [29]:
# ccode = cluster // 10^8, i.e. the leading digit of a 9-digit cluster number.
# NOTE(review): codes 4 and 5 look like GoodReads-derived clusters (e.g. cluster
# 457882612 pairs with work_id 57882612) — confirm against the clustering code.
pq_links['ccode'] = pq_links['cluster'].floordiv(100_000_000)
# Show only the books whose cluster code is something other than 4 or 5.
pq_links.loc[~pq_links['ccode'].isin([4, 5])]

Unnamed: 0_level_0,work_id,cluster,ccode
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41335427.0,100594612,1
2,2809203.0,103432670,1
3,4640799.0,100620223,1
4,6231171.0,100620223,1
5,2402163.0,100620223,1
...,...,...,...
36498328,827903.0,103602696,1
36498955,45629111.0,108702505,1
36499229,1882486.0,100667266,1
36508486,1085189.0,108661658,1


We have fewer books resolving to a non-GR cluster, but not *that* many.

## Load Genders

Let's load the gender files.

In [47]:
# First-author gender per cluster from the database, as a categorical Series.
db_genders = pd.read_sql_table('cluster_first_author_gender', db_url())
db_genders = db_genders.set_index('cluster')['gender'].astype('category')
# Rename the database's labels and add 'no-book' so this Series ends up with
# the same set of categories as the Parquet gender data.
# The .cat methods return a new Series; their inplace= option was deprecated
# in pandas 1.3 and removed in 2.0, so the result is reassigned instead.
db_genders = (
    db_genders
    .cat.rename_categories({'no-loc-author': 'no-book-author', 'no-viaf-author': 'no-author-rec'})
    .cat.add_categories('no-book')
)
db_genders.head()

cluster
10935495           female
4464609            female
10801730           female
756002             female
9076729     no-author-rec
Name: gender, dtype: category
Categories (7, object): ['ambiguous', 'female', 'male', 'no-book-author', 'no-author-rec', 'unknown', 'no-book']

In [48]:
# First-author gender per cluster from the Parquet file, as a categorical Series.
pq_genders = (
    pd.read_parquet('book-links/cluster-genders.parquet')
    .set_index('cluster')['gender']
    .astype('category')
)
pq_genders.head()

cluster
100109686         ambiguous
100027659         ambiguous
100025369         ambiguous
401752021    no-book-author
900000005           no-book
Name: gender, dtype: category
Categories (7, object): ['ambiguous', 'female', 'male', 'no-author-rec', 'no-book', 'no-book-author', 'unknown']

Now get genders:

In [49]:
# Look up each book's gender through its database cluster. The left join keeps
# every book, leaving gender missing where the cluster has no gender row.
db_gr_genders = db_clusters.join(db_genders, on='cluster', how='left')
db_gr_genders

Unnamed: 0_level_0,cluster,gender
gr_book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,167476,ambiguous
2,2044243,female
3,108089,ambiguous
4,108089,ambiguous
5,108089,ambiguous
...,...,...
36519806,6039792,no-book-author
36525806,5733872,no-book-author
36525824,5864550,no-book-author
36526448,5634754,no-book-author


In [50]:
# Look up each book's gender through its Parquet cluster. The left join keeps
# every book, leaving gender missing where the cluster has no gender row.
pq_gr_genders = pq_links.join(pq_genders, on='cluster', how='left')
pq_gr_genders

Unnamed: 0_level_0,work_id,cluster,ccode,gender
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,41335427.0,100594612,1,ambiguous
2,2809203.0,103432670,1,female
3,4640799.0,100620223,1,ambiguous
4,6231171.0,100620223,1,ambiguous
5,2402163.0,100620223,1,ambiguous
...,...,...,...,...
36525824,57882612.0,457882612,4,no-book-author
36526448,58250451.0,458250451,4,no-book-author
36529314,58253323.0,458253323,4,no-book-author
36529772,58253871.0,458253871,4,no-book-author


In [51]:
# Line up the two pipelines per book. DataFrame.join is a left join on the
# book-ID index, so books absent from the database keep NaN in the *_db
# columns. Columns present on both sides (cluster, gender) get _pq/_db suffixes.
genders = pq_gr_genders.join(db_gr_genders, lsuffix='_pq', rsuffix='_db')
genders

Unnamed: 0_level_0,work_id,cluster_pq,ccode,gender_pq,cluster_db,gender_db
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,41335427.0,100594612,1,ambiguous,167476.0,ambiguous
2,2809203.0,103432670,1,female,2044243.0,female
3,4640799.0,100620223,1,ambiguous,108089.0,ambiguous
4,6231171.0,100620223,1,ambiguous,108089.0,ambiguous
5,2402163.0,100620223,1,ambiguous,108089.0,ambiguous
...,...,...,...,...,...,...
36525824,57882612.0,457882612,4,no-book-author,5864550.0,no-book-author
36526448,58250451.0,458250451,4,no-book-author,5634754.0,no-book-author
36529314,58253323.0,458253323,4,no-book-author,,
36529772,58253871.0,458253871,4,no-book-author,5888071.0,no-book-author


In [52]:
# Books that have a database gender which differs from the Parquet gender.
genders.query('gender_db.notnull() & (gender_pq != gender_db)')

Unnamed: 0_level_0,work_id,cluster_pq,ccode,gender_pq,cluster_db,gender_db
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
323,4641.0,109192851,1,ambiguous,3921144.0,female
374,4676.0,108011562,1,male,5572225.0,unknown
667,287946.0,100109686,1,ambiguous,2465344.0,female
685,1433491.0,101086930,1,ambiguous,1039465.0,male
912,3366043.0,100543430,1,ambiguous,4394172.0,male
...,...,...,...,...,...,...
36426210,27792495.0,100109686,1,ambiguous,6175929.0,no-book-author
36430330,27382963.0,100109686,1,ambiguous,1540752.0,female
36432318,41420870.0,100209221,1,ambiguous,6114528.0,no-book-author
36478574,28036282.0,101474645,1,female,6214199.0,no-book-author
