In [43]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [44]:
# Load the raw GoodReads CSV; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="data/GoodReads_100k_books.csv")

### _Understand the schema_
This involves looking at the number of columns and what columns we have. We then determine what columns we'll actually need right now, what can be kept for later, and which ones are not going to be needed at all.

In [53]:
# Number of rows in the dataset
df.shape[0]
# rows = 100000

100000

In [54]:
# Preview the first five rows to get a feel for each column's content
df.head(n=5)

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [75]:
df.columns
# Column triage (column names as listed in the output of this cell):
# mandatory - title, author, desc, genre
# deduplication - isbn, book_id (might have to create this or get this from link column)
# optional metadata - pages, link
# not needed (for now) - bookformat, img, isbn13, rating, reviews, totalratings

Index(['author', 'bookformat', 'desc', 'genre', 'img', 'isbn', 'isbn13',
       'link', 'pages', 'rating', 'reviews', 'title', 'totalratings'],
      dtype='object')

### _Check for missing or incomplete values_
This involves looking at individual columns and checking whether they contain missing values, placeholder values such as "Unknown", or blanks. We focus first on the mandatory columns and then move on to the optional ones.

In [77]:
# Which rows have no title at all (NaN)?

df.loc[df["title"].isna()]
# count 1

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
54953,Jacqui Malpass,,,"Diary,Journaling",,,,https://goodreads.com/book/show/13552877-n-a,0,3.33,3,,9


In [None]:
# How many books have a missing, blank, or "Unknown" author?

# Only the last bare expression of a cell is rendered, so the counts are
# collected into one Series and shown explicitly with display().
author_issue_counts = pd.Series(
    {
        "missing (NaN)": df["author"].isna().sum(),
        "blank": df["author"].str.strip().eq("").sum(),
        "Unknown": df["author"].eq("Unknown").sum(),  # count 15
    },
    name="n_books",
)
display(author_issue_counts)

# Very short author strings (<= 7 characters) are listed for manual inspection.
# NaN authors compare as False in this filter, which is why they are counted
# separately above.
df[df["author"].str.len() <= 7]
# TODO more anomalies might be present but keeping them for later

In [72]:
# Percentage of books whose description is missing (NaN)

len(df.loc[df["desc"].isna()]) / len(df) * 100
# count 6772 i.e. 6.77%

6.772

In [70]:
# Percentage of books whose genre is missing (NaN)

len(df.loc[df["genre"].isna()]) / len(df) * 100
# count 10467 i.e. 10.47%

10.467

In [None]:
# Which books report a page count of 0?

df.loc[df["pages"].eq(0)]
# count 7752

In [None]:
# Percentage of books without an ISBN code (NaN)

len(df.loc[df["isbn"].isna()]) / len(df) * 100
# count 14482 i.e. 14.48%

In [None]:
# Which books have no link (NaN, the placeholder "Unknown", or an empty string)?

df.loc[df["link"].isna() | df["link"].isin(["Unknown", ""])]
# Result: none - no row matches

### _Look for anomalies or inconsistencies_
These will be little things that can mess up the embeddings. Descriptions might be the most important field so we will heavily focus on that.

_Description field_ - we will need to check whether there are any blank descriptions, whether descriptions contain HTML tags, regular expressions, or escape characters, and whether any descriptions are too short. Descriptions that are too short may be of no use when generating embeddings.

_Genre field_ - we will need to check how chaotic the genres are; while they are currently comma-separated in one column, we'll need to see whether this holds consistently across all rows or whether we have any anomalies here. These could cause problems when creating embeddings or trying to detect a genre.

_Duplicates_ - we will need to check whether there are exact duplicate rows or combinational duplicates: the same title + author could repeat with two different ISBN codes; two rows could each hold part of one record (one containing title, author, ISBN and the other containing title, description, ISBN), in which case we may need to combine them; or descriptions could be similar while the other fields differ. Duplicates matter for indexing and recommendation quality: they could create bias towards the duplicated entity, and we need to avoid that.

In [None]:
# Genre

In [None]:
# Duplicates

In [None]:
# Do descriptions have HTML tags?

# Are there escaped characters?