# Data preprocessing

In this notebook, I am going to take the raw book data and prepare it for the modeling stage. 

In [1]:
# Import the necessary libraries 
import numpy as np 
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the data 
books = pd.read_csv('../data/raw/books.csv')
ratings = pd.read_csv('../data/raw/ratings.csv')
tags = pd.read_csv('../data/raw/tags.csv')
book_tags = pd.read_csv('../data/raw/book_tags.csv')
to_read = pd.read_csv('../data/raw/to_read.csv')

In [3]:
books.replace(to_replace='J.K. Rowling, Mary GrandPré', value = 'J.K. Rowling', inplace=True)

In [4]:
to_read_counts = pd.DataFrame(to_read['book_id'].value_counts())
to_read_counts = to_read_counts.rename(columns={'book_id':'to_read_count'})
to_read_counts.index.name ="book_id"
to_read_counts.reset_index(inplace=True)
to_read_counts.head()

Unnamed: 0,book_id,to_read_count
0,47,2772
1,143,1967
2,113,1840
3,13,1812
4,11,1767


In [5]:
books=books.merge(to_read_counts, on='book_id')
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,to_read_count
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,973
1,2,3,3,4640799,491,439554934,9780440000000.0,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,400
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,287
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,1478
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,1293


In [6]:
books=books.drop(['image_url','small_image_url'], axis=1)

In [7]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [8]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [9]:
tag_merged = book_tags.merge(tags, on='tag_id')
tag_merged.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [11]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,to_read_count
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,973
1,2,3,3,4640799,491,439554934,9780440000000.0,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,400
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,287
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,1478
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,1293


### Genres
I am going to hardcode some basic genres since the tags list is so varied and unclean. I will base it off of the most popular tags.  

In [13]:
# Top 50 tags
top_tags = sorted(tag_merged.sort_values('count',ascending=False).tag_name.unique()[:50])

In [14]:
# Hard code some popular genres
genres=["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics", "Comics", \
        "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction", "Gay and Lesbian", \
        "Graphic Novels", "Historical Fiction", "History", "Horror", "Humor and Comedy", "Manga", \
        "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal", "Philosophy", "Poetry", "Psychology", \
        "Religion", "Romance", "Science", "Science Fiction", "Self Help", "Suspense", "Spirituality", \
        "Sports", "Thriller", "Travel", "Young Adult"]
for i in range(len(genres)):
    genres[i]=genres[i].lower()

for genre in top_tags: 
    if genre not in genres: 
        genres.append(genre)

In [15]:
len(genres)

72

In [16]:
new_tags=tag_merged[tag_merged.tag_name.isin(genres)]

In [17]:
new_tags.sort_values('count', ascending=False)

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
84,865,30574,596234,to-read
6140,2429135,30574,586235,to-read
9108,18143977,30574,505884,to-read
2,3,30574,496107,to-read
1671,24280,30574,488469,to-read
...,...,...,...,...
30195,2693801,8717,1,currently-reading
479243,3061,14552,1,history
526263,452157,32865,1,writing
724531,7011879,6750,1,chick-lit


This has helped us keep tags to a minimum, helping computation, while still adding a lot of value! 

Let's add them to the books dataframe. 

In [18]:
print(books.shape)

(9986, 22)


In [19]:
# make a new row per item per tag
books=books.join(
    pd.concat(
        [new_tags.set_index('goodreads_book_id')['tag_name']],
        axis=1, keys='tags' #list(range(len(new_tags)))
    ), on='goodreads_book_id')

In [20]:
# # Aggregate the tags into a single column
books=books.groupby('book_id').agg({'goodreads_book_id':'first', 'best_book_id':'first', 'work_id':'first',
       'books_count':'first', 'isbn':'first', 'isbn13':'first', 'authors':'first', 'original_publication_year':'first',
       'original_title':'first', 'title':'first', 'language_code':'first', 'average_rating':'first',
       'ratings_count':'first', 'work_ratings_count':'first', 'work_text_reviews_count':'first',
       'ratings_1':'first', 'ratings_2':'first', 'ratings_3':'first', 'ratings_4':'first', 'ratings_5':'first',
       'to_read_count':'first','t': ', '.join}).reset_index()

In [21]:
books.rename(columns={'t':'tags'}, inplace=True)

In [22]:
print(books.shape)
books.head()

(9986, 23)


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,to_read_count,tags
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,973,"to-read, fantasy, favorites, currently-reading..."
1,2,3,3,4640799,491,439554934,9780440000000.0,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,400,"to-read, fantasy, favorites, currently-reading..."
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,287,"to-read, fantasy, favorites, currently-reading..."
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,1478,"to-read, favorites, currently-reading, young-a..."
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,1293,"to-read, favorites, currently-reading, young-a..."


Now that we have added the tags and info about the to-read shelf, I am ready to move to the modeling phase. 

In [39]:
# Save the data
datapath = '../data/processed/'
books.to_csv("../data/processed/output.csv", index=False)
