# Data Wrangling and Cleaning for Goodreads Recommendation System

### Import necessary modules and libraries:

In [1]:
import pandas as pd
import numpy as np

### 1. Diving into the data:

In [2]:
# Read the raw Goodreads CSV into a pandas dataframe.
# A handful of rows contain an extra comma and therefore parse to 11 fields instead of 10;
# on_bad_lines='warn' skips those rows and reports each one, which is what the removed
# error_bad_lines=False flag (with its default warning) used to do. Requires pandas >= 1.3.
unclean_data = pd.read_csv('Raw_data/goodreadsbooks/books.csv', on_bad_lines='warn')

b'Skipping line 4012: expected 10 fields, saw 11\nSkipping line 5688: expected 10 fields, saw 11\nSkipping line 7056: expected 10 fields, saw 11\nSkipping line 10601: expected 10 fields, saw 11\nSkipping line 10668: expected 10 fields, saw 11\n'


In [3]:
# Summarize the raw dataframe: row count, column names, non-null counts, dtypes and memory usage
unclean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13714 entries, 0 to 13713
Data columns (total 10 columns):
bookID                13714 non-null int64
title                 13714 non-null object
authors               13714 non-null object
average_rating        13714 non-null float64
isbn                  13714 non-null object
isbn13                13714 non-null int64
language_code         13714 non-null object
# num_pages           13714 non-null int64
ratings_count         13714 non-null int64
text_reviews_count    13714 non-null int64
dtypes: float64(1), int64(5), object(4)
memory usage: 1.0+ MB


In [4]:
# Preview the first five rows of the raw dataframe to see what each column holds
unclean_data.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,4.56,0439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,4.49,0439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,4.47,0439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.41,0439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,4.55,043965548X,9780439655484,eng,435,2149872,33964


In [5]:
# Locate all instances of J.K. Rowling to observe how the author is listed in the authors column.
# regex=False makes this a literal substring match; by default str.contains treats the pattern
# as a regular expression, in which each '.' would match any character.
unclean_data.loc[unclean_data['authors'].str.contains('J.K. Rowling', regex=False)]

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling-Mary GrandPré,4.56,0439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling-Mary GrandPré,4.49,0439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling-Mary GrandPré,4.47,0439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.41,0439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling-Mary GrandPré,4.55,043965548X,9780439655484,eng,435,2149872,33964
5,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling-Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,38872,154
7,10,Harry Potter Collection (Harry Potter #1-6),J.K. Rowling,4.73,0439827604,9780439827607,eng,3342,27410,820
692,2000,Conversations with J.K. Rowling,Lindsey Fraser-J.K. Rowling,3.99,0439314550,9780439314558,eng,96,4184,85
693,2002,Harry Potter Schoolbooks Box Set: Two Classic ...,J.K. Rowling,4.4,043932162X,9780439321624,eng,240,11459,143
695,2005,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.56,0747584664,9780747584667,eng,768,1173,72


In [6]:
# Create the clean dataframe that will accumulate, and eventually save, all cleaned data.
# Start from the 'bookID' through 'title' columns and take an explicit copy so that columns
# added to clean_data later do not write into a slice of unclean_data
# (which would raise SettingWithCopyWarning).
clean_data = unclean_data.loc[:, 'bookID':'title'].copy()
clean_data.head()

Unnamed: 0,bookID,title
0,1,Harry Potter and the Half-Blood Prince (Harry ...
1,2,Harry Potter and the Order of the Phoenix (Har...
2,3,Harry Potter and the Sorcerer's Stone (Harry P...
3,4,Harry Potter and the Chamber of Secrets (Harry...
4,5,Harry Potter and the Prisoner of Azkaban (Harr...


### 2. Multiple Authors for Certain Books in the Author Column
There are multiple authors for certain books, such as Good Omens by Terry Pratchett and Neil Gaiman, as well as some illustrators that are included in the author column. Looking at the first few rows for Harry Potter, we see Mary GrandPré, who was an illustrator for the book series, is included here. Looking at further rows, there are also translators that are included in this column alongside the original authors of the books. In order to present the data more accurately without removing actual second authors, we will split the `authors` column into two separate columns. The first author will be noted as the primary author (as is the case with many of these books with multiple authors), and the second author as the secondary.

In [7]:
# Split the raw authors column at the first '-' only, so the primary author gets their own
# column and everything after that first '-' lands in secondary_a (None when there is no '-').
# n is passed by keyword because newer pandas no longer accepts it positionally.
# Caveat: a primary author whose own name contains a hyphen is also cut at that hyphen.
clean_data[['primary_a', 'secondary_a']] = unclean_data['authors'].str.split('-', n=1, expand=True)
clean_data.head()

Unnamed: 0,bookID,title,primary_a,secondary_a
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,Mary GrandPré
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,Mary GrandPré
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,Mary GrandPré
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,Mary GrandPré


### 3. Join Rest of Data to Clean Dataframe

In [8]:
# Append the remaining raw columns ('average_rating' through 'text_reviews_count')
# to the clean dataframe; join aligns the two frames on their shared row index.
clean_data = clean_data.join(
    unclean_data.loc[:, 'average_rating':'text_reviews_count']
)

In [9]:
# Preview the combined dataframe to confirm the author and rating columns are all present
clean_data.head()

Unnamed: 0,bookID,title,primary_a,secondary_a,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,Mary GrandPré,4.56,0439785960,9780439785969,eng,652,1944099,26249
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,Mary GrandPré,4.49,0439358078,9780439358071,eng,870,1996446,27613
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,J.K. Rowling,Mary GrandPré,4.47,0439554934,9780439554930,eng,320,5629932,70390
3,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,,4.41,0439554896,9780439554893,eng,352,6267,272
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,Mary GrandPré,4.55,043965548X,9780439655484,eng,435,2149872,33964


In [10]:
# Check the final column set, dtypes and non-null counts of the clean dataframe
# (secondary_a is null wherever the authors value contained no '-')
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13714 entries, 0 to 13713
Data columns (total 11 columns):
bookID                13714 non-null int64
title                 13714 non-null object
primary_a             13714 non-null object
secondary_a           5494 non-null object
average_rating        13714 non-null float64
isbn                  13714 non-null object
isbn13                13714 non-null int64
language_code         13714 non-null object
# num_pages           13714 non-null int64
ratings_count         13714 non-null int64
text_reviews_count    13714 non-null int64
dtypes: float64(1), int64(5), object(5)
memory usage: 1.2+ MB


### 4. Save Clean DataFrame to Clean Folder

In [11]:
# Write the cleaned dataframe to the Clean_data folder (the row index is saved as the first column).
# A forward slash is used because '\c' is an invalid escape sequence and a backslash is only a
# path separator on Windows; '/' works on every platform and matches the read path above.
clean_data.to_csv('Clean_data/clean_books.csv')