In [1]:
#Suppress warnings thrown by different packages
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import datetime as dt

# Set Pandas Options to Display all rows & columns when displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set float places & displayed decimal digits
pd.options.display.float_format = '{:13,.2f}'.format

import matplotlib.pyplot as plt
import seaborn as sns
import plotly as pty
import re

In [2]:
# Read the Dataset
df = pd.read_csv(r"C:\Users\Ashis\Desktop\ML Project\Project 1\books.csv")
print('Dataframe Shape :', df.shape,'\n')

# No null values. But there's clearly a problem with 1 of the column names.
print(df.isna().sum())

# Delete the whitespace in column names
df.columns = df.columns.str.strip()
print('Dataframe columns :', df.columns,'\n')

Dataframe Shape : (11127, 12) 

bookID                0
title                 0
authors               0
average_rating        0
isbn                  0
isbn13                0
language_code         0
  num_pages           0
ratings_count         0
text_reviews_count    0
publication_date      0
publisher             0
dtype: int64
Dataframe columns : Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', 'num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object') 



In [3]:
# Visualizing Correlations, Univariate & Bivariate analysis using pandas-profiling

from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
profile.to_widgets()
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
profile.to_file(r"C:\Users\Ashis\Desktop\ML Project\Project 1\report.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# checking for duplicates. No duplicates found
df[df.duplicated(keep=False)]

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher


In [6]:
# Replace unwanted characters & whitespaces in author names
df.authors = [re.sub(r"\s+", " ", s) for s in df.authors]
df.title  = [re.sub(r"\s+", " ", s) for s in df.title]
df.publisher  = [re.sub(r"\s+", " ", s) for s in df.publisher]
df.authors = df.authors.str.replace('-', ' ')
df.publisher = df.publisher.str.replace('!', ' ')
df.title = df.title.str.replace('-', ' ')
df.title = df.title.str.replace('!', ' ')
df.title = df.title.str.replace('?', ' ')

# Display Dataframe rows
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,05-01-1977,Brown Son & Ferguson Ltd.
1,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9781590000000.0,eng,400,26,4,04-06-2004,Cold Spring Press
2,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/2004,Harvard University Press
3,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/1999,Huntington House Publishers
4,1,Harry Potter and the Half Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780440000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.


In [7]:
# Replace hypen in some dates to forward slash to make the representation uniform
df.publication_date = df.publication_date.str.replace('-', '/')

# Split & reconstitute each date to make the numeral representation uniform
df.publication_date =  [str(int(x.split('/')[0])) + '/' +  str(int(x.split('/')[1])) + '/' +  str(int(x.split('/')[2])) for x in df.publication_date]

# Conversion to date throws an error. So using Error handling to display the index of the errors & combine the indices in a list
err = []

for i, d in enumerate(df.publication_date):
    try:
        pd.to_datetime(d)
    except: 
        err.append(i)

# Print the rows with errors. Both dates show 31 days in June & November which is not possible. 
print('Date Error Rows :', '\n', df.loc[err,'publication_date'], '\n')

# Manually inputting dates in these rows via Goodreads Website
df.loc[8181, 'publication_date'] = '10/31/2000'
df.loc[11098, 'publication_date'] = '6/30/1982'

# Datetime conversion now runs error free
df.publication_date = pd.to_datetime(df.publication_date)

# Display all column datatypes
df.dtypes

Date Error Rows : 
 8181     11/31/2000
11098     6/31/1982
Name: publication_date, dtype: object 



bookID                         int64
title                         object
authors                       object
average_rating               float64
isbn                          object
isbn13                       float64
language_code                 object
num_pages                      int64
ratings_count                  int64
text_reviews_count             int64
publication_date      datetime64[ns]
publisher                     object
dtype: object

In [8]:
# Remove the parenthesized parts from authors & title columns; also create a new column 'raw title' to preserve old info
df.authors = df.authors.str.replace(r"\s*\(.*\)\s*", "", regex=True).str.lower()
df['raw_title'] = df.title.str.replace(r"\s*\(.*\)\s*", "", regex=True).str.lower()

# Removing accents from alphabets (for example, 'á' replaced with 'a')
df.title = df.title.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

# Removing unwanted whitespaces from columns
df.authors = df.authors.str.strip()
df.publisher = df.publisher.str.strip()
df.title = df.title.str.strip()
df.raw_title = df.raw_title.str.strip()

In [9]:
# Removing Noise from publisher column & storing it in another column
df['raw_pub'] = df.publisher.str.replace(r"\s*\(.*\)\s*", "", regex=True).str.lower()
df.raw_pub = df.raw_pub.str.strip()
df.raw_pub = df.raw_pub.str.replace('limited', ' limited ').str.replace('limited', 'ltd.').str.replace(' ltd', ' ltd.').str.replace(' ltd ', ' ltd. ')
df.raw_pub = df.raw_pub.str.replace('&', ' & ').str.replace(' and ', ' & ')
df.raw_pub = df.raw_pub.str.replace('co.', ' co. ')
df.raw_pub = df.raw_pub.str.replace('company', ' company ').str.replace('co.', 'company')
df.raw_pub = df.raw_pub.str.replace('incorporated', ' incorporated ').str.replace('incorporated','inc.')
df.raw_pub = df.raw_pub.str.replace(' inc.', ' inc. ').str.replace(' inc ', ' inc. ').str.replace(' inc', ' inc. ')
df.raw_pub = df.raw_pub.str.replace("'", '').str.replace("-", ' ')
df.raw_pub = df.raw_pub.str.replace('books', ' books ').str.replace('books','book').str.replace('book','books')
df.raw_pub = df.raw_pub.str.replace('classics', ' classics ').str.replace(' classic ',' classics ')
df.raw_pub = df.raw_pub.str.replace('publications', ' publications ').str.replace('publication',' publication ').str.replace('publication','publications')
df.raw_pub = df.raw_pub.str.replace('publishers', ' publishers ').str.replace('publisher',' publisher ').str.replace('publisher','publishers')
df.raw_pub = df.raw_pub.str.replace('paperbacks',' paperbacks ').str.replace('paperback',' paperback ').str.replace('paperback',' paperbacks')
df.raw_pub = df.raw_pub.str.replace(' press',' press ').str.replace(' publ.',' publ. ').str.replace('publ.',' publications').str.replace('pubn.',' publications')
df.raw_pub = df.raw_pub.str.replace('u.s.a', 'usa').str.replace('u.k', 'uk')
df.raw_pub = df.raw_pub.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
df.raw_pub = df.raw_pub.str.replace(' +', ' ')
df.raw_pub = df.raw_pub.str.strip()

In [10]:
# dropping non-informative & erroneous column 'isbn13'
df.drop('isbn13', axis=1, inplace=True)

# Seperate books with multiple authors into a list of the different author names 
df['authors_comb'] = df.authors.str.split('/')

# Creating seperate columns for multiple authors
authors = df.authors.str.split('/', expand=True)
cols = ['author_'+ str(x+1) for x in authors.columns]
authors.columns = cols 

#Concatenating with the parent dataframe
df = pd.concat([df,authors], axis=1)
df.replace({None: np.nan}, inplace=True)

# Dropping Authors without ratings which won't be useful for the model
no_ratings = df[df.average_rating==0].index
df = df.drop(no_ratings,axis=0)

df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,raw_title,raw_pub,authors_comb,author_1,author_2,author_3,author_4,author_5,author_6,author_7,author_8,author_9,author_10,author_11,author_12,author_13,author_14,author_15,author_16,author_17,author_18,author_19,author_20,author_21,author_22,author_23,author_24,author_25,author_26,author_27,author_28,author_29,author_30,author_31,author_32,author_33,author_34,author_35,author_36,author_37,author_38,author_39,author_40,author_41,author_42,author_43,author_44,author_45,author_46,author_47,author_48,author_49,author_50,author_51
1,16914,The Tolkien Fan's Medieval Reader,david e. smith,3.58,1593600119,eng,400,26,4,2004-04-06,Cold Spring Press,the tolkien fan's medieval reader,company d spring press,[david e. smith],david e. smith,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,12224,Streetcar Suburbs: The Process of Growth in Bo...,sam bass warner jr./sam b. warner,3.58,674842111,en-US,236,61,6,2004-04-20,Harvard University Press,streetcar suburbs: the process of growth in bo...,harvard university press,"[sam bass warner jr., sam b. warner]",sam bass warner jr.,sam b. warner,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,22128,Patriots (The Coming Collapse),james wesley rawles,3.63,156384155X,eng,342,38,4,1999-01-15,Huntington House Publishers,patriots,huntington house publications shers s,[james wesley rawles],james wesley rawles,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1,Harry Potter and the Half Blood Prince (Harry ...,j.k. rowling/mary grandpré,4.57,439785960,eng,652,2095690,27591,2006-09-16,Scholastic Inc.,harry potter and the half blood prince,scholastic inc. .,"[j.k. rowling, mary grandpré]",j.k. rowling,mary grandpré,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,2,Harry Potter and the Order of the Phoenix (Har...,j.k. rowling/mary grandpré,4.49,439358078,eng,870,2153167,29221,2004-09-01,Scholastic Inc.,harry potter and the order of the phoenix,scholastic inc. .,"[j.k. rowling, mary grandpré]",j.k. rowling,mary grandpré,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# Saving the cleaned file to the project directory
df.to_csv(r"C:\Users\Ashis\Desktop\ML Project\Project 1\books_cleaned.csv")