## Web scraping notebook

In this notebook we scrape goodreads website, using bookID to find the number of written books by the author and the number followers of the author


### Install and import dependencies

! pip install requests bs4 pandas numpy

In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Part 1: Scrape Data

#### Read dataset

In [204]:
version = 0
filename = '../../data/raw/scraped/books_written_followers_latest.csv'
#filename = f"../../data/raw/scraped/books_written_followers_version_{version}.csv"
df = pd.read_csv(filename, sep=',',  on_bad_lines='warn')

### Verify how many missing values on the column to update and get the indexes


In [209]:
# Initialize the books_written_and_followers column
if "books_written_and_followers" not in df.columns:
    df['books_written_and_followers']=np.nan

# replace empty strings if exists by NaN
df['books_written_and_followers'] = df['books_written_and_followers'].replace('',np.nan)

missing = df[df["books_written_and_followers"].isnull()].index
print(missing.shape)
version = missing.shape[0]

(5,)


#### Iterate over indexes of rows where missing values

In [210]:
for i in missing:
    # Get the bookID
    ID = df['bookID'][i]
    url = 'https://www.goodreads.com/book/show/' + str(ID)

    # Request the url
    r = requests.get(url)

    # Find the text in returned html
    soup = BeautifulSoup(r.content, 'html.parser')
    s = soup.find('div', class_='FeaturedPerson__infoPrimary')
    try:
        t = s.find('span', {'class': 'Text Text__body3 Text__subdued'})
    except AttributeError:
        t = None
    books_written = None
    if t is not None:
        books_written = t.text
    df.loc[i, 'books_written_and_followers'] = books_written

    print("-- ", i)
    print(books_written)

#df.to_csv(f"../../data/raw/scraped/books_written_followers_version_{version}.csv")

--  1271
None
--  5555

--  6393
None
--  6900
None
--  10355



#### Check how many missing values

In [211]:
# replace empty strings if exists by NaN
df['books_written_and_followers'] = df['books_written_and_followers'].replace('',np.nan)
df.isnull().sum()

Unnamed: 0.1                   0
Unnamed: 0                     0
bookID                         0
title                          0
authors                        0
average_rating                 0
isbn                           0
isbn13                         0
language_code                  0
  num_pages                    0
ratings_count                  0
text_reviews_count             0
publication_date               0
publisher                      0
books_written_and_followers    5
dtype: int64

#### Save to csv

In [None]:
df.to_csv(f"../../data/raw/scraped/books_written_followers_version_{version}.csv")
# when finished save into latest
#df.to_csv(f"../../data/raw/scraped/books_written_followers_latest.csv")

#### Run the previous steps until you get all the needed data

At the end, there still 5 rows that couldn't be retrieved by the code above, we consider them as missing values to impute on the preprocessing pipeline.

### Part 2:

#### Transform scraped data

Let's separate the two features we scraped from the web; written_books and followers

In [223]:
filename = "../../data/raw/scraped/books_written_followers_latest.csv"
df_final = pd.read_csv(filename, sep=',',  on_bad_lines='warn')
df_final.shape

(11123, 15)

In [224]:
df_final.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,books_written_and_followers
0,0,0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,609 books222k followers
1,1,1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,627 books222k followers
2,2,2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,609 books222k followers
3,3,3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,627 books222k followers
4,4,4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,627 books222k followers


#### Drop useless columns

In [225]:
df_final.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

#### Check for missing values or empty strings

In [226]:
df['books_written_and_followers'] = df['books_written_and_followers'].replace('',np.nan)
df.isnull().sum()

Unnamed: 0.1                   0
Unnamed: 0                     0
bookID                         0
title                          0
authors                        0
average_rating                 0
isbn                           0
isbn13                         0
language_code                  0
  num_pages                    0
ratings_count                  0
text_reviews_count             0
publication_date               0
publisher                      0
books_written_and_followers    5
dtype: int64

#### Reformat the values of books_written_and_followers

In [227]:
# Function to reformat the values
def reformat_value(value):
    value = str(value)
    value = value.replace('books', '/').replace('book', '/').replace('followers', '').replace('follower', '')
    #value = value.replace(',', '').replace('.', '').replace('k', '000')
    return value

# Apply the function to the DataFrame column
df['books_written_and_followers'].apply(reformat_value)


0           609 /222k 
1           627 /222k 
2           609 /222k 
3           627 /222k 
4           627 /222k 
             ...      
11118       85 /1,093 
11119       85 /1,093 
11120       85 /1,093 
11121       85 /1,093 
11122    10.1k /17.1k 
Name: books_written_and_followers, Length: 11123, dtype: object

In [228]:
df_final['books_written_and_followers'] = df_final['books_written_and_followers'].replace("\xa0",'',regex=True).replace(['books', 'book'], '/', regex=True).replace(['followers', 'follower'], '', regex=True).replace('k', '00',regex=True).str.replace(',', '', regex=True).str.replace('.', '')
df_final['books_written_and_followers']

0          609/22200
1          627/22200
2          609/22200
3          627/22200
4          627/22200
            ...     
11118        85/1093
11119        85/1093
11120        85/1093
11121        85/1093
11122    10100/17100
Name: books_written_and_followers, Length: 11123, dtype: object

#### Split into 2 columns

In [229]:
df_final[['written_books', 'followers']] = df_final['books_written_and_followers'].str.split('/', n=1, expand=True)

In [230]:
df_final.isnull().sum()

bookID                         0
title                          0
authors                        0
average_rating                 0
isbn                           0
isbn13                         0
language_code                  0
  num_pages                    0
ratings_count                  0
text_reviews_count             0
publication_date               0
publisher                      0
books_written_and_followers    5
written_books                  5
followers                      5
dtype: int64

In [231]:
# Check empty followers
print(df_final[df_final['followers'] == ''].shape[0], ' Empty strings in followers column')

# Replace empty followers with 0
df_final.loc[df_final['followers'] == '', 'followers'] = '0'

print(df_final[df_final['followers'] == ''].shape[0], ' Empty strings in followers column')

324  Empty strings in followers column
0  Empty strings in followers column


In [232]:
# Check empty written_books
print(df_final[df_final['written_books'] == ''].shape)

(0, 15)


#### Convert the columns type from object to int

In [235]:
df_final.dtypes

bookID                           int64
title                           object
authors                         object
average_rating                 float64
isbn                            object
isbn13                           int64
language_code                   object
  num_pages                      int64
ratings_count                    int64
text_reviews_count               int64
publication_date                object
publisher                       object
books_written_and_followers     object
written_books                   object
followers                       object
dtype: object

In [237]:
df_final['written_books'] = pd.to_numeric(df_final['written_books'], errors='coerce')
df_final['followers'] = pd.to_numeric(df_final['followers'], errors='coerce')
df_final.dtypes

bookID                           int64
title                           object
authors                         object
average_rating                 float64
isbn                            object
isbn13                           int64
language_code                   object
  num_pages                      int64
ratings_count                    int64
text_reviews_count               int64
publication_date                object
publisher                       object
books_written_and_followers     object
written_books                  float64
followers                      float64
dtype: object

#### Drop books_written_and_followers

In [240]:
df_final.drop(columns=['books_written_and_followers'], inplace=True)

In [241]:
df_final.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,written_books,followers
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,609.0,22200.0
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,627.0,22200.0
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,609.0,22200.0
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,627.0,22200.0
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,627.0,22200.0


 ### Save df_final

In [242]:
filename = "../../data/raw/scraped/books_written_followers_latest_cleaned.csv"
df_final.to_csv(filename)