# This notebook merges the scraped data (author stats and book discussion stats)

In [34]:
import os

import pandas as pd

In [35]:
# Directory holding the scraped CSV exports. Set the SCRAPED_DATA_DIR
# environment variable to run the notebook against another checkout; the
# fallback is the original author's local path, so existing runs are unchanged.
BASE_PATH = os.environ.get(
    "SCRAPED_DATA_DIR",
    "/home/mh06/dsti/new/dsti-ml-book-ratings/data/raw/scraped",
)
discussions = f"{BASE_PATH}/discussion/books_discussion_latest_cleaned.csv"
author_stats = f"{BASE_PATH}/books_written_followers_latest_cleaned.csv"

# on_bad_lines='warn' skips malformed lines with a warning instead of raising.
df_discussions = pd.read_csv(discussions, sep=',', on_bad_lines='warn')
df_authors = pd.read_csv(author_stats, sep=',', on_bad_lines='warn')

# Row/column counts of both inputs, for comparison before merging.
print(df_discussions.shape)
print(df_authors.shape)

(11127, 16)
(11123, 15)


There is a difference of 4 rows between these two datasets (11127 vs. 11123 rows, see the shapes above). When we first started scraping the author information (df_authors), we ignored the bad rows that contained one extra column, whereas for the discussion stats scraping we started by fixing these rows manually.

To simplify the data merge and the steps that follow, we will drop these rows from df_discussions (the inner merge on bookID further down keeps only the books present in both datasets).

In [36]:
# Show the column labels of both inputs so naming differences can be spotted
# before merging.
for frame in (df_discussions, df_authors):
    print(frame.columns)

Index(['Unnamed: 0', 'bookID', 'title', 'authors', 'average_rating', 'isbn',
       'isbn13', 'language_code', 'num_pages', 'ratings_count',
       'text_reviews_count', 'publication_date', 'publisher', 'quotes',
       'discussions', 'questions'],
      dtype='object')
Index(['Unnamed: 0', 'bookID', 'title', 'authors', 'average_rating', 'isbn',
       'isbn13', 'language_code', '  num_pages', 'ratings_count',
       'text_reviews_count', 'publication_date', 'publisher', 'written_books',
       'followers'],
      dtype='object')


In [37]:
# Remove the leftover 'Unnamed: 0' column and strip the stray leading spaces
# from the '  num_pages' label.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the drop succeed when the column is already gone, so this cell can be re-run
# without raising KeyError.
df_authors = (
    df_authors
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'  num_pages': 'num_pages'})
)
df_authors.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', 'num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher', 'written_books', 'followers'],
      dtype='object')

In [38]:
# Reduce df_discussions to the join key plus the three scraped metrics.
columns_to_merge = [
    'bookID',  # join key used by the merge
    'quotes',
    'discussions',
    'questions',
]
df_discussions = df_discussions[columns_to_merge]
df_discussions.columns

Index(['bookID', 'quotes', 'discussions', 'questions'], dtype='object')

In [39]:
# Inner join on bookID: only books present in both frames are kept.
# NOTE(review): the displayed result ends at index 11121 (11122 rows), one
# fewer than the 11123 rows of df_authors -- worth confirming which bookID
# has no matching discussion row.
merged_df = df_authors.merge(
    right=df_discussions,
    how='inner',
    on='bookID',
)
merged_df

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,written_books,followers,quotes,discussions,questions
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,609.0,22200.0,882.0,194.0,74.0
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,627.0,22200.0,1152.0,307.0,107.0
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,609.0,22200.0,745.0,483.0,127.0
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,627.0,22200.0,747.0,417.0,121.0
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,627.0,22200.0,1.0,20.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11118,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press,85.0,1093.0,1.0,1.0,0.0
11119,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books,85.0,1093.0,23.0,1.0,0.0
11120,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books,85.0,1093.0,9.0,16.0,0.0
11121,45639,Poor People,William T. Vollmann,3.72,0060878827,9780060878825,eng,434,769,139,2/27/2007,Ecco,85.0,1093.0,4.0,1.0,0.0


### Save merged file

In [40]:
# Write the merged table into the scraped-data directory; index=False keeps
# the row index out of the CSV.
filename = f"{BASE_PATH}/books_merged.csv"
merged_df.to_csv(path_or_buf=filename, index=False)