In [1]:
import re
import json
import gzip
import numpy as np
import pandas as pd
import ast
from collections import defaultdict

In [2]:
def parse(path):
    """Lazily yield every line of a gzip-compressed file as raw ``bytes``.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.gz`` file to read.

    Yields
    ------
    bytes
        One line of the decompressed file, trailing newline included.
        Nothing is decoded or parsed here; the caller is expected to do
        that (e.g. with ``json.loads``, which accepts ``bytes``).

    Notes
    -----
    The file is opened only when iteration starts, and the ``with`` block
    closes it once the generator is exhausted or discarded. The generator
    is single-pass: call ``parse`` again to re-read the file.
    """
    # Binary mode: the lines are handed over untouched, no eval()/repr
    # round-trip is needed to obtain the bytes object.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield l

In [3]:
# PARSING DATA FILES FOR AUTHORS AND GENRES: COMICS-GRAPHICS AND FANTASY-PARANORMAL
#
# Every name below is bound to the result of parse() for one compressed
# JSON-lines dump. All dumps live under a single root folder, so the root is
# kept in one constant: edit DATA_DIR to point the notebook at another copy
# of the data.

DATA_DIR = '/home/014491542/CMPE-256 HW2/Datasets'

booksComics = parse(DATA_DIR + '/Comics-Graphics/goodreads_books_comics_graphic.json.gz')
reviewsComics = parse(DATA_DIR + '/Comics-Graphics/goodreads_reviews_comics_graphic.json.gz')
booksFantasy = parse(DATA_DIR + '/Fantasy-Paranormal/goodreads_books_fantasy_paranormal.json.gz')
reviewsFantasy = parse(DATA_DIR + '/Fantasy-Paranormal/goodreads_reviews_fantasy_paranormal.json.gz')
authors = parse(DATA_DIR + '/Authors/goodreads_book_authors.json.gz')

In [4]:
# COLUMN SELECTIONS FOR BOOK DATA, AUTHORS DATA AND REVIEW DATA
# Each list names the fields kept (in this order) from the corresponding dump.

book_cols = [
    'book_id',
    'title',
    'text_reviews_count',
    'country_code',
    'language_code',
    'popular_shelves',
    'is_ebook',
    'average_rating',
    'description',
    'authors',
    'num_pages',
]

review_cols = [
    'book_id',
    'user_id',
    'review_text',
    'rating',
    'n_votes',
    'n_comments',
]

auth_cols = [
    'author_id',
    'name',
]

In [5]:
# DATAFRAME GENERATION FOR COMIC BOOKS
#
# Each item produced by `booksComics` is decoded with json.loads, and only
# the fields listed in `book_cols` are accumulated column-wise in `bComics`
# (column name -> list of values); the other fields are never stored.

bComics = defaultdict(list)
for book in booksComics:
    temp = json.loads(book)
    # Append exactly one value per wanted column for every record so all
    # column lists keep the same length; a field missing from a record is
    # stored as None instead of shifting the rows that follow.
    for key in book_cols:
        bComics[key].append(temp.get(key))

# `columns=` pins the column order and still produces the expected (empty)
# columns if the input yielded no records.
bComics_df = pd.DataFrame(bComics, columns=book_cols)

print(bComics_df.shape)
print(bComics_df.columns)

(89411, 11)
Index(['book_id', 'title', 'text_reviews_count', 'country_code',
       'language_code', 'popular_shelves', 'is_ebook', 'average_rating',
       'description', 'authors', 'num_pages'],
      dtype='object')


In [6]:
# DATAFRAME GENERATION FOR FANTASY BOOKS
#
# Each item produced by `booksFantasy` is decoded with json.loads, and only
# the fields listed in `book_cols` are accumulated column-wise in `bFantasy`
# (column name -> list of values); the other fields are never stored.

bFantasy = defaultdict(list)
for book in booksFantasy:
    temp = json.loads(book)
    # Append exactly one value per wanted column for every record so all
    # column lists keep the same length; a field missing from a record is
    # stored as None instead of shifting the rows that follow.
    for key in book_cols:
        bFantasy[key].append(temp.get(key))

# `columns=` pins the column order and still produces the expected (empty)
# columns if the input yielded no records.
bFantasy_df = pd.DataFrame(bFantasy, columns=book_cols)

print(bFantasy_df.shape)
print(bFantasy_df.columns)

(258585, 11)
Index(['book_id', 'title', 'text_reviews_count', 'country_code',
       'language_code', 'popular_shelves', 'is_ebook', 'average_rating',
       'description', 'authors', 'num_pages'],
      dtype='object')


In [8]:
# DATAFRAME GENERATION FOR COMIC BOOKS REVIEWS
#
# Each item produced by `reviewsComics` is decoded with json.loads, and only
# the fields listed in `review_cols` are accumulated column-wise in `rComics`
# (column name -> list of values); the other fields are never stored.

rComics = defaultdict(list)
for review in reviewsComics:
    temp = json.loads(review)
    # Append exactly one value per wanted column for every record so all
    # column lists keep the same length; a field missing from a record is
    # stored as None instead of shifting the rows that follow.
    for key in review_cols:
        rComics[key].append(temp.get(key))

# `columns=` pins the column order and still produces the expected (empty)
# columns if the input yielded no records.
rComics_df = pd.DataFrame(rComics, columns=review_cols)

print(rComics_df.shape)
print(rComics_df.columns)

(542338, 6)
Index(['book_id', 'user_id', 'review_text', 'rating', 'n_votes', 'n_comments'], dtype='object')


In [7]:
# DATAFRAME GENERATION FOR FANTASY BOOKS REVIEWS
#
# Each item produced by `reviewsFantasy` is decoded with json.loads, and only
# the fields listed in `review_cols` are accumulated column-wise in
# `rFantasy` (column name -> list of values); the other fields are never
# stored.

rFantasy = defaultdict(list)
for review in reviewsFantasy:
    temp = json.loads(review)
    # Append exactly one value per wanted column for every record so all
    # column lists keep the same length; a field missing from a record is
    # stored as None instead of shifting the rows that follow.
    for key in review_cols:
        rFantasy[key].append(temp.get(key))

# `columns=` pins the column order and still produces the expected (empty)
# columns if the input yielded no records.
rFantasy_df = pd.DataFrame(rFantasy, columns=review_cols)

print(rFantasy_df.shape)
print(rFantasy_df.columns)

(3424641, 6)
Index(['book_id', 'user_id', 'review_text', 'rating', 'n_votes', 'n_comments'], dtype='object')


In [9]:
# DATAFRAME GENERATION FOR AUTHORS
#
# Each item produced by `authors` is decoded with json.loads, and only the
# fields listed in `auth_cols` are accumulated column-wise in `authorsDict`
# (column name -> list of values); the other fields are never stored.

authorsDict = defaultdict(list)
for author in authors:
    temp = json.loads(author)
    # Append exactly one value per wanted column for every record so all
    # column lists keep the same length; a field missing from a record is
    # stored as None instead of shifting the rows that follow.
    for key in auth_cols:
        authorsDict[key].append(temp.get(key))

# `columns=` pins the column order and still produces the expected (empty)
# columns if the input yielded no records.
author_df = pd.DataFrame(authorsDict, columns=auth_cols)

print(author_df.shape)
print(author_df.columns)

(829529, 2)
Index(['author_id', 'name'], dtype='object')


In [10]:
# JOIN FANTASY BOOKS WITH THEIR REVIEWS (INNER JOIN ON book_id)
# Only rows whose book_id appears in both frames survive; every resulting
# row is then tagged with the genre label.

fantasyCombined = bFantasy_df.merge(rFantasy_df, on='book_id', how='inner')
fantasyCombined['genre'] = 'Fantasy and Paranormal'
print(fantasyCombined.shape)

(3424641, 17)


In [11]:
# JOIN COMIC BOOKS WITH THEIR REVIEWS (INNER JOIN ON book_id)
# Only rows whose book_id appears in both frames survive; every resulting
# row is then tagged with the genre label.

comicCombined = bComics_df.merge(rComics_df, on='book_id', how='inner')
comicCombined['genre'] = 'Comics and Graphics'
print(comicCombined.shape)

(542338, 17)


In [16]:
# Write the merged comics books+reviews frame to CSV, leaving out the index.
comicCombined.to_csv(
    '/home/014491542/CMPE-256 HW2/Datasets/csv-files/Comics-Graphics.csv',
    index=False,
)

In [24]:
# STACKING BOTH GENRES INTO THE FINAL DATASET
# Row-wise concatenation: all fantasy rows first, then all comic rows.
# The original row labels of each frame are kept as they are.

final_df = pd.concat(
    objs=[fantasyCombined, comicCombined],
    axis='index',
)
print(final_df.shape)

(3966979, 17)


In [25]:
# ADDING AUTHOR ID COLUMN TO THE FINAL DF
#
# For every row, take the 'author_id' of the first entry in that row's
# `authors` list. Cells that already hold a list are read directly; a cell
# holding the string form of such a list is parsed with ast.literal_eval
# first. Rows with no usable author entry get None instead of raising.

auth_id = list()
for author in final_df.authors:
    if isinstance(author, str):
        # String-encoded list (e.g. after a CSV round-trip) -> real list.
        author = ast.literal_eval(author)
    if isinstance(author, (list, tuple)) and author:
        auth_id.append(author[0]['author_id'])
    else:
        # Empty or missing authors: no id available for this row.
        auth_id.append(None)

# A plain list is assigned by position, one value per row.
final_df['author_id'] = auth_id

In [26]:
# Attach the author's name via an inner join on author_id, discard the
# helper/list columns, expose the name as 'author_name', and finally drop
# every row that still contains a missing value.
final_df = (
    final_df
    .merge(author_df, on='author_id', how='inner')
    .drop(columns=['author_id', 'authors', 'popular_shelves'])
    .rename(columns={'name': 'author_name'})
    .dropna()
)
print(final_df.shape)

(3966979, 16)


In [27]:
# Preview the first five rows of the assembled dataset.
final_df.head(n=5)

Unnamed: 0,book_id,title,text_reviews_count,country_code,language_code,is_ebook,average_rating,description,num_pages,user_id,review_text,rating,n_votes,n_comments,genre,author_name
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",7,US,eng,False,4.03,Omnibus book club edition containing the Ladie...,600,d202352e716421da4ef3f7937df7c3f7,Competent. I'm sure I read the first half year...,3,1,0,Fantasy and Paranormal,Barbara Hambly
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",7,US,eng,False,4.03,Omnibus book club edition containing the Ladie...,600,18b61748a52de671cb549d812b9740c8,"While these books are excellent overall, I thi...",4,0,0,Fantasy and Paranormal,Barbara Hambly
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",7,US,eng,False,4.03,Omnibus book club edition containing the Ladie...,600,24ec19e5301ffc5c8a030eac0d7a4ed2,A collection of two novels about the mercenary...,3,0,0,Fantasy and Paranormal,Barbara Hambly
3,438134,Bride of the Rat God,65,US,eng,False,3.76,Chrysanda Flamande was the sultriest vamp of t...,336,018fb46979eaef202859956d5ccb905b,I'm always impressed by how different the book...,4,0,0,Fantasy and Paranormal,Barbara Hambly
4,438134,Bride of the Rat God,65,US,eng,False,3.76,Chrysanda Flamande was the sultriest vamp of t...,336,c2579fbf15fdad5cff26de01fe973db6,Absolute hoot of a book. Early Hollywood setti...,4,0,0,Fantasy and Paranormal,Barbara Hambly


In [28]:
# WRITING THE FINAL DATASET TO DISK AS CSV (INDEX COLUMN OMITTED)

final_df.to_csv(
    '/home/014491542/CMPE-256 HW2/Datasets/csv-files/DataSet.csv',
    index=False,
)