In [2]:
import os
import sys
import json
import io
import re
import requests
from lxml import etree
from lxml import html
import pandas as pd

def add_modules():
    """
    Locate a directory called `modules` and make it importable.

    The search begins in the current working directory and climbs towards
    the file system root, giving up after five parent levels.  When such a
    directory exists, its absolute path is appended to `sys.path` unless
    that exact path is already listed.

    Params: None

    Return: None
    """
    max_climb = 5
    candidate = "."
    for _ in range(max_climb):
        if os.path.isdir(os.path.join(candidate, "modules")):
            break
        candidate = os.path.join(candidate, "..")

    # After the loop `candidate` is either the level where `modules` was
    # found or the fifth parent, which still has to be checked here.
    resolved = os.path.abspath(os.path.join(candidate, "modules"))
    if os.path.isdir(resolved) and resolved not in sys.path:
        sys.path.append(resolved)

# Extend sys.path with the `modules` directory (when one is found) before
# attempting the import below; the order of these two statements matters.
add_modules()
# NOTE(review): `util` is presumably a project module living in that `modules`
# directory - confirm. The import fails if it is not importable some other way.
import util

# Output directory; main() passes it to createCVS as the CSV destination.
# NOTE(review): the behavior of util.resolve_dir is not visible here - confirm
# what it returns (and whether it creates anything) for "final-project".
datadir = util.resolve_dir("final-project")

# lxml XML parser with remove_blank_text=True (discard whitespace-only text
# between tags). It is not referenced by any of the cells shown here.
myparser = etree.XMLParser(remove_blank_text=True)

In [3]:
def buildUrl(location, resource, protocol="https", extension=None, port=None):
    '''
    This function builds a url of the form
    protocol://location[:port]/resource[.extension]

    Parameters: location: the host part of the url (e.g. "www.goodreads.com")
            resource: the path part of the url; a leading '/' is added when
                      missing, so an empty resource yields just '/'
            protocol: the url protocol, defaulting to https
            extension: optional suffix appended to the resource after a '.',
                       defaulting to none
            port: optional port appended to the location after a ':',
                  defaulting to none

    Return: the url created, as a string
    '''
    if port is not None:
        location = '{}:{}'.format(location, port)

    # startswith() is used instead of resource[0] so that an empty resource
    # is treated as the root path rather than raising IndexError.
    if not resource.startswith('/'):
        resource = '/' + resource
    if extension is not None:
        resource = resource + '.' + extension

    return protocol + '://' + location + resource

In [4]:
def createRequest(location, resource, protocol="https", extension=None, port=None, timeout=30):
    '''
    This function builds a url and performs an HTTP GET request on it.

    Parameters: location: the url location of the goodreads list
            resource: the url resource of the goodreads list
            protocol: the url protocol, defaulting to https
            extension: the url extension, defaulting to none
            port: the port of the url, defaulting to none
            timeout: seconds to wait for the server before giving up,
                     defaulting to 30 (passed straight to requests.get)

    Return: the requests response object, guaranteed to have status 200

    Raises: AssertionError when the server answers with any status other
            than 200; exceptions raised by requests itself (connection
            errors, timeouts) propagate unchanged
    '''
    url = buildUrl(location, resource, protocol, extension, port)
    # Without a timeout a stalled server would block the notebook forever.
    resp = requests.get(url, timeout=timeout)
    # Raised explicitly instead of via `assert`, which is stripped when Python
    # runs with -O; the exception type is kept so existing callers still work.
    if resp.status_code != 200:
        raise AssertionError(
            "GET {} returned HTTP status {}".format(url, resp.status_code))
    return resp

In [5]:
def HTMLtoXML(location, resource, protocol="https", extension=None, port=None):
    '''
    This function downloads a page and parses its HTML with lxml.html,
    handing back the root element of the parsed tree so that callers can
    run xpath queries against it.

    Parameters: location: the url location of the goodreads list
            resource: the url resource of the goodreads list
            protocol: the url protocol, defaulting to https
            extension: the url extension, defaulting to none
            port: the port of the url, defaulting to none

    Return: the root element of the parsed document
    '''
    response = createRequest(location, resource, protocol, extension, port)
    # The raw response bytes are wrapped in a file-like object for the parser.
    document = html.parse(io.BytesIO(response.content))
    return document.getroot()

In [31]:
# Absolute xpath prefixes shared by the queries below.  They mirror the page
# layout this scraper was written against and break if that layout changes.
_LIST_CELL = "/html/body/div[2]/div[3]/div[1]/div[2]/div[3]/div[5]/table//tr/td[3]"
_BOOK_DETAILS = "/html/body/div[2]/div[3]/div[1]/div[2]/div[4]/div[1]/div[2]/div[5]"
_AUTHOR_STATS = "/html/body/div[2]/div[3]/div[1]/div[2]/div[3]/div[2]/div[{}]/div/div[2]/div/div[1]"


def _parse_rating_blurb(blurb):
    '''Split one rating blurb from the list page into (average, count).'''
    tokens = blurb.split(' ')
    # Every thousands separator is removed; joining only the first two comma
    # groups would turn "1,234,567" into 1234.
    return float(tokens[1]), int(tokens[5].replace(',', ''))


def _scrape_book(location, book_href):
    '''Fetch one book page once and return (isbn, release_date), None where missing.'''
    try:
        book_root = HTMLtoXML(location, book_href)
    except Exception:
        # Best effort: an unreachable book page yields an empty row, which the
        # caller later drops because its ISBN is missing.
        return None, None

    try:
        isbn = book_root.xpath(
            _BOOK_DETAILS + "/div[3]/div[1]/div[2]/div[2]/span/span/text()")[0]
    except Exception:
        isbn = None

    try:
        published = book_root.xpath(_BOOK_DETAILS + "/div[2]/text()")[0]
        # The date is taken from the third line of the text node, with the
        # double-space padding stripped.
        release = published.split('\n')[2].replace("  ", "")
    except Exception:
        release = None

    return isbn, release


def _scrape_author(author_href):
    '''Fetch one author page and return (rating, works), or (None, None) on any failure.'''
    try:
        author_path = author_href.split('https://www.goodreads.com')[1]
        author_root = HTMLtoXML("www.goodreads.com", author_path)

        # The statistics block is looked up under div[14] first and under
        # div[12] when the first position yields nothing.
        stats = _AUTHOR_STATS.format(14)
        rating_hits = author_root.xpath(stats + "/span[2]/span/text()")
        if not rating_hits:
            stats = _AUTHOR_STATS.format(12)
            rating_hits = author_root.xpath(stats + "/span[2]/span/text()")

        rating = rating_hits[0]
        works = author_root.xpath(stats + "/a[1]/text()")[0].split(' ')[0]
    except Exception:
        return None, None
    # Both values are produced together so the two output columns can never
    # end up with different lengths.
    return rating, works


def buildGoodReadsTable(location, resource, protocol="https", extension=None, port=None):
    '''
    This function builds a pandas table from a goodreads list of books.

    The list page supplies title, author, average rating and rating count.
    Each book page is then fetched for ISBN and release date, and each
    author page for the author's rating and number of works.  Failures on
    book or author pages are recorded as missing values instead of
    aborting the scrape.

    Parameters: location: the url location of the goodreads list
                resource: the url resource of the goodreads list
                protocol: the url protocol, defaulting to https
                extension: the url extension, defaulting to none
                port: the port of the url, defaulting to none

    Return: the pandas table with columns title, author, avg_rating,
            total_reviews, ISBN, release_date, author_rating and
            author_works; rows without an ISBN are dropped

    Raises: ValueError when the list page does not yield the same number
            of titles, authors, ratings, book links and author links
    '''
    tableroot = HTMLtoXML(location, resource, protocol, extension, port)
    titles = tableroot.xpath(_LIST_CELL + "/a/span/text()")
    authors = tableroot.xpath(_LIST_CELL + "/span[2]/div/a/span/text()")
    ratings = tableroot.xpath(_LIST_CELL + "/div[1]/span/span/text()")
    book_resource = tableroot.xpath(_LIST_CELL + "/a/@href")
    author_resource = tableroot.xpath(_LIST_CELL + "/span[2]/div/a/@href")

    # Fail fast: pandas would reject unequal columns anyway, but only after
    # every book and author page had already been downloaded.
    scraped = {
        "titles": titles,
        "authors": authors,
        "ratings": ratings,
        "book links": book_resource,
        "author links": author_resource,
    }
    if len({len(values) for values in scraped.values()}) > 1:
        raise ValueError(
            "list page columns have different lengths: "
            + ", ".join("{}={}".format(name, len(values))
                        for name, values in scraped.items()))

    rating_pairs = [_parse_rating_blurb(blurb) for blurb in ratings]
    book_pairs = [_scrape_book(location, href) for href in book_resource]
    author_pairs = [_scrape_author(href) for href in author_resource]

    table = pd.DataFrame({
        "title": titles,
        "author": authors,
        "avg_rating": [average for average, _ in rating_pairs],
        "total_reviews": [count for _, count in rating_pairs],
        "ISBN": [isbn for isbn, _ in book_pairs],
        "release_date": [released for _, released in book_pairs],
        "author_rating": [rating for rating, _ in author_pairs],
        "author_works": [works for _, works in author_pairs],
    })
    return table.dropna(subset=['ISBN'])


In [32]:
def createCVS(table, filename, datadir):
    '''
    This function writes a pandas table to a csv file, leaving out the
    index column.

    Parameters: table: the pandas table that will be put into the csv
                filename: the name of the csv file
                datadir: the directory in which the csv file is created

    Return: None
    '''
    destination = os.path.join(datadir, filename)
    table.to_csv(destination, index=False)

In [33]:
def main():
    '''
    This function scrapes the "Best Fantasy of the 2020s" goodreads list
    and stores the resulting table as goodreads_data.csv inside the
    module-level datadir.

    Parameters: None

    Return: None
    '''
    location = "www.goodreads.com"
    resource = "/list/show/146629.Best_Fantasy_of_the_2020s"
    filename = "goodreads_data.csv"

    goodreads_table = buildGoodReadsTable(location, resource)
    createCVS(goodreads_table, filename, datadir)

    # Check the file where createCVS wrote it (inside datadir); looking in the
    # working directory fails, or passes on a stale file, when the two differ.
    output_path = os.path.join(datadir, filename)
    assert os.path.isfile(output_path), "expected csv at {}".format(output_path)

In [34]:
# Run the scrape. This performs live HTTP requests to www.goodreads.com (the
# list page plus every linked book and author page) and writes
# goodreads_data.csv into datadir.
main()