In [76]:
from pymongo import MongoClient
import pprint
import json
from bs4 import BeautifulSoup
import re

In [3]:

# Connect to the MongoDB instance at localhost:27017 and keep handles to the
# 'reviews' database and its 'user_reviews' collection for the cells below.
client = MongoClient('localhost', 27017)
db=client['reviews']
coll=db['user_reviews']

`.find()` returns a "cursor" that pulls documents from the collection as you iterate over it. `.limit()` caps how many documents the cursor returns, which keeps you from pulling all the data. Wrapping the cursor in a list comprehension iterates over it and builds a list containing the documents you want to manipulate. Ok? Cool.

In [178]:
# Pull at most 1000 documents out of the cursor into a list, then look at the
# 'userid' of the first one.
test = [x for x in coll.find().limit(1000)]
test[0]['userid']

7171593

Now, it's more complicated than that, because all of this is nested *again*. Each item in the cursor list is a dictionary that holds a `userid` and a list of reviews. So index into `'reviews'` and you get a list of your actual scraped bits: the data from Goodreads.

In [189]:
# NOTE(review): gr_db_cleaner below loops over a document's 'reviews' value as a
# list of HTML strings, so this is presumably a list rather than a single
# review -- confirm before passing it to BeautifulSoup.
review = test[3]['reviews']

You're going to want to turn those giant strings of HTML into an object you can actually handle. So put it in a bowl with Beautiful Soup.

In [52]:
# Parse the scraped review markup using the 'html.parser' backend.
soup = BeautifulSoup(review, 'html.parser')

In [243]:
# print(soup.prettify())

In [242]:
# Take the first element whose class matches 'title'. Its text starts with a
# label line, so the book title is the second '\n'-separated line, stripped of
# the surrounding whitespace.
title = soup.find_all(class_ = re.compile('title'))[0].text.split('\n')[1].strip()
title


'The Hunger Games'

In [196]:
# Same element, but showing the raw .text before any splitting or stripping.
title = soup.find_all(class_ = re.compile('title'))[0].text
title

'title \n      The Hunger Games\n        (The Hunger Games, #1)\n'

In [252]:
# Page count: third whitespace-separated token of the first 'num_pages'
# element's text, converted to int.
pages = int(soup.find_all(class_ =re.compile('num_pages'))[0].text.split()[2])
pages

374

In [211]:
# Average rating: third whitespace-separated token of the first 'avg_rating'
# element's text, converted to float.
av_rate = float(soup.find_all(class_ =re.compile('avg_rating'))[0].text.split()[2])
av_rate

4.33

In [214]:
# Number of ratings: third whitespace-separated token of the first
# 'num_ratings' element's text; thousands separators are removed before int().
num_rate = int(soup.find_all(class_ =re.compile('num_ratings'))[0].text.split()[2].replace(',',''))
num_rate

5936319

In [232]:
def str_to_rate(qual_state):
    '''
    Translate the wording goodreads attaches to a star rating ("it was amazing",
    "did not like it", ...) into the number of stars it stands for.
    ++++++
    Attributes
    qual_state (list): whitespace-split tokens of the rating element's .text;
        only the trailing two or three tokens are inspected.
    ++++++
    Returns
    user_rating (int): 1-5 for a recognised phrase, 0 when no phrase matches.
    '''
    # Tried top to bottom. 'really liked it' must be listed before 'liked it',
    # because the two-token pattern would otherwise match it as well.
    phrase_to_stars = (
        (['it', 'was', 'amazing'], 5),
        (['really', 'liked', 'it'], 4),
        (['liked', 'it'], 3),
        (['it', 'was', 'ok'], 2),
        (['not', 'like', 'it'], 1),
    )
    for phrase, stars in phrase_to_stars:
        # Compare the tail of the token list against the phrase's own length.
        if qual_state[-len(phrase):] == phrase:
            return stars
    return 0



In [233]:
# Split the text of the first 'field rating' element into tokens and let
# str_to_rate turn the trailing phrase into a numeric score.
user_rating = str_to_rate(soup.find_all(class_ =re.compile('field rating'))[0].text.split())
user_rating

5

**Putting it all together**

In [255]:
def gr_db_cleaner(find_lim = 10):
    '''
    Read scraped goodreads user-review documents from MongoDB and flatten
    them into one row per review.
    +++++++++++
    Attributes
    find_lim (int): how many user documents to pull from the 'user_reviews'
        collection (passed straight to the cursor's .limit()).
    +++++++++++
    Returns
    all_revs (list of lists): one row per review, laid out as
        [title, pages, userid, user_rating, num_rate, av_rate].
        A user whose 'reviews' list is empty contributes a single placeholder
        row [None, None, userid, None, None, None]. pages is None when the
        page count cannot be located or is not an integer.
    '''
    # The context manager closes the connection as soon as the documents have
    # been read into memory, so repeated calls do not pile up open clients.
    with MongoClient('localhost', 27017) as client:
        collection = client['reviews']['user_reviews']
        documents = list(collection.find().limit(find_lim))

    all_revs = []
    for document in documents:
        userid = document['userid']
        review_list = document['reviews']

        if len(review_list) == 0:
            # Keep users without reviews visible in the output.
            all_revs.append([None, None, userid, None, None, None])
            continue

        for review in review_list:
            soup = BeautifulSoup(review, 'html.parser')

            # The title element's text starts with a label line; the title
            # itself is the second '\n'-separated line.
            title = soup.find_all(class_=re.compile('title'))[0].text.split('\n')[1].strip()

            try:
                pages = int(soup.find_all(class_=re.compile('num_pages'))[0].text.split()[2])
            except (IndexError, ValueError):
                # No matching element, too few tokens, or a non-integer token.
                # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
                pages = None

            av_rate = float(soup.find_all(class_=re.compile('avg_rating'))[0].text.split()[2])
            num_rate = int(
                soup.find_all(class_=re.compile('num_ratings'))[0].text.split()[2].replace(',', '')
            )
            user_rating = str_to_rate(
                soup.find_all(class_=re.compile('field rating'))[0].text.split()
            )

            all_revs.append([title, pages, userid, user_rating, num_rate, av_rate])
    return all_revs

In [256]:
# Run with the default find_lim=10 to inspect the flattened rows.
gr_db_cleaner()

[[None, None, 7171593, None, None, None],
 [None, None, 6060131, None, None, None],
 ['The Hunger Games', 374, 22197300, 5, 5936319, 4.33],
 ['Divergent', 487, 22197300, 5, 2706147, 4.2],
 ['Shiver', 392, 22197300, 5, 411613, 3.77],
 ['Good Omens: The Nice and Accurate Prophecies of Agnes Nutter, Witch',
  491,
  22197300,
  5,
  476232,
  4.25],
 ['The Host', 620, 22197300, 5, 848921, 3.84],
 ['Sabriel', 491, 22197300, 5, 170803, 4.17],
 ['Lirael', 464, 22197300, 5, 100837, 4.29],
 ['Fire', 480, 22197300, 5, 152215, 4.11],
 ['Graceling', 471, 22197300, 5, 359270, 4.09],
 ['Inkheart', 563, 22197300, 5, 342693, 3.88],
 ['The Baby Book: Everything You Need to Know About Your Baby from Birth to Age Two',
  784,
  69817947,
  5,
  6955,
  4.17],
 ['The No-Cry Sleep Solution: Gentle Ways to Help Your Baby Sleep Through the Night',
  254,
  69817947,
  5,
  7776,
  3.49],
 ['Attachment Parenting: Instinctive Care for Your Baby and Young Child',
  336,
  69817947,
  5,
  415,
  3.98],
 ['Nigh

In [257]:
import numpy as np
import pandas as pd