In [76]:
from pymongo import MongoClient
import pprint
import json
from bs4 import BeautifulSoup
import re

In [3]:

client = MongoClient('localhost', 27017)
db=client['reviews']
coll=db['user_reviews']

.find() creates a "cursor" which pulls the data. .limit() keeps you from pulling all the data. Wrapping the "cursor" in a list comprehension builds that cursor out iteratively into a list containing the things you want to manipulate. Ok? Cool. 

In [178]:
test = [x for x in coll.find().limit(1000)]
test[0]['userid']

7171593

now, its more complicated than that, because all of this is nested *again* So in the cursor list? you've got a dictonary that holds user_id and a list of reviews. so index into 'reviews' and you get to a list of your actual scraped bits, the data from goodreads. 

In [189]:
review = test[3]['reviews']

In [190]:
review

['<tr id="review_2072716103" class="bookalike review">\n  <td class="field checkbox"><label>checkbox</label><div class="value">      &nbsp;\n</div></td>  <td class="field position" style="display: none"><label>position</label><div class="value"></div></td>  <td class="field cover"><label>cover</label><div class="value">        <div class="js-tooltipTrigger tooltipTrigger" data-resource-type="Book" data-resource-id="204286">\n          <a href="/book/show/204286.The_Baby_Book"><img alt="The Baby Book: Everything You Need to Know About Your Baby from Birth to Age Two" id="cover_review_2072716103" src="https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1344271861l/204286._SX50_.jpg"></a>\n        </div>\n</div></td>  <td class="field title"><label>title</label><div class="value">    <a title="The Baby Book: Everything You Need to Know About Your Baby from Birth to Age Two" href="/book/show/204286.The_Baby_Book">\n      The Baby Book: Everything You Need to Know About Yo

you're going to want to make those giant strings of html into a handle-able-able object. So put it in a bowl with beatiful soup. 

In [52]:
soup = BeautifulSoup(review, 'html.parser')

In [53]:
print(soup.prettify())

<tr class="bookalike review" id="review_667385890">
 <td class="field checkbox">
  <label>
   checkbox
  </label>
  <div class="value">
  </div>
 </td>
 <td class="field position" style="display: none">
  <label>
   position
  </label>
  <div class="value">
  </div>
 </td>
 <td class="field cover">
  <label>
   cover
  </label>
  <div class="value">
   <div class="js-tooltipTrigger tooltipTrigger" data-resource-id="2767052" data-resource-type="Book">
    <a href="/book/show/2767052-the-hunger-games">
     <img alt="The Hunger Games (The Hunger Games, #1)" id="cover_review_667385890" src="https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1447303603l/2767052._SX50_.jpg"/>
    </a>
   </div>
  </div>
 </td>
 <td class="field title">
  <label>
   title
  </label>
  <div class="value">
   <a href="/book/show/2767052-the-hunger-games" title="The Hunger Games (The Hunger Games, #1)">
    The Hunger Games
    <span class="darkGreyText">
     (The Hunger Games, #1)
    </spa

In [198]:
title = soup.find_all(class_ = re.compile('title'))[0].text

title.split()

['title', 'The', 'Hunger', 'Games', '(The', 'Hunger', 'Games,', '#1)']

In [196]:
title = soup.find_all(class_ = re.compile('title'))[0].text
title

'title \n      The Hunger Games\n        (The Hunger Games, #1)\n'

In [205]:
pages = int(soup.find_all(class_ =re.compile('num_pages'))[0].text.split()[2])

In [211]:
av_rate = float(soup.find_all(class_ =re.compile('avg_rating'))[0].text.split()[2])
av_rate

4.33

In [212]:
num_rate = int(soup.find_all(class_ =re.compile('num_ratings'))[0].text.split()[2])

ValueError: invalid literal for int() with base 10: '5,936,319'

In [167]:
user_rating = soup.find_all(class_ =re.compile('field rating'))[0].text

**Putting it all together**

In [193]:
def gr_db_cleaner(find_lim = 10):
    '''
    a function that reads in goodreads user review tables gathered
    by the gr_scraper and returns a list for schema
    +++++++++++
    Atributes
    find_lim (int): how many user-reviews to clean.  
    +++++++
    '''
    
    client = MongoClient('localhost', 27017)
    db=client['reviews']
    collection=db['user_reviews']

    documents = [x for x in collection.find().limit(find_lim)]
    all_revs = []
    for idx, users in enumerate(documents):
        userid = documents[idx]['userid']
        review_list = documents[idx]['reviews']
        if len(review_list) ==0:
            sub_rev = [None, None, userid, None, None, None]
            all_revs.append(sub_rev)
        else:
            for review in review_list:
                soup = BeautifulSoup(review, 'html.parser')
                title = soup.find_all(class_ = re.compile('title'))[0].text
                pages = soup.find_all(class_ =re.compile('num_pages'))[0].text
                av_rate = soup.find_all(class_ =re.compile('avg_rating'))[0].text
                num_rate = soup.find_all(class_ =re.compile('num_ratings'))[0].text
                user_rating = soup.find_all(class_ =re.compile('field rating'))[0].text
                sub_rev = [title, pages, userid, user_rating, num_rate, av_rate]
                all_revs.append(sub_rev)
    return all_revs

In [195]:
gr_db_cleaner()

[[None, None, 7171593, None, None, None],
 [None, None, 6060131, None, None, None],
 ['title \n      The Hunger Games\n        (The Hunger Games, #1)\n',
  'num pages \n        374\n        pp\n\n',
  22197300,
  "Emme's rating\nit was amazing\n",
  'num ratings    5,936,319\n',
  'avg rating    4.33\n'],
 ['title \n      Divergent\n        (Divergent, #1)\n',
  'num pages \n        487\n        pp\n\n',
  22197300,
  "Emme's rating\nit was amazing\n",
  'num ratings    2,706,147\n',
  'avg rating    4.20\n'],
 ['title \n      Shiver\n        (The Wolves of Mercy Falls, #1)\n',
  'num pages \n        392\n        pp\n\n',
  22197300,
  "Emme's rating\nit was amazing\n",
  'num ratings    411,613\n',
  'avg rating    3.77\n'],
 ['title \n      Good Omens: The Nice and Accurate Prophecies of Agnes Nutter, Witch\n',
  'num pages \n        491\n        pp\n\n',
  22197300,
  "Emme's rating\nit was amazing\n",
  'num ratings    476,232\n',
  'avg rating    4.25\n'],
 ['title \n      The Hos

In [181]:
documents = [x for x in coll.find().limit(1000)]
documents[0]['userid']

7171593

In [183]:
documents[0]

{'_id': ObjectId('5e10abaa4243f69009db6d07'), 'reviews': [], 'userid': 7171593}

In [186]:
len(documents[3]['reviews'])

5

In [191]:
lst = []
len(lst)

0