In [1]:
import scrapy
import pandas as pd
from scrapy.crawler import CrawlerProcess

# Dictionary of lists to store our results. Probably a better way to do this.
features = ['date', 'seen', 'score', 'helpful', 'review']
reviews = {x:list() for x in features}

class MALCrawler(scrapy.Spider):
    name = 'MAL Crawler'
    def start_requests(self):
        # List of urls to scrape
        #urls = ['https://myanimelist.net/anime/20/Naruto/reviews?p=' + str(x) for x in range(1,8)]
        urls = ['https://myanimelist.net/anime/34798/Yuru_Camp△/reviews?p=' + str(x) for x in range(1,9)]
        for url in urls:
            yield scrapy.Request(url = url, callback = self.parse)
    
    # What we're gonna parse from each url
    def parse(self, response):
        # Get dates
        for date in response.xpath('//div[@class="mb8"]/div[1]/text()').extract():
            reviews['date'].append(date)
        
        # Get episodes seen
        for seen in response.xpath('//div[@class="mb8"]/div[2]/text()').extract():
            reviews['seen'].append(seen.strip())
            
        # Get score
        for score in response.xpath('//div[@class="mb8"]/div[3]/text()[2]').extract():
            reviews['score'].append(score[1:].strip())
            
        # Get helpful
        for helpful in response.xpath('//span[contains(@id,"rhelp")]/text()').extract():
            reviews['helpful'].append(helpful)
            
        # Get reviews
        for element in response.xpath('//div[@class="spaceit textReadability word-break pt8 mt8"]'):
            # Grab all instances of text (will be several having been broken up by br tags)
            text = [x.strip() for x in element.xpath('./text()').extract()]
            # Converts text into a string when you join it. 
            text_preview = " ".join(text)

            # So the above gets the previewed text. We now want the text body.
            text = [x.strip() for x in element.xpath('./span[contains(@id,review)]/text()').extract()]
            text_body = " ".join(text)

            # We now want to combine the preview and body together into a list
            text_review = text_preview + ' ' + text_body
            reviews['review'].append(text_review)
        
process = CrawlerProcess()

process.crawl(MALCrawler)

process.start()

2021-01-15 14:36:03 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-01-15 14:36:03 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Windows-7-6.1.7601-SP1
2021-01-15 14:36:03 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-01-15 14:36:03 [scrapy.crawler] INFO: Overridden settings:
{}
2021-01-15 14:36:03 [scrapy.extensions.telnet] INFO: Telnet Password: c5562cf66d6a2f58
2021-01-15 14:36:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-01-15 14:36:04 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.

In [3]:
# Looks good so far!
df = pd.DataFrame(reviews)

# #df.to_csv('Datacamp CSV/Naruto_Reviews.csv')
# df['helpful'] = df['helpful'].astype(int)
# df.sort_values(['helpful'], ascending = False)

df.to_csv('Datacamp CSV/YuruCamp_Reviews.csv')

In [2]:
# Looks good so far!
df = pd.DataFrame(reviews)

#df.to_csv('Datacamp CSV/Naruto_Reviews.csv')
df

Unnamed: 0,date,seen,score,helpful,review
0,"Apr 27, 2008",220 of 220 episodes seen,7,1060,"To be honest, Naruto is the most overhyped sh..."
1,"Jun 25, 2008",220 of 220 episodes seen,7,730,I began to browse through the reviews of Naru...
2,"Dec 3, 2011",220 of 220 episodes seen,10,344,"Naruto, yes Naruto. One of the series that ha..."
3,"Apr 30, 2011",220 of 220 episodes seen,8,321,"I'm sure most, if not all, the people who end..."
4,"Dec 18, 2014",220 of 220 episodes seen,4,227,I would like to start this review off by sayi...
5,"Jun 26, 2010",180 of 220 episodes seen,10,203,I've heard a lot of people call this the most...
6,"Dec 19, 2008",220 of 220 episodes seen,1,166,ok so I've seen like 200+ series and i have t...
7,"Jun 8, 2009",40 of 220 episodes seen,3,140,"Proof that I will watch just about anything, ..."
8,"Sep 3, 2008",220 of 220 episodes seen,9,116,Having watched many different series of anime...
9,"Feb 5, 2009",220 of 220 episodes seen,10,110,"Naruto is a hit or miss show. You love it, or..."


In [None]:
# Test trying to look at MAL for anime reviews

import requests
import pandas as pd
from scrapy import Selector

# Main structure of URL is relatively easy, we can loop from p=1 to p=N
url = 'https://myanimelist.net/anime/20/Naruto/reviews?p=1'

html = requests.get(url).content

sel = Selector(text = html)

In [71]:
# We want certain information from each review page. 
# 1. Overall score
# 2. Number of episodes seen
# 3. Actual Review
# 4. Date of Review
# 5. No. of people who found it useful.

# How to identify each within the html block?

# Get the date of each review on the page
# Each date, episodes seen, and score are contained within the mb8 class. 
# Looking at the pattern, div[1] is the date, div[2] is the number of episodes seen, and 
# div[4] is the actual score

reviews = {}

# All dates
reviews['date'] = sel.xpath('//div[@class="mb8"]/div[1]/text()').extract()

# All episodes seen
reviews['seen'] = [x.strip() for x in sel.xpath('//div[@class="mb8"]/div[2]/text()').extract()]

# All scores
reviews['score'] = [x[1:].strip() for x in sel.xpath('//div[@class="mb8"]/div[3]/text()[2]').extract()]

# Let's try and find out how to get the number of people seen
# It seems that it's contained within a span tag that contains an id with rhelp'reviewno.'
reviews['help'] = sel.xpath('//span[contains(@id,"rhelp")]/text()').extract()

# Finally, lets figure out how to get the actual review.
# This is where it gets kind of weird. It seems that reviews are contained within two parts. 
# One within a div tag with @class="spaceit...etc.'' and a span tag with id = review'reviewno.'
# We need to get both parts

# Another problem is that there are a ton of <br> tags, which really fuck up the text because
# they separate them into different instances of text each time. We'll need to 
# make a loop to attach them again
review = list()

for element in sel.xpath('//div[@class="spaceit textReadability word-break pt8 mt8"]'):
    # Grab all instances of text (will be several having been broken up by br tags)
    text = [x.strip() for x in element.xpath('./text()').extract()]
    # Converts text into a string when you join it. 
    text_preview = " ".join(text)
    
    # So the above gets the previewed text. We now want the text body.
    text = [x.strip() for x in element.xpath('./span[contains(@id,review)]/text()').extract()]
    text_body = " ".join(text)
    
    # We now want to combine the preview and body together into a list
    review.append(text_preview + ' ' + text_body)

# Add it to our dictionary
reviews['review'] = review

In [104]:
# Very cool, we've managed to collect 20 reviews by scraping the MAL site for Naruto!
# We can now attempt to create a scrapy spider that will attempt to grab around 400 reviews 
# for Naruto.
df = pd.DataFrame(reviews)