# Scraping MERISTATION

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import newspaper
from newspaper import Article
from newspaper import Source
from newspaper import fulltext

## Scraping Articles Site 

In [2]:
review_url = 'https://as.com/meristation/analisis/'
html = requests.get(review_url).content
html[:50]

b'<!DOCTYPE html>\n<html lang="es">\n<head>\n<meta char'

In [3]:
soup = BeautifulSoup(html, 'lxml')
articles = soup.find_all('h2')

In [4]:
#Extraer títulos

titles = []

for a in articles:
    if a.find('a') == None:
        pass
    else:
        titles.append(a.find('a')['title'])
    
titles[0:5]

['OkunoKA Madness, análisis',
 'art of rally, análisis',
 'Crash Bandicoot 4, Análisis. Celebración plataformera',
 'WWE 2K Battlegrounds, análisis. La lucha libre nos vuelve a romper el corazón',
 'Turtle Beach Stealth 700 Gen 2, Análisis']

In [5]:
#Extraer links

links = []

for a in articles:
    if a.find('a') == None:
        pass
    else:
        links.append(a.find('a')['href'])
    
links[0:6]

['https://as.com/meristation/2020/10/05/analisis/1601886314_918896.html',
 'https://as.com/meristation/2020/10/03/analisis/1601701228_240717.html',
 'https://as.com/meristation/2020/10/01/analisis/1601557244_384472.html',
 'https://as.com/meristation/2020/09/30/analisis/1601454855_979194.html',
 'https://as.com/meristation/2020/09/29/mexico/1601416248_335912.html',
 'https://as.com/meristation/2020/09/29/analisis/1601370434_254241.html']

## Page parsing and link retrieving function for Meristation

In [6]:
def meristation_link_retrieve(num_pages):
    links = []
    
    # pages parser
    for i in range(num_pages):
        url = f'https://as.com/meristation/analisis/{331-i}'
        #print(url)

        # building soup    
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('h2')

        # links retrieve
        for a in articles:
            if a.find('a') == None:
                pass
            else:
                links.append(a.find('a')['href'])
        
    #print(links)
    
    #Deleting non review links
    for link in links:
        if 'analisis' not in link:
            links.remove(link)
                
    return links


In [10]:
num_pages = 50

links = meristation_link_retrieve(num_pages)

print(len(links))
print(links[:5])

1457
['https://as.com/meristation/2020/10/05/analisis/1601886314_918896.html', 'https://as.com/meristation/2020/10/03/analisis/1601701228_240717.html', 'https://as.com/meristation/2020/10/01/analisis/1601557244_384472.html', 'https://as.com/meristation/2020/09/30/analisis/1601454855_979194.html', 'https://as.com/meristation/2020/09/29/analisis/1601370434_254241.html']


## Scraping Single Review

In [63]:
review_url = 'https://as.com/meristation/2020/09/29/analisis/1601370434_254241.html'
review_html = requests.get(review_url).content
review_html[:50]

b'<!DOCTYPE html>\n<html lang="es">\n<head>\n<meta char'

In [73]:
soup = BeautifulSoup(review_html, 'lxml')

In [74]:
p_tags = soup.find('div', {'class': 'art-body'}).find_all('p')

In [75]:
review = [tag.text for tag in p_tags]
review = ' '.join(review)

In [76]:
review[0:1000]

'"Un consejo, señor, no se acerque nunca al lago... Y sobre todo, tápese los oídos si oye cantar la voz bajo el agua...", El fantasma de la ópera" (1910), Gastón Leroux En un videojuego, lo mismo que en una película, poco importa si es grande o pequeño, si lo han hecho un estudio de 100 personas con 80 millones de presupuesto, o 5 con apenas unas decenas de miles. Lo que cuenta es su eficacia, su capacidad para lograr el fin último que su género busca. Maid of Sker logra eso en su estupenda primera parte, pero lo arruina en su segunda, en un juego con 2 mitades diferenciadas por sus mecánicas. Pero incluso tal como está, su historia merece ser descubierta. Tras un Don’t Knock Twice que ciertamente componía una gran atmósfera, el estudio Wales Interactive regresa este año con Maia of Sker, un título con el que buscan solventar los fallos del anterior al tiempo que contarnos una estupenda historia de horror cósmico - marítimo. El punto de partida es directo: Estás en el siglo XIX, en un 

In [77]:
author = soup.find('li', {'class': 'art-aut-wr'}).text.strip('\n')
author

'César Otero'

In [82]:
# Author
def meri_author():
    author = 'None'
    
    try:
        author = soup.find('p', {'class': 'art-aut-wr'}).find('a').text.strip('\n')

    except AttributeError:
        pass
    
    try:
        author = soup.find('li', {'class': 'art-aut-wr'}).text.strip('\n')

    except AttributeError:
        pass

    return author

In [83]:
meri_author()

'César Otero'

In [84]:
# Game

game = soup.find('div', {'class': 'ga-h-tl'}).text
game

'Maid of Sker'

In [85]:
# Genre, Company and Platform

infobox = soup.find('ul', {'class': 'li-inl'}).contents
for box in infobox:  
        
    if (type(box.find('span')) == int) or (box.find('span') == None):
        pass
    else:
        #print(box.find('span').text)
 
        if 'Plataforma' in box.find('span').text:
            group = box.find_all('a', {'class': 'rv-inline'})
            platform = [plat.contents[0].strip('\n') for plat in group]
            platform = ' '.join(platform)
            
        if 'Género' in box.find('span').text:
            genre = box.find('span', {'class': 'val'}).text
            
        if 'Editor' in box.find('span').text:
            company = box.find('span', {'class': 'val'}).text

print(platform, genre, company)

PC NSW PS4 XBO 
Aventura                     
Wales Interactive



In [94]:
# Score

def meri_score():
    #Initialize with most common score
    score = ''

    #3 different paths for score in Meristation
    try:
        score = soup.find('span', {'class': 'rv-sc sc-h'}).text

    except AttributeError:
        pass

    try:
        score = soup.find('span', {'class': 'rv-sc sc-m'}).text

    except AttributeError:
        pass

    try:
        score = soup.find('span', {'class': 'rv-sc sc-l'}).text

    except AttributeError:
        pass
   
    return score

In [95]:
meri_score()

'6.8'

## Create columns

In [123]:

def meristation_dict(links):
    reviews_dict = {}
    i = 0

    for link in links:
        try:
            
            #Request content and wait
            review_html = requests.get(link).content

            #Avoid get banned and timeout
            time.sleep(1)

            #Make a Soup and a Newspacer build
            soup = BeautifulSoup(review_html, 'lxml')
            #paper = newspaper.build(link) - Unnecesary

            #Site name - From Newspaper - Unnecesary
            #site = paper.brand

            #URL - From Newspaper - Unnecesary
            #url_link = paper.url

            #Game - From Scraping
            game = soup.find('div', {'class': 'ga-h-tl'}).text

            #Author - From Scraping
            author = 'None'
    
            try:
                author = soup.find('p', {'class': 'art-aut-wr'}).find('a').text.strip('\n')

            except AttributeError:
                pass

            try:
                author = soup.find('li', {'class': 'art-aut-wr'}).text.strip('\n')

            except AttributeError:
                pass

            #Company, Genre & Platform - From Scraping
            genre = 'None'
            company = 'None'
            platform = 'None'

            infobox = soup.find('ul', {'class': 'li-inl'}).contents
            for box in infobox:  

                if (type(box.find('span')) == int) or (box.find('span') == None):
                    pass
                else:
                    #print(box.find('span').text)

                    if 'Plataforma' in box.find('span').text:
                        group = box.find_all('a', {'class': 'rv-inline'})
                        platform = [plat.contents[0].strip('\n') for plat in group]
                        platform = ' '.join(platform).strip('\n')

                    if 'Género' in box.find('span').text:
                        genre = box.find('span', {'class': 'val'}).text.strip('\n')

                    if 'Editor' in box.find('span').text:
                        company = box.find('span', {'class': 'val'}).text.strip('\n')

            #Text & Cleaning - From Scraping
            p_tags = soup.find('div', {'class': 'art-body'}).find_all('p')
            review = [tag.text for tag in p_tags]
            review = ' '.join(review)

            #Score & Clean & Transform - From Scraping
            score = 7

            #3 different paths for score in Meristation
            try:
                score = soup.find('span', {'class': 'rv-sc sc-h'}).text

            except AttributeError:
                pass

            try:
                score = soup.find('span', {'class': 'rv-sc sc-m'}).text

            except AttributeError:
                pass

            try:
                score = soup.find('span', {'class': 'rv-sc sc-l'}).text

            except AttributeError:
                pass

            score = float(score)

            #Add to a dict
            reviews_dict[i] = {'site': 'meristation',
                               'url_link': link,
                               'author': author,
                               'game': game,
                               'company': company,
                               'genre': genre,
                               'platform': platform,
                               'text': review,
                               'score': score}
        
        except AttributeError:
            pass
        
        i = i+1
        if i % 25 == 0:
            print(i, ': ', link)

    return reviews_dict

In [124]:
result_meri = meristation_dict(links)


25 :  https://as.com/meristation/2020/09/13/analisis/1599985057_505947.html
50 :  https://as.com/meristation/2020/08/23/analisis/1598171552_285145.html
75 :  https://as.com/meristation/2020/07/17/analisis/1594966538_257377.html
100 :  https://as.com/meristation/2020/06/15/analisis/1592201158_744925.html
125 :  https://as.com/meristation/2020/05/21/analisis/1590050854_654858.html
150 :  https://as.com/meristation/2020/04/19/analisis/1587311094_874229.html
175 :  https://as.com/meristation/2020/03/24/analisis/1585040211_696453.html
200 :  https://as.com/meristation/2020/02/15/analisis/1581782942_260648.html
225 :  https://as.com/meristation/2020/01/27/analisis/1580134976_794348.html
250 :  https://as.com/meristation/2019/11/28/analisis/1574972538_081406.html
275 :  https://as.com/meristation/2019/11/08/analisis/1573209936_550115.html
300 :  https://as.com/meristation/2019/10/14/analisis/1571056460_855359.html
325 :  https://as.com/meristation/2019/09/25/analisis/1569403270_218618.html
35

In [125]:
print(result_meri[30])

{'site': 'meristation', 'url_link': 'https://as.com/meristation/2020/09/09/analisis/1599637698_672305.html', 'author': 'Sergio C. González\nSergio5Glez', 'game': 'Nexomon: Extinction', 'company': 'PQube', 'genre': 'RPG, Por turnos                    ', 'platform': 'PC PS4 NSW XBO', 'text': 'Son cada vez más los intentos por dar respuesta a Pokémon en la escena independiente, proyectos que nacen con la voluntad de servir como alternativa a la serie principal de la saga capitaneada por Game Freak. Al fin y al cabo, la fórmula “hazte con todos” no pasa de moda y, visto lo visto, parece que la fiebre por capturar, combatir e intercambiar monstruos en aventuras de rol por turnos está lejos de decir adiós. De todos esos intentos, seguramente Nexomon Extinction es la mejor alternativa de los últimos años. Hemos tardado más de 30 horas en completar la aventura del equipo canadiense VEWO Interactive, que ha contado con el apoyo de PQube para su distribución internacional y adaptación a consolas

### Create DataFrame

In [126]:
meristation = pd.DataFrame.from_dict(result_meri, orient='index')

meristation

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
0,meristation,https://as.com/meristation/2020/10/05/analisis...,Carlos Forcada,OkunoKA Madness,Ignition Entertainment,Plataformas,XBO NSW PS4 PC,"\n\n Aunque parece que están ahí casi siempre,...",7.4
1,meristation,https://as.com/meristation/2020/10/03/analisis...,Cristian Ciuraneta,art of rally,Funselektor,Conducción,PC,Los fans de los videojuegos de carreras están ...,7.5
2,meristation,https://as.com/meristation/2020/10/01/analisis...,Sergio C. González\nSergio5Glez,Crash Bandicoot 4: It's About Time,Activision,Plataformas,PS4 XBO,\n\n Crash Bandicoot N. Sane Trilogy fue toda ...,8.3
3,meristation,https://as.com/meristation/2020/09/30/analisis...,David Arroyo,WWE 2K Battlegrounds,2K Games,Acción,NSW STD PS4 XBO PC,Superar una ruptura lleva tiempo. Nunca es fác...,5.7
4,meristation,https://as.com/meristation/2020/09/29/analisis...,César Otero,Maid of Sker,Wales Interactive,Aventura,PC NSW PS4 XBO,"""Un consejo, señor, no se acerque nunca al lag...",6.8
5,meristation,https://as.com/meristation/2020/09/29/analisis...,Azucena Ruíz,Pathfinder: Kingmaker,Deep Silver,"RPG, Acción",PC PS4 XBO,"Pathfinder: Kingmaker empezó, como muchos jueg...",7.5
6,meristation,https://as.com/meristation/2020/09/28/analisis...,Francisco J. Brenlla\nfranchuzas,The Outer Worlds: Peril on Gorgon,Private Division,"Acción, RPG",PC PS4 XBO,El anuncio de que Tim Cain y Leonard Boyarsky ...,7.5
7,meristation,https://as.com/meristation/2020/09/26/analisis...,Nacho Requena\nnachomol,Commandos 2 & Praetorians HD Remaster Double Pack,Kalypso Media,"Estrategia, Tiempo real",PC PS4 XBO,Hubo un tiempo en el que todo lo que tocaba o ...,7.0
8,meristation,https://as.com/meristation/2020/09/26/analisis...,Marta Oller\nmartaaax00,Here Be Dragons,Red Zero Games,"Estrategia, Por turnos",NSW PC IPH IPD AND,Como bien citaba el poeta José Espronceda lo d...,6.5
9,meristation,https://as.com/meristation/2020/09/25/analisis...,Jose Luis López de Garayo,Hades,Supergiant Games,"Aventura, Acción",PC NSW,Supergiant Games sigue un patrón muy claro. Di...,9.3


In [127]:
#meristation.to_csv('../data/meristation_50p.csv', index=False)

In [128]:
len(result_meri)

1441