# Scraping MERISTATION

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import newspaper
from newspaper import Article
from newspaper import Source
from newspaper import fulltext

## Scraping Articles Site 

In [2]:
review_url = 'https://as.com/meristation/analisis/'
html = requests.get(review_url).content
html[:50]

b'<!DOCTYPE html>\n<html lang="es">\n<head>\n<meta char'

In [3]:
soup = BeautifulSoup(html, 'lxml')
articles = soup.find_all('h2')

In [4]:
#Extraer títulos

titles = []

for a in articles:
    if a.find('a') == None:
        pass
    else:
        titles.append(a.find('a')['title'])
    
titles[0:5]

['OkunoKA Madness, análisis',
 'art of rally, análisis',
 'Crash Bandicoot 4, Análisis. Celebración plataformera',
 'WWE 2K Battlegrounds, análisis. La lucha libre nos vuelve a romper el corazón',
 'Turtle Beach Stealth 700 Gen 2, Análisis']

In [5]:
#Extraer links

links = []

for a in articles:
    if a.find('a') == None:
        pass
    else:
        links.append(a.find('a')['href'])
    
links[0:6]

['https://as.com/meristation/2020/10/05/analisis/1601886314_918896.html',
 'https://as.com/meristation/2020/10/03/analisis/1601701228_240717.html',
 'https://as.com/meristation/2020/10/01/analisis/1601557244_384472.html',
 'https://as.com/meristation/2020/09/30/analisis/1601454855_979194.html',
 'https://as.com/meristation/2020/09/29/mexico/1601416248_335912.html',
 'https://as.com/meristation/2020/09/29/analisis/1601370434_254241.html']

## Page parsing and link retrieving function for Meristation

In [6]:
def meristation_link_retrieve(num_pages):
    links = []
    
    # pages parser
    for i in range(num_pages):
        url = f'https://as.com/meristation/analisis/{281-i}'
        #print(url)

        # building soup    
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('h2')

        # links retrieve
        for a in articles:
            if a.find('a') == None:
                pass
            else:
                links.append(a.find('a')['href'])
        
    #print(links)
    
    #Deleting non review links
    for link in links:
        if 'analisis' not in link:
            links.remove(link)
                
    return links


In [7]:
num_pages = 100

links = meristation_link_retrieve(num_pages)

print(len(links))
print(links[:5])

3000
['https://as.com/meristation/2016/06/22/analisis/1466583840_156341.html', 'https://as.com/meristation/2016/06/21/analisis/1466521200_156273.html', 'https://as.com/meristation/2016/06/21/analisis/1466517180_155958.html', 'https://as.com/meristation/2016/06/20/analisis/1466424780_156286.html', 'https://as.com/meristation/2016/06/12/analisis/1465766220_156029.html']


## Scraping Single Review

In [63]:
review_url = 'https://as.com/meristation/2020/09/29/analisis/1601370434_254241.html'
review_html = requests.get(review_url).content
review_html[:50]

b'<!DOCTYPE html>\n<html lang="es">\n<head>\n<meta char'

In [73]:
soup = BeautifulSoup(review_html, 'lxml')

In [74]:
p_tags = soup.find('div', {'class': 'art-body'}).find_all('p')

In [75]:
review = [tag.text for tag in p_tags]
review = ' '.join(review)

In [76]:
review[0:1000]

'"Un consejo, señor, no se acerque nunca al lago... Y sobre todo, tápese los oídos si oye cantar la voz bajo el agua...", El fantasma de la ópera" (1910), Gastón Leroux En un videojuego, lo mismo que en una película, poco importa si es grande o pequeño, si lo han hecho un estudio de 100 personas con 80 millones de presupuesto, o 5 con apenas unas decenas de miles. Lo que cuenta es su eficacia, su capacidad para lograr el fin último que su género busca. Maid of Sker logra eso en su estupenda primera parte, pero lo arruina en su segunda, en un juego con 2 mitades diferenciadas por sus mecánicas. Pero incluso tal como está, su historia merece ser descubierta. Tras un Don’t Knock Twice que ciertamente componía una gran atmósfera, el estudio Wales Interactive regresa este año con Maia of Sker, un título con el que buscan solventar los fallos del anterior al tiempo que contarnos una estupenda historia de horror cósmico - marítimo. El punto de partida es directo: Estás en el siglo XIX, en un 

In [77]:
author = soup.find('li', {'class': 'art-aut-wr'}).text.strip('\n')
author

'César Otero'

In [82]:
# Author
def meri_author():
    author = 'None'
    
    try:
        author = soup.find('p', {'class': 'art-aut-wr'}).find('a').text.strip('\n')

    except AttributeError:
        pass
    
    try:
        author = soup.find('li', {'class': 'art-aut-wr'}).text.strip('\n')

    except AttributeError:
        pass

    return author

In [83]:
meri_author()

'César Otero'

In [84]:
# Game

game = soup.find('div', {'class': 'ga-h-tl'}).text
game

'Maid of Sker'

In [85]:
# Genre, Company and Platform

infobox = soup.find('ul', {'class': 'li-inl'}).contents
for box in infobox:  
        
    if (type(box.find('span')) == int) or (box.find('span') == None):
        pass
    else:
        #print(box.find('span').text)
 
        if 'Plataforma' in box.find('span').text:
            group = box.find_all('a', {'class': 'rv-inline'})
            platform = [plat.contents[0].strip('\n') for plat in group]
            platform = ' '.join(platform)
            
        if 'Género' in box.find('span').text:
            genre = box.find('span', {'class': 'val'}).text
            
        if 'Editor' in box.find('span').text:
            company = box.find('span', {'class': 'val'}).text

print(platform, genre, company)

PC NSW PS4 XBO 
Aventura                     
Wales Interactive



In [94]:
# Score

def meri_score():
    #Initialize with most common score
    score = ''

    #3 different paths for score in Meristation
    try:
        score = soup.find('span', {'class': 'rv-sc sc-h'}).text

    except AttributeError:
        pass

    try:
        score = soup.find('span', {'class': 'rv-sc sc-m'}).text

    except AttributeError:
        pass

    try:
        score = soup.find('span', {'class': 'rv-sc sc-l'}).text

    except AttributeError:
        pass
   
    return score

In [95]:
meri_score()

'6.8'

## Create columns

In [8]:

def meristation_dict(links):
    reviews_dict = {}
    i = 0

    for link in links:
        try:
            
            #Request content and wait
            review_html = requests.get(link).content

            #Avoid get banned and timeout
            time.sleep(1)

            #Make a Soup and a Newspacer build
            soup = BeautifulSoup(review_html, 'lxml')
            #paper = newspaper.build(link) - Unnecesary

            #Site name - From Newspaper - Unnecesary
            #site = paper.brand

            #URL - From Newspaper - Unnecesary
            #url_link = paper.url

            #Game - From Scraping
            game = soup.find('div', {'class': 'ga-h-tl'}).text.strip('\n')

            #Author - From Scraping
            author = 'None'
    
            try:
                author = soup.find('p', {'class': 'art-aut-wr'}).find('a').text.strip('\n')

            except AttributeError:
                pass

            try:
                author = soup.find('li', {'class': 'art-aut-wr'}).text.strip('\n')

            except AttributeError:
                pass

            #Company, Genre & Platform - From Scraping
            genre = 'None'
            company = 'None'
            platform = 'None'

            infobox = soup.find('ul', {'class': 'li-inl'}).contents
            for box in infobox:  

                if (type(box.find('span')) == int) or (box.find('span') == None):
                    pass
                else:
                    #print(box.find('span').text)

                    if 'Plataforma' in box.find('span').text:
                        group = box.find_all('a', {'class': 'rv-inline'})
                        platform = [plat.contents[0].strip('\n') for plat in group]
                        platform = ' '.join(platform).strip('\n')

                    if 'Género' in box.find('span').text:
                        genre = box.find('span', {'class': 'val'}).text.strip('\n')

                    if 'Editor' in box.find('span').text:
                        company = box.find('span', {'class': 'val'}).text.strip('\n')

            #Text & Cleaning - From Scraping
            p_tags = soup.find('div', {'class': 'art-body'}).find_all('p')
            review = [tag.text for tag in p_tags]
            review = ' '.join(review)

            #Score & Clean & Transform - From Scraping
            score = 7

            #3 different paths for score in Meristation
            try:
                score = soup.find('span', {'class': 'rv-sc sc-h'}).text

            except AttributeError:
                pass

            try:
                score = soup.find('span', {'class': 'rv-sc sc-m'}).text

            except AttributeError:
                pass

            try:
                score = soup.find('span', {'class': 'rv-sc sc-l'}).text

            except AttributeError:
                pass

            score = float(score)

            #Add to a dict
            reviews_dict[i] = {'site': 'meristation',
                               'url_link': link,
                               'author': author,
                               'game': game,
                               'company': company,
                               'genre': genre,
                               'platform': platform,
                               'text': review,
                               'score': score}
        
        except AttributeError:
            pass
        
        i = i+1
        if i % 25 == 0:
            print(i, ': ', link)

    return reviews_dict

In [9]:
#result_meri = meristation_dict(links)


25 :  https://as.com/meristation/2016/04/26/analisis/1461657960_154988.html
50 :  https://as.com/meristation/2016/03/18/analisis/1458284400_154088.html
75 :  https://as.com/meristation/2016/02/12/analisis/1455269760_153184.html
100 :  https://as.com/meristation/2015/12/02/analisis/1449053460_151404.html
125 :  https://as.com/meristation/2015/10/30/analisis/1446188400_150266.html
150 :  https://as.com/meristation/2015/09/29/analisis/1443531600_149210.html
175 :  https://as.com/meristation/2015/08/26/analisis/1440597600_148419.html
200 :  https://as.com/meristation/2015/07/18/analisis/1437170400_147229.html
225 :  https://as.com/meristation/2015/06/05/analisis/1433527200_145768.html
250 :  https://as.com/meristation/2015/04/15/analisis/1429082340_144176.html
275 :  https://as.com/meristation/2015/03/17/analisis/1426579200_143109.html
300 :  https://as.com/meristation/2015/02/24/analisis/1424790000_142283.html
325 :  https://as.com/meristation/2015/02/02/analisis/1422860400_141307.html
35

In [11]:
print(result_meri[2850])

{'site': 'meristation', 'url_link': 'https://as.com/meristation/2008/07/23/analisis/1216792800_018610.html', 'author': 'Salva Fernàndez', 'game': 'Arkanoid DS', 'company': 'Square Enix', 'genre': 'Acción, Arcade                    ', 'platform': 'DS', 'text': "Hace unos 22 años, Taito creó uno de los juegos insignia del mundo de los videojuegos: Arkanoid.\xa0 El desarrollo era bastante sencillo. Una plataforma controlada por el jugador debía hacer rebotar una pelota con dos funciones: evitar que ésta saliera del mapeado e intentar vaciar el nivel de bloques molestos.  Tan sencillo como adictivo, Arkanoid basaba su tremenda jugabilidad en el diseño de niveles,\xa0 su dificultad, y la astucia a la hora de decidir que potenciadores se utilizaban en cada momento.\n\nAhora, acabando ya el 2008, sale la enésima revisión de uno de los juegos más manidos en las recreativas de la mano de Square Enix. Nintendo DS recibe la última adaptación de este clásico con más de 130 fases en el modo clásico

### Create DataFrame

In [12]:
meristation = pd.DataFrame.from_dict(result_meri, orient='index')

meristation

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
0,meristation,https://as.com/meristation/2016/06/22/analisis...,,Mario y Sonic en los Juegos Olímpicos: Río 2016,,Deportes,WiiU 3DS,Para una generación que crecimos metiendo mone...,7.5
1,meristation,https://as.com/meristation/2016/06/21/analisis...,Fran García,Deadlight: Director's Cut,,"Acción, Aventura, Plataformas, Survival Horror...",XBO PC PS4,Habitualmente los videojuegos en los que el zo...,8.0
2,meristation,https://as.com/meristation/2016/06/21/analisis...,Francisco Alberto Serrano,The Banner Saga 2,,"Strategy, RPG, Turn Based, Tactical ...",PS4 PC OSX XBO,Banner Saga fue uno de los grandes triunfos na...,8.5
3,meristation,https://as.com/meristation/2016/06/20/analisis...,Salva Fernàndez,Mighty No. 9,,"Acción, Plataformas",XBO 360 PC PS4 PSV 3DS PS3 WiiU OSX,Mighty no.9 apuntaba a ser uno de los grandes...,5.0
4,meristation,https://as.com/meristation/2016/06/12/analisis...,Joaquín Relaño,Guilty Gear Xrd -REVELATOR-,Arc System Works,Acción,PS3 PS4 PC,Capcom sigue pidiendo perdón por el desastre q...,9.5
...,...,...,...,...,...,...,...,...,...
2995,meristation,https://as.com/meristation/2008/03/26/analisis...,Pablo González,Warriors Orochi,,Acción,360 PC,Koei supo mover ficha cuando decidió que no po...,7.0
2996,meristation,https://as.com/meristation/2008/03/26/analisis...,Ramón Méndez,El Universo en Guerra: Asalto a la Tierra,SEGA,"Strategy, Real-Time",PC 360,"A pesar de que, históricamente, la estrategia ...",8.0
2997,meristation,https://as.com/meristation/2008/03/24/analisis...,,Bomberman Land Touch 2,,"Acción, Arcade",DS,Cuando tenemos delante un juego con un dos al ...,6.0
2998,meristation,https://as.com/meristation/2008/03/24/analisis...,Javi Andrés,Harvest Moon Magical Melody,,RPG,Wii,"Abrimos la caja, metemos el juego y nos prepa...",7.0


In [13]:
#meristation.to_csv('../data/meristation_51_150p.csv', index=False)

In [14]:
len(result_meri)

2986