# Scraping Vandal

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import newspaper
from newspaper import Article
from newspaper import Source
from newspaper import fulltext

## Scraping the article listing page

In [4]:
# Download the raw bytes of the first review-listing page
review_url = 'https://vandal.elespanol.com/analisis/videojuegos/inicio/0'
html = requests.get(review_url).content
# Peek at the first 50 bytes to confirm an HTML document came back
html[:50]

b'<!DOCTYPE html><html lang="es"><head>\n<!--[if lt I'

In [5]:
# Parse the listing page and collect every <div> whose class attribute is
# exactly "caja300 afterclearer" (titles and links are read from these boxes)
soup = BeautifulSoup(html, 'lxml')
articles = soup.find_all('div', class_='caja300 afterclearer')

In [7]:
# Extract titles: the review title is the `title` attribute of the first <a>
# in each listing box. Boxes that contain no <a> at all are skipped instead of
# raising a TypeError on `None['title']`.
titles = [
    anchor['title']
    for anchor in (box.find('a') for box in articles)
    if anchor is not None
]

titles[0:5]

['Budget Cuts - Análisis',
 'The Survivalists - Análisis',
 'Prinny 1 2: Exploded and Reloaded - Análisis',
 'Genshin Impact - Análisis',
 'FIFA 21 - Análisis']

In [8]:
# Extract links: the review URL is the `href` attribute of the first <a>
# found in each listing box
links = [box.find('a')['href'] for box in articles]

links[:4]

['https://vandal.elespanol.com/analisis/ps4/budget-cuts/90876',
 'https://vandal.elespanol.com/analisis/pc/the-survivalists/80200',
 'https://vandal.elespanol.com/analisis/switch/prinny-1-2-exploded-and-reloaded/82812',
 'https://vandal.elespanol.com/analisis/ps4/genshin-impact/75962']

## Page-parsing and link-retrieval function for Vandal

In [2]:
def vandal_link_retrieve(num_pages, page_size=45, timeout=30):
    """Collect review links and titles from Vandal's paginated review listing.

    Parameters
    ----------
    num_pages : int
        Number of listing pages to request.
    page_size : int, optional
        Offset step between consecutive listing pages; page ``n`` is requested
        at offset ``n * page_size``. Defaults to 45, the step this notebook
        has always used.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises,
        so a stalled connection cannot hang the whole crawl. Defaults to 30.

    Returns
    -------
    tuple of (list, list)
        ``(links, titles)``, positionally aligned: ``titles[k]`` belongs to
        ``links[k]``.
    """
    links = []
    titles = []

    for page in range(num_pages):
        url = f"https://vandal.elespanol.com/analisis/videojuegos/inicio/{page * page_size}"

        # Build the soup for this listing page
        html = requests.get(url, timeout=timeout).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('div', {'class': 'caja300 afterclearer'})

        for box in articles:
            anchor = box.find('a')
            if anchor is None:
                # Box without any <a>: nothing to collect
                continue

            href = anchor.get('href')
            title = anchor.get('title')
            if href is None or title is None:
                # Append both values or neither, so the two lists never drift apart
                continue

            links.append(href)
            titles.append(title)

    return links, titles


In [3]:
# Number of listing pages to walk; vandal_link_retrieve requests one URL per page
num_pages = 100

links, titles = vandal_link_retrieve(num_pages)

In [4]:
len(links)

4500

## Scraping Single Review

In [2]:
# Download a single review page to develop the field extractors against.
# NOTE: this rebinds `review_url`, which an earlier cell set to the listing page.
review_url = 'https://vandal.elespanol.com/analisis/ps4/genshin-impact/75962#p-73'
review_html = requests.get(review_url).content
# Peek at the first 50 bytes to confirm an HTML document came back
review_html[:50]

b'<!DOCTYPE html><html lang="es"><head>\n<!--[if lt I'

In [3]:
# Parse the single review page. NOTE: this rebinds `soup`, which an earlier
# cell bound to the parsed listing page.
soup = BeautifulSoup(review_html, 'lxml')

In [4]:
# Every <p> inside the <div class="textart"> element; the review text is built from these
p_tags = soup.find('div', {'class': 'textart'}).find_all('p')

In [5]:
# Text: join the text of every paragraph tag with single spaces, then trim
# leading and trailing whitespace from the result
review = ' '.join(tag.text for tag in p_tags).strip()

In [6]:
# Author: the text of the <span class="reviewer"> element
author = soup.find('span', {'class': 'reviewer'}).text
author

'Ramón Varela'

In [5]:
#Platform

def vandal_platform(soup):
    """Return the platforms of a review page as one space-separated string.

    The platforms are the ``alt`` attributes of every ``<img>`` carrying one
    inside the ``<td class="tablaplataformas">`` cell of ``soup``.

    If that cell is not found, the lookup result has no ``select`` method and
    an AttributeError propagates to the caller.
    """
    cell = soup.find('td', {'class': 'tablaplataformas'})
    icons = cell.select("img[alt]")
    return ' '.join(icon["alt"] for icon in icons)

In [34]:
vandal_platform(soup)

'PS4 PC iPhone Android Switch'

In [6]:
# Genre

def vandal_genre(soup):
    """Return the genres of a review page as one space-separated string.

    The genres are the texts of every ``<a>`` found inside the
    ``<div class="mt1 tcenter t11">`` element of ``soup``.

    If that element is not found, the lookup result has no ``find_all`` method
    and an AttributeError propagates to the caller.
    """
    genre_links = soup.find('div', {'class': 'mt1 tcenter t11'}).find_all('a')
    return ' '.join(link.text for link in genre_links)

In [7]:
#Company

def vandal_company(soup):
    """Return the production company named in a review page's info list.

    Walks the direct children of the ``<ul class="mt03 ulficha">`` element and
    returns the text of the ``<a>`` inside the first child whose ``contents``
    include the label ``'Producción: '``.

    Returns None when the list is absent, when no child carries the label, or
    when the labelled child holds no ``<a>``. Previously a missing list raised
    TypeError and a text-node child or missing anchor raised AttributeError.
    """
    infobox = soup.find('ul', {'class': 'mt03 ulficha'})
    if infobox is None:
        return None

    for box in infobox:
        # Plain text children (e.g. whitespace between tags) expose no
        # `contents`; only element children can carry the label.
        contents = getattr(box, 'contents', None)
        if contents is None or 'Producción: ' not in contents:
            continue

        anchor = box.find('a')
        if anchor is not None:
            return anchor.text

    return None

In [10]:
# Score: the text of the <div class="fichajuego mt03 tleft"> element, converted to a float
score = float(soup.find('div', {'class': 'fichajuego mt03 tleft'}).text)
type(score)

float

## Build one record per review

In [8]:

def vandal_dict(links, titles, delay=1, timeout=30):
    """Scrape each review page in ``links`` into a dict of per-review records.

    Parameters
    ----------
    links : iterable of str
        Review page URLs.
    titles : iterable of str
        Titles paired positionally with ``links`` (pairs are formed with ``zip``).
    delay : float, optional
        Seconds to sleep after each download, to avoid being banned or timed
        out by the site. Defaults to 1.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.
        Defaults to 30.

    Returns
    -------
    dict
        Maps the position of each successfully scraped link to a dict with the
        keys ``site``, ``url_link``, ``author``, ``game``, ``company``,
        ``genre``, ``platform``, ``text`` and ``score``. Positions whose page
        could not be fetched or parsed are absent, so the keys can have gaps.

    Notes
    -----
    A failure on one page is reported with ``print`` and skipped. Network
    errors and TypeErrors are handled the same way as parse errors, so a single
    bad page cannot abort the run and discard everything collected so far.
    """
    reviews_dict = {}

    for i, (link, title) in enumerate(zip(links, titles)):
        try:
            # Request the page, then pause before the next request
            review_html = requests.get(link, timeout=timeout).content
            time.sleep(delay)

            soup = BeautifulSoup(review_html, 'lxml')

            # Author, company, genre and platform scraped from the page
            author = soup.find('span', {'class': 'reviewer'}).text
            company = vandal_company(soup)
            genre = vandal_genre(soup)
            platform = vandal_platform(soup)

            # Review text: all paragraphs of the article body joined and trimmed
            p_tags = soup.find('div', {'class': 'textart'}).find_all('p')
            review = ' '.join(tag.text for tag in p_tags).strip()

            # Score: raises ValueError when the text is not numeric
            score = float(soup.find('div', {'class': 'fichajuego mt03 tleft'}).text)

            reviews_dict[i] = {'site': 'Vandal',
                               'url_link': link,
                               'author': author,
                               'game': title,
                               'company': company,
                               'genre': genre,
                               'platform': platform,
                               'text': review,
                               'score': score}

        except (ValueError, AttributeError, TypeError, requests.RequestException) as e:
            # Best-effort scrape: report which page failed and why, then move on
            print('error', i, link, repr(e))

        # Progress report every 25 processed links
        if (i + 1) % 25 == 0:
            print(i + 1, ': ', link)

    return reviews_dict

In [9]:
# Scrape only the links from position 2225 onward (the slice start is hard-coded here);
# the keys of result_vandal are positions within this slice, not within the full `links` list
result_vandal = vandal_dict(links[2225:], titles[2225:])

25 :  https://vandal.elespanol.com/analisis/ps4/wolfenstein-the-old-blood/29880
50 :  https://vandal.elespanol.com/analisis/ps4/assassins-creed-chronicles-china/26186
75 :  https://vandal.elespanol.com/analisis/ps4/rack-n-ruin/28481
100 :  https://vandal.elespanol.com/analisis/ps3/tokyo-twilight-ghost-hunters/25622
125 :  https://vandal.elespanol.com/analisis/pc/camera-obscura/29494
150 :  https://vandal.elespanol.com/analisis/3ds/the-legend-of-zelda-majoras-mask-3d/26601
175 :  https://vandal.elespanol.com/analisis/iphone/framed/27046
error 175
200 :  https://vandal.elespanol.com/analisis/ps4/lara-croft-and-the-temple-of-osiris/24740
225 :  https://vandal.elespanol.com/analisis/pc/the-sun-and-moon/26662
250 :  https://vandal.elespanol.com/analisis/xbone/halo-the-master-chief-collection/24726
275 :  https://vandal.elespanol.com/analisis/pc/sid-meiers-civilization-beyond-earth/24016
300 :  https://vandal.elespanol.com/analisis/ps4/driveclub/20530
error 312
325 :  https://vandal.elespano

In [10]:
result_vandal[1111]

{'site': 'Vandal',
 'url_link': 'https://vandal.elespanol.com/analisis/pc/simcity/15648',
 'author': 'Jorge Cano',
 'game': 'SimCity - Análisis',
 'company': 'EA Maxis',
 'genre': 'Construir ciudades Estrategia',
 'platform': 'PC Wii',
 'text': 'Han pasado 24 años desde que se lanzara el primer SimCity, un clásico con letras mayúsculas dentro de la historia de los videojuegos. En unos tiempos en los que están de moda los reinicios, remakes y reinterpretaciones de los clásicos, la vuelta de este simulador de construcción de ciudades era más que necesaria, diez años después de SimCity 4. Hemos escrito muchas impresiones sobre él en los últimos meses, todas muy entusiastas, el juego pintaba fenomenal, y su apuesta de base por el multijugador, con ciudades interconectadas que tienen que compartir recursos, nos parecía más que interesante. Sabíamos que exigiría conexión permanente a internet, algo que preveíamos iba a causar cierta polémica, lo que no imaginábamos era que esta característic

### Create DataFrame

In [11]:
# One row per scraped review, indexed by the integer keys of result_vandal.
# vandal_dict stores no key for links that failed, so the index can have gaps
# and its last label can exceed len(vandal) - 1.
vandal = pd.DataFrame.from_dict(result_vandal, orient='index')

vandal

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
0,Vandal,https://vandal.elespanol.com/analisis/pc/sunse...,Ramón Nafria,Sunset - Análisis,Tale of Tales,Aventura,PC,En el exitoso Kickstarter de Sunset (casi 68.0...,8.0
1,Vandal,https://vandal.elespanol.com/analisis/wiiu/swo...,Carlos Leiva,Swords & Soldiers II eShop - Análisis,Ronimo Games,Estrategia en tiempo real Plataformas 2D,Wii U,La estrategia no es un género que tenga demasi...,7.5
2,Vandal,https://vandal.elespanol.com/analisis/pc/uncan...,Juan Rubio,Uncanny Valley - Análisis,Cowardly Creations,Aventura de acción Survival Horror Terror,PC PS4 PSVITA Xbox One Switch,El género del terror parece haber apostado por...,6.4
3,Vandal,https://vandal.elespanol.com/analisis/ps4/life...,Juan Rubio,Life is Strange - Episode 3 - Análisis,Square Enix,Aventura narrativa,PS4 PC Xbox One Xbox 360 PS3,Muchos de los que escribimos sobre videojuegos...,8.0
4,Vandal,https://vandal.elespanol.com/analisis/pc/spark...,Ramón Varela,Sparkle 3 Genesis - Análisis,Forever Entertainment,Acción,PC Switch,En 2006 el estudio thatgamecompany debutó con ...,6.5
...,...,...,...,...,...,...,...,...,...
2270,Vandal,https://vandal.elespanol.com/analisis/psp/phan...,Ramón Varela,Phantasy Star Portable 2 - Análisis,Sega,JRPG Rol,PSP,Sega ha vuelto a apostar por PSP para llevar s...,7.8
2271,Vandal,https://vandal.elespanol.com/analisis/ps3/tumb...,Víctor Moyano,Tumble - Análisis,Sony,PlayStation Move Puzle,PS3,Playstation Move ya está entre nosotros. El es...,7.0
2272,Vandal,https://vandal.elespanol.com/analisis/wii/trac...,Macarena Mey,TrackMania Wii - Análisis,Focus Home,Carreras arcade Velocidad,Wii,Aunque lo habitual cuando pensamos en un géner...,8.5
2273,Vandal,https://vandal.elespanol.com/analisis/pc/alien...,Jorge Cano,Alien Breed 2: Assault - Análisis,Team17,Shooter en tercera persona,PC Xbox 360 PS3,Hace unos diez meses se lanzaba Alien Breed: E...,6.0


In [13]:
#vandal.to_csv('../data/vandal_2225_4482l.csv', index=False)

In [12]:
len(vandal)

2257