# Scraping Vandal

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import newspaper
from newspaper import Article
from newspaper import Source
from newspaper import fulltext

## Scraping the Article Listing Page

In [4]:
# Download the first page of the review listing (offset 0) and peek at the raw bytes.
review_url = 'https://vandal.elespanol.com/analisis/videojuegos/inicio/0'
html = requests.get(url=review_url).content
html[0:50]

b'<!DOCTYPE html><html lang="es"><head>\n<!--[if lt I'

In [5]:
# Parse the listing page and collect the <div> containers that wrap each review entry.
soup = BeautifulSoup(html, features='lxml')
articles = soup.find_all(name='div', attrs={'class': 'caja300 afterclearer'})

In [7]:
# Extract titles

# A container without an <a> tag would make `a.find('a')` return None and the
# subscript fail with TypeError, so such containers are skipped.
titles = [
    anchor['title']
    for anchor in (a.find('a') for a in articles)
    if anchor is not None
]

titles[0:5]

['Budget Cuts - Análisis',
 'The Survivalists - Análisis',
 'Prinny 1 2: Exploded and Reloaded - Análisis',
 'Genshin Impact - Análisis',
 'FIFA 21 - Análisis']

In [8]:
# Extract links

# A container without an <a> tag would make `a.find('a')` return None and the
# subscript fail with TypeError, so such containers are skipped.
links = [
    anchor['href']
    for anchor in (a.find('a') for a in articles)
    if anchor is not None
]

links[0:4]

['https://vandal.elespanol.com/analisis/ps4/budget-cuts/90876',
 'https://vandal.elespanol.com/analisis/pc/the-survivalists/80200',
 'https://vandal.elespanol.com/analisis/switch/prinny-1-2-exploded-and-reloaded/82812',
 'https://vandal.elespanol.com/analisis/ps4/genshin-impact/75962']

## Page parsing and link retrieval function for Vandal

In [37]:
def vandal_link_retrieve(num_pages, page_size=45, timeout=30):
    """Collect review links and titles from Vandal's paginated review listing.

    Parameters
    ----------
    num_pages : int
        Number of listing pages to fetch, starting with the first one.
    page_size : int, optional
        Step added to the URL offset for each successive page. The listing URL
        ends in an offset rather than a page number; 45 is the step this
        notebook has always used.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a stalled
        connection would block the crawl indefinitely.

    Returns
    -------
    tuple of (list, list)
        ``(links, titles)``, index-aligned: ``titles[k]`` is the ``title``
        attribute of the same anchor whose ``href`` is ``links[k]``.
        No filtering of non-review links is performed.
    """
    links = []
    titles = []

    for page in range(num_pages):
        url = f"https://vandal.elespanol.com/analisis/videojuegos/inicio/{page * page_size}"

        # Build the soup for this listing page.
        html = requests.get(url, timeout=timeout).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('div', {'class': 'caja300 afterclearer'})

        for article in articles:
            anchor = article.find('a')
            if anchor is None:
                # Container without a link: nothing to collect.
                continue
            # Read both attributes before appending so the two lists can
            # never get out of step with each other.
            href, title = anchor['href'], anchor['title']
            links.append(href)
            titles.append(title)

    return links, titles


In [38]:
# Crawl the first 100 listing pages and keep the links and titles they expose.
num_pages = 100

links, titles = vandal_link_retrieve(num_pages=num_pages)

In [39]:
# Sanity check: how many links the crawl collected.
len(links)

4500

## Scraping Single Review

In [2]:
# Download one review page to prototype the field extraction on.
review_url = 'https://vandal.elespanol.com/analisis/ps4/genshin-impact/75962#p-73'
review_html = requests.get(url=review_url).content
review_html[0:50]

b'<!DOCTYPE html><html lang="es"><head>\n<!--[if lt I'

In [3]:
# Parse the downloaded review page with the lxml parser.
soup = BeautifulSoup(review_html, 'lxml')

In [4]:
# Paragraphs of the review body live inside the <div class="textart"> element.
p_tags = soup.find(name='div', attrs={'class': 'textart'}).find_all(name='p')

In [5]:
# Text: join every paragraph with a single space and trim the ends.
review = ' '.join(tag.text for tag in p_tags).strip()

In [6]:
# Author: text of the <span class="reviewer"> element.
author = soup.find(name='span', attrs={'class': 'reviewer'}).text
author

'Ramón Varela'

In [33]:
#Platform

def vandal_platform(soup):
    """Return the platforms listed for a review as one space-separated string.

    The names are the ``alt`` texts of the ``<img>`` tags found inside the
    ``<td class="tablaplataformas">`` cell of the parsed page ``soup``.
    A page without that cell raises ``AttributeError``.
    """
    cell = soup.find('td', {'class': 'tablaplataformas'})

    names = []
    for icon in cell.select("img[alt]"):
        names.append(icon["alt"])

    return ' '.join(names)

In [34]:
# Try the helper on the currently parsed review page.
vandal_platform(soup)

'PS4 PC iPhone Android Switch'

In [41]:
# Genre

def vandal_genre(soup):
    """Return the genres of a review as one space-separated string.

    The genres are the texts of the ``<a>`` tags found inside the
    ``<div class="mt1 tcenter t11">`` element of the parsed page ``soup``.
    A page without that element raises ``AttributeError``.
    """
    container = soup.find('div', {'class': 'mt1 tcenter t11'})
    return ' '.join(anchor.text for anchor in container.find_all('a'))

In [45]:
#Company

def vandal_company(soup):
    """Return the company listed under 'Producción: ' in the review's info box.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed review page.

    Returns
    -------
    str or None
        Text of the first ``<a>`` inside the info-box entry whose contents
        include the label ``'Producción: '``. ``None`` when the page has no
        ``<ul class="mt03 ulficha">`` info box, no such entry, or the entry
        holds no link.
    """
    infobox = soup.find('ul', {'class': 'mt03 ulficha'})
    if infobox is None:
        # Iterating None would raise TypeError, which the scraping loop's
        # error handling does not expect from this helper.
        return None

    for box in infobox:
        # Children of the list can be bare text nodes (e.g. whitespace between
        # items); those expose no `.contents` and cannot be the entry we want.
        contents = getattr(box, 'contents', None)
        if not contents or 'Producción: ' not in contents:
            continue

        anchor = box.find('a')
        return anchor.text if anchor is not None else None

    return None

In [10]:
# Score: read the text of the score box and convert it to a float.
score = float(soup.find('div', attrs={'class': 'fichajuego mt03 tleft'}).text)
type(score)

float

## Create columns

In [50]:

def vandal_dict(links, titles, delay=1, timeout=30):
    """Scrape every review in ``links`` and return the extracted fields.

    Parameters
    ----------
    links : sequence of str
        Review page URLs.
    titles : sequence of str
        Title for each link, index-aligned with ``links``.
    delay : float, optional
        Seconds to pause after each request so the site is not hammered.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on that link.

    Returns
    -------
    dict
        Maps the position of each successfully scraped link (0-based) to a
        dict with the keys ``site``, ``url_link``, ``author``, ``game``,
        ``company``, ``genre``, ``platform``, ``text`` and ``score``.
        Links that fail to download or parse are reported on stdout and
        left out, so the keys may have gaps.
    """
    reviews_dict = {}

    for i, (link, title) in enumerate(zip(links, titles)):
        try:
            review_html = requests.get(link, timeout=timeout).content

            # Throttle requests to avoid getting banned.
            if delay:
                time.sleep(delay)

            soup = BeautifulSoup(review_html, 'lxml')

            author = soup.find('span', {'class': 'reviewer'}).text
            company = vandal_company(soup)
            genre = vandal_genre(soup)
            platform = vandal_platform(soup)

            # Review text: all paragraphs of the article body, space-joined.
            p_tags = soup.find('div', {'class': 'textart'}).find_all('p')
            review = ' '.join(tag.text for tag in p_tags).strip()

            # Score: raises ValueError when the box does not hold a number.
            score = float(soup.find('div', {'class': 'fichajuego mt03 tleft'}).text)

            reviews_dict[i] = {'site': 'Vandal',
                               'url_link': link,
                               'author': author,
                               'game': title,
                               'company': company,
                               'genre': genre,
                               'platform': platform,
                               'text': review,
                               'score': score}

        except (ValueError, AttributeError, TypeError, requests.RequestException) as e:
            # One bad page (missing element, non-numeric score, network error)
            # must not abort the whole run and discard what was collected.
            print('error', i, link, repr(e))

        # Progress report every 25 links processed.
        if (i + 1) % 25 == 0:
            print(i + 1, ': ', link)

    return reviews_dict

In [51]:
# Scrape only the first 2225 links (and their matching titles) rather than the full list.
result_vandal = vandal_dict(links[:2225], titles[:2225])

25 :  https://vandal.elespanol.com/analisis/ps4/efootball-pes-2021-season-update/87843
50 :  https://vandal.elespanol.com/analisis/pc/wasteland-3/42461
75 :  https://vandal.elespanol.com/analisis/pc/skater-xl/68625
100 :  https://vandal.elespanol.com/analisis/switch/story-of-seasons-friends-of-mineral-town/75081
125 :  https://vandal.elespanol.com/analisis/xbone/stellaris-console-edition/64531
150 :  https://vandal.elespanol.com/analisis/ps4/gorn/85496
175 :  https://vandal.elespanol.com/analisis/ps4/daymare-1998/68843
200 :  https://vandal.elespanol.com/analisis/ps4/resident-evil-3-remake/80190
225 :  https://vandal.elespanol.com/analisis/pc/hayfever/79193
250 :  https://vandal.elespanol.com/analisis/switch/kunai/81251
275 :  https://vandal.elespanol.com/analisis/xbone/weakless/76231
error 278
300 :  https://vandal.elespanol.com/analisis/pc/bug-fables/74021
325 :  https://vandal.elespanol.com/analisis/switch/pokemon-espada-y-escudo/61486
350 :  https://vandal.elespanol.com/analisis/pc

In [52]:
# Spot-check one scraped record; raises KeyError if that position was skipped because of a scraping error.
result_vandal[1111]

{'site': 'Vandal',
 'url_link': 'https://vandal.elespanol.com/analisis/ps4/just-dance-2018/49136',
 'author': 'Sara Borondo',
 'game': 'Just Dance 2018 - Análisis',
 'company': 'Ubisoft',
 'genre': 'Bailar Musical',
 'platform': 'PS4 Xbox One Xbox 360 PS3 Switch Wii U Wii',
 'text': 'Sacar durante ocho años una entrega anual de un videojuego es agotador, como bien saben los productores de algunas de las grandes sagas de la última década. Just Dance lleva saliendo desde 2009 y siempre logra ocupar puestos altos en las listas de ventas anuales. Aquel primer año el juego salió para Wii con cuatro pelucas para dejar claro su carácter fiestero y se alzó como el juego sorpresa de las navidades; tres millones de personas lo compraron y en España -pese a salir a la venta en noviembre- se convirtió en el quinto juego de Ubisoft más vendido aquel año. Desde entonces no se ha apeado de la lista de los 20 más vendidos cada año y se ha mantenido fiel a Wii. Pocos juegos se permiten tal cantidad de 

### Create DataFrame

In [53]:
# One row per scraped review; the dict keys become the row index.
vandal = pd.DataFrame.from_dict(data=result_vandal, orient='index')

vandal

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
0,Vandal,https://vandal.elespanol.com/analisis/ps4/budg...,Juan Rubio,Budget Cuts - Análisis,Neat Corporation,Acción Realidad Virtual,PS4 PC,Sabemos que empezamos a sonar como un disco ra...,8.0
1,Vandal,https://vandal.elespanol.com/analisis/pc/the-s...,Manu Delgado,The Survivalists - Análisis,Team17 Digital Ltd,Construcción Supervivencia,PC Xbox One Switch PS4,Desde su lanzamiento en 2015 la franquicia The...,8.0
2,Vandal,https://vandal.elespanol.com/analisis/switch/p...,Sergio Martín,Prinny 1 2: Exploded and Reloaded - Análisis,NIS America,Hack and Slash Plataformas 2D,Switch,NIS es una de esas compañías que entre sus num...,8.2
3,Vandal,https://vandal.elespanol.com/analisis/ps4/gens...,Ramón Varela,Genshin Impact - Análisis,Mihoyo,JRPG Rol,PS4 PC iPhone Android Switch,Probablemente el día que las desarrolladoras c...,8.2
4,Vandal,https://vandal.elespanol.com/analisis/ps4/fifa...,Saúl González,FIFA 21 - Análisis,Electronic Arts,Deportes Fútbol,PS4 Xbox One PC Switch Xbox Series X PS5,"Un año más tenemos con nosotros, fiel a su cit...",8.5
...,...,...,...,...,...,...,...,...,...
2220,Vandal,https://vandal.elespanol.com/analisis/wiiu/spl...,Carlos Leiva,Splatoon - Análisis,Nintendo,Shooter en tercera persona Shooter multijugador,Wii U,Si hay alguien que todavía dude de la capacida...,8.5
2221,Vandal,https://vandal.elespanol.com/analisis/ps4/game...,Carlos Leiva,Game of Thrones: A Telltale Games Series - Epi...,Telltale Games,Aventura Gráfica Aventura narrativa,PS4 PC PS3 Xbox One Xbox 360,Esta semana los aficionados a Juego de Tronos ...,7.0
2222,Vandal,https://vandal.elespanol.com/analisis/pc/magne...,Ramón Varela,Magnetic: Cage Closed - Análisis,Gambitious,Plataformas de puzles,PC Xbox One PS4,Nos gustaría hablar de Magnetic: Cage Closed s...,7.0
2223,Vandal,https://vandal.elespanol.com/analisis/3ds/3d-t...,Ramón Nafria,3D Thunder Blade eShop - Análisis,Sega,Shoot'em up Shooter,Nintendo 3DS,Naoki Horii (M2) y Yosuke Okunari (SEGA) tiene...,7.5


In [54]:
#vandal.to_csv('../data/vandal_2225l.csv', index=False)

In [55]:
# Number of reviews kept; can be lower than the number of links requested when some pages failed to scrape.
len(vandal)

2214