# Scraping GAMEREACTOR

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import newspaper
from newspaper import Article
from newspaper import Source
from newspaper import fulltext

## Scraping Articles Site 

In [2]:
url = 'https://www.gamereactor.es/analisis/'
html = requests.get(url).content
html[:500]

b' <!DOCTYPE html>\n<html lang="es">\n<head>\n\n\n<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({\'gtm.start\':\n\tnew Date().getTime(),event:\'gtm.js\'});var f=d.getElementsByTagName(s)[0],\n\tj=d.createElement(s),dl=l!=\'dataLayer\'?\'&l=\'+l:\'\';j.async=true;j.src=\n\t\'https://www.googletagmanager.com/gtm.js?id=\'+i+dl;f.parentNode.insertBefore(j,f);\n\t})(window,document,\'script\',\'dataLayer\',\'GTM-P9ZBFRD\');</script>\n\n\n<meta charset="utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1.0,'

In [3]:
soup = BeautifulSoup(html, 'lxml')
articles = soup.find('section', {'id':'textlist'}).find_all('article')

In [4]:
#Extraer títulos

titles = [article.find('h3').text for article in articles]
    
titles[0:3]

['Squad',
 'Super Mario Bros. 35 - Battle Royale',
 "Crash Bandicoot 4: It's About Time"]

In [5]:
#Extraer links

links = [f"https://www.gamereactor.es{article.find_all('a')[1]['href']}" for article in articles]

print(links[0:5])


['https://www.gamereactor.es/squad-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/super-mario-bros-35-battle-royale/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/crash-bandicoot-4-its-about-time-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/star-wars-squadrons-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/art-of-rally-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699']


In [6]:
#Not totally useful, discarded

authors = []
for article in articles:
    full_authors = article.find_all('a', {'class': 'username'})
    for author in full_authors:
        authors.append(author.text)
    
authors[0:3]

['Mike Holmes', 'Sergio Figueroa', 'Eirik Hyldbakk Furu']

## Page parsing and link retrieving function for Gamereactor

In [15]:
def gamereactor_link_retrieve(num_pages):
    links = []
    titles = []
    
    # pages parser
    for i in range(num_pages):
        url = f'https://www.gamereactor.es/analisis/?page={i+1}'

        # building soup    
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find('section', {'id':'textlist'}).find_all('article')
    

        # links retrieve
        for article in articles:
            links.append(f"https://www.gamereactor.es{article.find_all('a')[1]['href']}")
            titles.append(article.find('h3').text)
        
    #print(links)           
    return links, titles
    


In [57]:
num_pages = 25

links, titles = gamereactor_link_retrieve(num_pages)

In [74]:
print(links[100], titles[1000])

https://www.gamereactor.es/saints-row-the-third-remastered-el-remaster-del-relanzamiento/?sid=bee34d403d368deaf73a574120523360 Dark Souls III: Ashes of Ariandel


## Scraping Single Review

In [42]:
review_url = 'https://www.gamereactor.es/star-wars-squadrons-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699'
review_html = requests.get(review_url).content
review_html[:50]

b' <!DOCTYPE html>\n<html lang="es">\n<head>\n\n\n<script'

In [43]:
soup = BeautifulSoup(review_html, 'lxml')

In [44]:
#Text + cleaning
article = soup.find('div', {'class': 'breadtext'}).find('div')
p_tags = article.find_all('p')
review = [tag.text for tag in p_tags]
review = ' '.join(review)

In [45]:
review[0:50]

'Squadrons es una nueva esperanza de EA para sacarl'

In [46]:
# Author

author = soup.find('li', {'class': 'publishAuthor bullet'}).text
                   
author

'Mike Holmes'

In [49]:
# Game

News_article = Article(review_url)
News_article.download()
News_article.parse()
News_article.title


'[ANÁLISIS] Star Wars: Squadrons'

In [51]:
# Genre, Company and Platform

infobox = soup.find_all('ul', {'class': 'infobox'})[0].contents

for box in infobox:
    
    if 'Probado en:' in box.contents[0].text or 'Plataforma:' in box.contents[0].text:
        platform = box.contents[1]
        
    if 'Género:' in box.contents[0].text:
        genre = box.find('a').text
        
    if 'Editor:' in box.contents[0].text:
        company = box.find('a').text

print(platform, genre, company)        


 PC, PS4, Xbox One Acción Electronic Arts


In [137]:
# Score

score = soup.find('div', {'class': 'bigScoreWrapper'}).find('img')['alt']
score = float(score)

In [138]:
score

8.0

## Create columns

In [62]:

def gamereactor_dict(links, titles):
    reviews_dict = {}
    i = 0
    

    for link, title in zip(links, titles):
        try:
            #Request content and wait
            review_html = requests.get(link).content

            #Avoid get banned and timeout
            time.sleep(2)

            #Make a Soup and a Newspaper paper & build
            soup = BeautifulSoup(review_html, 'lxml')
            #News_article = Article(link) - Unnecesary
            #News_article.download() - Unnecesary
            #News_article.parse() - Unnecesary
            #paper = newspaper.build(link) - Unnecesary

            #Site name - From Newspaper
            #site = paper.brand - Unnecesary

            #URL - From Newspaper
            #url_link = paper.url - Unnecesary

            #Author - From Scraping
            author = author = soup.find('li', {'class': 'publishAuthor bullet'}).text

            #Game - From Scraping
            #game = News_article.title
            game = title

            #Company, Genre & Platform - From Scraping

            genre = 'None'
            company = 'None'
            platform = 'None'

            infobox = soup.find_all('ul', {'class': 'infobox'})[0].contents
            for box in infobox:

                if 'Probado en:' in box.contents[0].text or 'Plataforma:' in box.contents[0].text:
                    platform = box.contents[1]

                if 'Género:' in box.contents[0].text:
                    genre = box.find('a').text

                if 'Editor:' in box.contents[0].text:
                    company = box.find('a').text


            #Text & Cleaning - From Scraping
            article = soup.find('div', {'class': 'breadtext'}).find('div')
            p_tags = article.find_all('p')
            review = [tag.text for tag in p_tags]
            review = ' '.join(review)

            #Score & Clean & Transform - From Scraping
            score = soup.find('div', {'class': 'bigScoreWrapper'}).find('img')['alt']
            score = float(score)

            #Add to a dict
            reviews_dict[i] = {'site': 'Gamereactor',
                               'url_link': link,
                               'author': author,
                               'game': game,
                               'company': company,
                               'genre': genre,
                               'platform': platform,
                               'text': review,
                               'score': score}

        except AttributeError:
            pass
        
        i = i+1
        if i % 25 == 0:
            print(i, ':', link)


    return reviews_dict

In [71]:
result = gamereactor_dict(links[500:1249], titles[500:1249])


25 : https://www.gamereactor.es/astro-bot-rescue-mission-analisis/?sid=bbd80d880e1bba0065ee03c3d69bc476
50 : https://www.gamereactor.es/two-point-hospital-analisis/?sid=bbd80d880e1bba0065ee03c3d69bc476
75 : https://www.gamereactor.es/this-is-the-police-2-analisis/?sid=2a74333ba3dabf428ac6acd24b1ccfe7
100 : https://www.gamereactor.es/wolfenstein-ii-the-new-colossus-para-nintendo-switch/?sid=2a74333ba3dabf428ac6acd24b1ccfe7
125 : https://www.gamereactor.es/dark-souls-remastered-analisis/?sid=d81cf114c16851b1ee8b4dc6149daff9
150 : https://www.gamereactor.es/doki-doki-literature-club-analisis/?sid=d81cf114c16851b1ee8b4dc6149daff9
175 : https://www.gamereactor.es/surviving-mars-analisis/?sid=24fa53fc79d98641b6158eb25f56137f
200 : https://www.gamereactor.es/strikers-edge-analisis/?sid=24fa53fc79d98641b6158eb25f56137f
225 : https://www.gamereactor.es/the-inpatient-analisis/?sid=f3727d896dd60aec8f1d71529ca1f1f3
250 : https://www.gamereactor.es/xenoblade-chronicles-2-analisis/?sid=f3727d896dd60

### Create DataFrame

In [72]:
gamereactor = pd.DataFrame.from_dict(result, orient='index')

gamereactor

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
0,Gamereactor,https://www.gamereactor.es/rimworld-analisis/?...,Mike Holmes,RimWorld,Ludeon Studios,Estrategia,PC,Puede llegar a ser difícil saber cuándo has vi...,10.0
1,Gamereactor,https://www.gamereactor.es/castlevania-requiem...,Roy Woodhouse,Castlevania Requiem: Symphony of the Night & R...,Konami,Acción,PS4,Coincidiendo con el estreno de la segunda temp...,7.0
2,Gamereactor,https://www.gamereactor.es/diablo-3-para-ninte...,Sergio Figueroa,Diablo 3 para Nintendo Switch,Blizzard Entertainment,RPG,Nintendo Switch,Como una niña con zapatos nuevos se ve a Ninte...,9.0
3,Gamereactor,https://www.gamereactor.es/reigns-juego-de-tro...,Bengt Lemne,Reigns: Juego de Tronos,Devolver Digital,Estrategia,iOS,Reigns fue un juego de toma de decisiones trem...,8.0
4,Gamereactor,https://www.gamereactor.es/call-of-cthulhu-ana...,Bengt Lemne,Call of Cthulhu,Focus Home Interactive,Aventura,"Xbox One, PS4, PC, Nintendo Switch",Han sido demasiados años sin tener buenos refe...,7.0
5,Gamereactor,https://www.gamereactor.es/nickelodeon-kart-ra...,Sam Bishop,Nickelodeon Kart Racers,GameMill,Carreras,"PS4, Nintendo Switch, Xbox One",Nickelodeon es una compañía que se ha ganado l...,3.0
6,Gamereactor,https://www.gamereactor.es/do-not-feed-the-mon...,Juan A. Fonseca,Do Not Feed the Monkeys,BadLand Games Publishing,Aventura,PC,"Vas al zoo, ves a los animales haciendo sus vi...",8.0
7,Gamereactor,https://www.gamereactor.es/red-dead-redemption...,Magnus Groth-Andersen,Red Dead Redemption 2,Rockstar,Acción,"PS4, Xbox One","De vez en cuando, nuestra percepción de la nar...",10.0
8,Gamereactor,https://www.gamereactor.es/thronebreaker-the-w...,Lisa Dahlgren,Thronebreaker: The Witcher Tales,CD Projekt Red,RPG,"PC, PS4, Xbox One","El estudio CD Projekt Red, el cerebro detrás d...",9.0
9,Gamereactor,https://www.gamereactor.es/warriors-orochi-4-a...,Sam Bishop,Warriors Orochi 4,Koei Tecmo,Acción,"PS4, Nintendo Switch, PC, Xbox One",Se puede decir con seguridad que Dynasty Warri...,6.0


In [73]:
#gamereactor.to_csv('../data/gamereactor_500_1249l.csv', index=False)