## Scraping GAMEREACTOR

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
import newspaper
from newspaper import Article
from newspaper import Source
from newspaper import fulltext

## Scraping Articles Site 

In [2]:
# Download the review listing page. The timeout keeps a stalled connection
# from hanging the kernel forever, and raise_for_status() turns an HTTP error
# response into an exception instead of letting an error page be parsed.
url = 'https://www.gamereactor.es/analisis/'
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
html[:500]

b' <!DOCTYPE html>\n<html lang="es">\n<head>\n\n\n<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({\'gtm.start\':\n\tnew Date().getTime(),event:\'gtm.js\'});var f=d.getElementsByTagName(s)[0],\n\tj=d.createElement(s),dl=l!=\'dataLayer\'?\'&l=\'+l:\'\';j.async=true;j.src=\n\t\'https://www.googletagmanager.com/gtm.js?id=\'+i+dl;f.parentNode.insertBefore(j,f);\n\t})(window,document,\'script\',\'dataLayer\',\'GTM-P9ZBFRD\');</script>\n\n\n<meta charset="utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1.0,'

In [3]:
# Parse the downloaded listing and collect every <article> element found
# inside the <section id="textlist"> container.
soup = BeautifulSoup(html, 'lxml')
textlist_section = soup.find('section', attrs={'id': 'textlist'})
articles = textlist_section.find_all('article')

In [4]:
# Extract titles: each listing entry keeps its title in an <h3> heading.
titles = [entry.find('h3').get_text() for entry in articles]

titles[:3]

['Squad',
 'Super Mario Bros. 35 - Battle Royale',
 "Crash Bandicoot 4: It's About Time"]

In [5]:
# Extract links: the second <a> of every listing entry carries the review's
# site-relative URL, which is prefixed with the site root.
links = []
for entry in articles:
    anchors = entry.find_all('a')
    links.append(f"https://www.gamereactor.es{anchors[1]['href']}")

print(links[:5])


['https://www.gamereactor.es/squad-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/super-mario-bros-35-battle-royale/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/crash-bandicoot-4-its-about-time-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/star-wars-squadrons-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699', 'https://www.gamereactor.es/art-of-rally-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699']


In [6]:
# Not totally useful, discarded: author names are the text of every
# <a class="username"> link found inside the listing entries.
authors = [
    author_link.text
    for entry in articles
    for author_link in entry.find_all('a', {'class': 'username'})
]

authors[:3]

['Mike Holmes', 'Sergio Figueroa', 'Eirik Hyldbakk Furu']

## Page parsing and link retrieving function for Gamereactor

In [2]:
def gamereactor_link_retrieve(num_pages, start_page=26, timeout=30):
    """Collect review links and titles from consecutive Gamereactor listing pages.

    Parameters
    ----------
    num_pages : int
        How many consecutive listing pages to read.
    start_page : int, optional
        Value of the ``page`` query parameter for the first page read.
        Defaults to 26, the offset that used to be hard-coded.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up,
        so a stalled connection cannot block the crawl forever.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(links, titles)``; both lists are index-aligned, with one entry
        per article found on the listing pages.
    """
    links = []
    titles = []

    # pages parser
    for page in range(start_page, start_page + num_pages):
        url = f'https://www.gamereactor.es/analisis/?page={page}'

        # building soup
        html = requests.get(url, timeout=timeout).content
        soup = BeautifulSoup(html, 'lxml')

        listing = soup.find('section', {'id': 'textlist'})
        if listing is None:
            # A page without the listing section has nothing to collect;
            # report it and keep going instead of failing on None.
            print('no article listing found, skipped:', url)
            continue

        # links retrieve
        for article in listing.find_all('article'):
            # Resolve both values before appending so the two lists can
            # never get out of step.
            link = f"https://www.gamereactor.es{article.find_all('a')[1]['href']}"
            title = article.find('h3').text
            links.append(link)
            titles.append(title)

    return links, titles

In [3]:
# Crawl 21 listing pages and keep the review links with their titles.
num_pages = 21
links, titles = gamereactor_link_retrieve(num_pages=num_pages)

## Scraping Single Review

In [42]:
# Download one review page to prototype the field extraction. The timeout
# keeps a stalled connection from hanging the kernel, and raise_for_status()
# surfaces HTTP errors instead of letting an error page be parsed.
review_url = 'https://www.gamereactor.es/star-wars-squadrons-analisis/?sid=861df4c8cdc5927d8dfd6d1e7bb4f699'
review_response = requests.get(review_url, timeout=30)
review_response.raise_for_status()
review_html = review_response.content
review_html[:50]

b' <!DOCTYPE html>\n<html lang="es">\n<head>\n\n\n<script'

In [43]:
# Parse the single review page with the lxml parser.
soup = BeautifulSoup(review_html, features='lxml')

In [44]:
# Text + cleaning: the review body sits in the first <div> nested inside
# <div class="breadtext">; its paragraphs are joined with single spaces.
article = soup.find('div', {'class': 'breadtext'}).find('div')
p_tags = article.find_all('p')
review = ' '.join(paragraph.text for paragraph in p_tags)

In [45]:
# Preview the first 50 characters of the joined review text.
review[:50]

'Squadrons es una nueva esperanza de EA para sacarl'

In [46]:
# Author: text of the <li class="publishAuthor bullet"> element.
author_item = soup.find('li', attrs={'class': 'publishAuthor bullet'})
author = author_item.text

author

'Mike Holmes'

In [49]:
# Game: let newspaper download and parse the review page, then read the
# title it extracted.
News_article = newspaper.Article(url=review_url)
News_article.download()
News_article.parse()
News_article.title


'[ANÁLISIS] Star Wars: Squadrons'

In [51]:
# Genre, Company and Platform

# Same placeholders gamereactor_dict uses: without them a page that lacks one
# of the labels would raise NameError at the print below (or silently reuse a
# value left over in the kernel from an earlier run).
genre = 'None'
company = 'None'
platform = 'None'

infobox = soup.find_all('ul', {'class': 'infobox'})[0].contents

for box in infobox:
    # The first child of each entry holds its label text.
    label = box.contents[0].text

    if 'Probado en:' in label or 'Plataforma:' in label:
        # str() yields a plain string; a NavigableString would keep a
        # reference to the whole parse tree for as long as it is stored.
        platform = str(box.contents[1])

    if 'Género:' in label:
        genre = box.find('a').text

    if 'Editor:' in label:
        company = box.find('a').text

print(platform, genre, company)


 PC, PS4, Xbox One Acción Electronic Arts


In [137]:
# Score: the rating is read from the alt text of the image inside
# <div class="bigScoreWrapper"> and converted to a float.
score_image = soup.find('div', {'class': 'bigScoreWrapper'}).find('img')
score = float(score_image['alt'])

In [138]:
# Show the parsed score as the cell output.
score

8.0

## Create columns

In [13]:

def _parse_gamereactor_review(soup):
    """Extract author, company, genre, platform, text and score from a parsed review page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single review page.

    Returns
    -------
    dict
        Keys ``author``, ``company``, ``genre``, ``platform``, ``text`` and
        ``score``. ``company``, ``genre`` and ``platform`` stay at the
        string ``'None'`` when the info box has no matching label.

    Raises
    ------
    AttributeError, IndexError, KeyError, TypeError, ValueError
        When an expected element is missing or the score is not numeric.
    """
    # Author - From Scraping
    author = soup.find('li', {'class': 'publishAuthor bullet'}).text

    # Company, Genre & Platform - From Scraping
    genre = 'None'
    company = 'None'
    platform = 'None'

    infobox = soup.find_all('ul', {'class': 'infobox'})[0].contents
    for box in infobox:
        # Bare text nodes (e.g. whitespace between entries) have no
        # children; skip them instead of discarding the whole review.
        children = getattr(box, 'contents', None)
        if not children:
            continue

        label = children[0].text

        if ('Probado en:' in label or 'Plataforma:' in label) and len(children) > 1:
            # str() yields a plain string; a NavigableString would keep the
            # whole parse tree alive for as long as the result is stored.
            platform = str(children[1])

        if 'Género:' in label:
            genre = box.find('a').text

        if 'Editor:' in label:
            company = box.find('a').text

    # Text & Cleaning - From Scraping
    article = soup.find('div', {'class': 'breadtext'}).find('div')
    review = ' '.join(tag.text for tag in article.find_all('p'))

    # Score & Clean & Transform - From Scraping
    score = float(soup.find('div', {'class': 'bigScoreWrapper'}).find('img')['alt'])

    return {'author': author,
            'company': company,
            'genre': genre,
            'platform': platform,
            'text': review,
            'score': score}


def gamereactor_dict(links, titles, delay=2, timeout=30):
    """Download every review in ``links`` and collect its fields in a dict.

    Parameters
    ----------
    links : iterable of str
        Review URLs to download.
    titles : iterable of str
        Game titles, paired with ``links`` by position (``zip`` semantics:
        extra items on either side are ignored).
    delay : float, optional
        Seconds to sleep after each request, to avoid being banned or
        timed out by the site. Defaults to 2, the value previously
        hard-coded.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    dict
        Maps the position of each successfully parsed review to a dict with
        the keys ``site``, ``url_link``, ``author``, ``game``, ``company``,
        ``genre``, ``platform``, ``text`` and ``score``. Reviews that could
        not be downloaded or parsed are reported on stdout and leave a gap
        in the keys.
    """
    reviews_dict = {}

    for i, (link, title) in enumerate(zip(links, titles)):
        try:
            # Request content and wait
            review_html = requests.get(link, timeout=timeout).content

            # Avoid getting banned or timed out
            time.sleep(delay)

            soup = BeautifulSoup(review_html, 'lxml')
            fields = _parse_gamereactor_review(soup)

        except (AttributeError, IndexError, KeyError, TypeError, ValueError,
                requests.exceptions.RequestException) as error:
            # Best effort: one broken page must not abort the whole scrape,
            # but the skip is reported instead of being silently swallowed.
            print('skipped', link, '-', repr(error))

        else:
            # Add to a dict
            reviews_dict[i] = {'site': 'Gamereactor',
                               'url_link': link,
                               'author': fields['author'],
                               'game': title,
                               'company': fields['company'],
                               'genre': fields['genre'],
                               'platform': fields['platform'],
                               'text': fields['text'],
                               'score': fields['score']}

        # Progress report every 25 processed links (parsed or skipped).
        processed = i + 1
        if processed % 25 == 0:
            print(processed, ':', link)

    return reviews_dict

In [14]:
# Scrape the collected reviews, skipping the first three links/titles.
# `result` is read by the DataFrame cell below, so this call has to run for
# the notebook to work from a fresh kernel. It issues one HTTP request per
# link, so expect a long run.
result = gamereactor_dict(links[3:], titles[3:])

25 : https://www.gamereactor.es/lego-dimensions-analisis/?sid=87156f03e2862d810c253c7147e123e0
50 : https://www.gamereactor.es/everybodys-gone-to-the-rapture-analisis/?sid=2e1f4c198c797b64de0650a4ff00d285
75 : https://www.gamereactor.es/halo-3-odst-para-xbox-one/?sid=2e1f4c198c797b64de0650a4ff00d285
100 : https://www.gamereactor.es/convoy-analisis/?sid=a24a6edae5446e2906db97d9a8ecd2b7
125 : https://www.gamereactor.es/assassins-creed-rogue-para-pc/?sid=a24a6edae5446e2906db97d9a8ecd2b7
150 : https://www.gamereactor.es/far-cry-4-escapa-de-la-prision-de-durgesh/?sid=ad8b53687a650d1408e49fb725cc67a6
175 : https://www.gamereactor.es/super-smash-bros-for-wii-u-analisis/?sid=ad8b53687a650d1408e49fb725cc67a6
200 : https://www.gamereactor.es/sunset-overdrive-analisis/?sid=fdb839c83fda7b89f0698ce5c16226aa
225 : https://www.gamereactor.es/wasteland-2-analisis/?sid=fdb839c83fda7b89f0698ce5c16226aa
250 : https://www.gamereactor.es/oddworld-newntasty-analisis/?sid=750877eda9be5a7b3590c34cbf5dcdb2
275

### Create DataFrame

In [15]:
# Build one row per scraped review: the outer keys of `result` become the
# index and each inner dict supplies the column values.
gamereactor = pd.DataFrame.from_dict(data=result, orient='index')

gamereactor

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
0,Gamereactor,https://www.gamereactor.es/dishonored-definiti...,Fabrizia Malgieri,Dishonored: Definitive Edition,Bethesda Softworks,Acción,"PC, PS3, PS4, Xbox 360, Xbox One","Ya sea en literatura o en formato audiovisual,...",8.0
1,Gamereactor,https://www.gamereactor.es/life-is-strange-tem...,Bengt Lemne,Life is Strange - Temporada completa,Square Enix,Aventura,"PC, Xbox One, Xbox 360, PS4, PS3, iOS, Android",Ahora que ha salido el episodio final de Life ...,9.0
2,Gamereactor,https://www.gamereactor.es/warhammer-the-end-t...,Cecilia Fjällström,Warhammer: The End Times - Vermintide,Fatshark,Acción,"PC, PS4, Xbox One",Un rugido furioso resuena por los estrechos ca...,9.0
3,Gamereactor,https://www.gamereactor.es/assassins-creed-syn...,Ricardo C. Esteves,Assassin's Creed: Syndicate,Ubisoft,Acción,"PC, PS4, Xbox One","Al parecer, sí que se puede tener demasiado de...",8.0
4,Gamereactor,https://www.gamereactor.es/the-legend-of-zelda...,Fabrizia Malgieri,The Legend of Zelda: Tri Force Heroes,Nintendo,Acción,Nintendo 3DS,The Legend of Zelda siempre ha destacado en el...,8.0
...,...,...,...,...,...,...,...,...,...
1008,Gamereactor,https://www.gamereactor.es/gran-turismo-5-anal...,Petter Hegevall (Gamereactor Suecia),Gran Turismo 5,Sony,Carreras,PS3,Vaya pedazo de espera. Cinco años y ocho meses...,7.0
1009,Gamereactor,https://www.gamereactor.es/dance-central-anali...,Jonas Elfving (Gamereactor Suecia),Dance Central,Microsoft,Party Game,Xbox 360,"Lo creáis o no, solía ser todo un bailarín en ...",8.0
1010,Gamereactor,https://www.gamereactor.es/disney-epic-mickey-...,Rasmus Lund-Hansen (Gamereactor Dinamarca),Disney Epic Mickey,Disney Interactive Studios,Plataformas,Wii,Si me hubieras preguntado mi opinión sobre Dis...,9.0
1011,Gamereactor,https://www.gamereactor.es/assassins-creed-la-...,Petter Mårtensson,Assassin's Creed: La Hermandad,Ubisoft,Acción,"PC, PS3, Xbox 360, Mac",Cuando se redactó este artículo no habíamos po...,9.0


In [16]:
# Optional export of the scraped reviews to CSV, without the index column.
# NOTE(review): left commented out, presumably so a re-run does not overwrite
# an existing file — confirm before enabling.
#gamereactor.to_csv('../data/gamereactor_1250_last.csv', index=False)