# Dataframe updater with new articles (sketch)

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

In [None]:
all_sites = pd.read_csv(data)

new_all = pd.concat([all_sites, up_vandal_df], ignore_index=True, sort=False)

## Vandal updater

In [None]:
def latest_links(data):
    # data via parser
    all_sites = pd.read_csv(data)
    filtered_df = all_sites[all_sites['site'] == 'Vandal'].head(30)
    return filtered_df['url_link'].tolist()

In [None]:
#Links retriever v2 - break to skip duplicates

def vandal_link_retrieve(num_pages):
    
    links = []
    titles = []
    old_links = latest_links()
    
    # pages parser
    for i in range(num_pages):
        url = f"https://vandal.elespanol.com/analisis/videojuegos/inicio/{i*45}"

        # building soup    
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('div', {'class': 'caja300 afterclearer'})
        
        # links & game titles retrieve. New: checking if old to stop adding raws.
        for a in articles:
            link = a.find('a')['href']
            
            if link not in old_links:     
                links.append(link) 
                titles.append(a.find('a')['title'])
            else:
                break
                
    return links, titles


In [None]:
'''
Import in this stage:
- vandal_platform(soup):
- vandal_genre(soup)
- vandal_company(soup)
- vandal_dict(links, titles)
'''

In [None]:
def vandal_dataframe(links, titles):
    result_vandal = vandal_dict(links, titles)
    up_vandal_df = pd.DataFrame.from_dict(result_vandal, orient='index')
    return up_vandal_df

## Gamereactor updater

In [19]:
def latest_links(data):
    # data via parser
    all_sites = pd.read_csv(data)
    filtered_df = all_sites[all_sites['site'] == 'Gamereactor'].head(30)
    return filtered_df['url_link'].tolist()

In [29]:
def gamereactor_link_retrieve(num_pages):
    links = []
    titles = []
    old_links = latest_links(data)

    # pages parser
    for i in range(num_pages):
        url = f'https://www.gamereactor.es/analisis/?page={i + 1}'

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find('section', {'id': 'textlist'}).find_all('article')

        # links & game titles retrieve. New: checking if old to stop adding raws.
        for article in articles:
            link = links.append(f"https://www.gamereactor.es{article.find_all('a')[1]['href']}")
            
            if link not in old_links:     
                links.append(link) 
                titles.append(article.find('h3').text)
            else:
                break

    return links, titles

In [4]:
def gamereactor_dict(links, titles):
    reviews_dict = {}
    i = 0

    for link, title in zip(links, titles):
        try:
            # Request content - Avoid get banned - Make a Soup
            review_html = requests.get(link).content
            time.sleep(1)
            soup = BeautifulSoup(review_html, 'lxml')

            # Author - From Scraping
            author = soup.find('li', {'class': 'publishAuthor bullet'}).text

            # Company, Genre & Platform - From Scraping
            genre = 'None'
            company = 'None'
            platform = 'None'

            infobox = soup.find_all('ul', {'class': 'infobox'})[0].contents
            for box in infobox:

                if 'Probado en:' in box.contents[0].text or 'Plataforma:' in box.contents[0].text:
                    platform = box.contents[1]

                if 'Género:' in box.contents[0].text:
                    genre = box.find('a').text

                if 'Editor:' in box.contents[0].text:
                    company = box.find('a').text

            # Text & Cleaning - From Scraping
            article = soup.find('div', {'class': 'breadtext'}).find('div', {'id': 'page0'})
            p_tags = article.find_all('p')
            review = [tag.text for tag in p_tags]
            review = ' '.join(review)

            # Score & Clean & Transform - From Scraping
            score = soup.find('div', {'class': 'bigScoreWrapper'}).find('img')['alt']
            score = float(score)

            # Add to a dict
            reviews_dict[i] = {'site': 'Gamereactor',
                               'url_link': link,
                               'author': author,
                               'game': title,
                               'company': company,
                               'genre': genre,
                               'platform': platform,
                               'text': review,
                               'score': score}

        except AttributeError:
            pass

        i = i + 1
        if i % 25 == 0:
            print(i, ':', link)

    return reviews_dict



In [5]:
def gamereactor_dataframe(links, titles):
    result_revo = gamereactor_dict(links, titles)
    up_gamereactor_df = pd.DataFrame.from_dict(result_revo, orient='index')

    return up_gamereactor_df

In [30]:
data = f"../data/test2_2021.csv"
num_pages = 1
links, titles = gamereactor_link_retrieve(num_pages)

In [16]:
test_2021 = pd.read_csv("../data/test2_2021.csv")

In [34]:
latest_links(data)

['https://www.gamereactor.es/main-assembly-pc/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/dead-cells-fatal-falls-dlc/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/age-of-empires-ii-definitive-edition-lords-of-the-west/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/curse-of-the-dead-gods-analisis/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/the-medium-xbox-series-x-pc/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/blizzard-arcade-collection-pc-switch-ps4-xbox-one/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/little-nightmares-2-ps4-ps5-switch-xbox-pc/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/bravely-default-ii-analisis/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/la-coleccion-de-nioh/?sid=3d0782fb7b4cd2f6f5cc39b446d3adc6',
 'https://www.gamereactor.es/werewolf-the-apocalypse-earthblood-analisis/?sid=

In [35]:
links

['https://www.gamereactor.es/doom-3-vr-edition-analisis/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/star-wars-republic-commando-analisis/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/crash-bandicoot-on-the-run-analisis/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/outriders-analisis/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/mundaun-analisis/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/immortals-fenyx-rising-mitos-del-reino-del-este/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/loop-hero-pc/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/jurassic-world-aftermath-oculus-quest/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/doom-eternal-the-ancient-gods-segunda-parte/?sid=5d0a4f03c15720ba9766545160c11924',
 None,
 'https://www.gamereactor.es/evil-gen