# Dataframe updater with new articles (sketch)

In [3]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

In [None]:
# Sketch cell (never executed: it has no execution count). It needs a
# notebook-level `data` CSV path and an `up_vandal_df` frame, neither of which
# is defined at notebook level before this point in the visible cells.
all_sites = pd.read_csv(data)

# Stack the stored table and the update frame; ignore_index=True renumbers the
# rows 0..n-1 and sort=False keeps the column order as-is.
# NOTE(review): `up_vandal_df` is presumably the result of vandal_dataframe() - confirm.
new_all = pd.concat([all_sites, up_vandal_df], ignore_index=True, sort=False)

## Vandal updater

In [15]:
def vandal_latest_links(data):
    """Return the review URLs already stored for Vandal.

    Parameters
    ----------
    data : str or file-like
        Anything ``pd.read_csv`` accepts. The CSV must contain ``site`` and
        ``url_link`` columns.

    Returns
    -------
    list
        ``url_link`` values of the first 30 rows whose ``site`` equals
        ``'Vandal'``, in file order.
    """
    stored = pd.read_csv(data)
    is_vandal = stored['site'] == 'Vandal'
    # Select the column first, then cap at the first 30 matching rows.
    return stored.loc[is_vandal, 'url_link'].head(30).tolist()

In [20]:
#Links retriever v2 - break to skip duplicates

def vandal_link_retrieve(pages, data_path=None):
    """Collect links and titles of Vandal reviews that are not stored yet.

    Listing pages are requested in order (the URL offset advances by 45 per
    page). Scanning stops at the first article whose link is already stored,
    so only the leading run of unseen reviews is returned.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.
    data_path : str, optional
        CSV handed to ``vandal_latest_links``. When omitted, the
        notebook-level ``data`` variable is used, so existing
        ``vandal_link_retrieve(pages)`` calls keep working.

    Returns
    -------
    tuple of (list, list)
        ``(links, titles)``; both lists have the same length and order.
    """
    if data_path is None:
        # Backward compatible: fall back to the notebook-level `data` path.
        data_path = data

    # The original called `latest_links(data)`, a name no cell defines, which
    # raises NameError on a fresh kernel. A set gives O(1) membership tests.
    known_links = set(vandal_latest_links(data_path))

    links = []
    titles = []

    for page in range(pages):
        url = f"https://vandal.elespanol.com/analisis/videojuegos/inicio/{page*45}"

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('div', {'class': 'caja300 afterclearer'})

        for article in articles:
            anchor = article.find('a')
            link = anchor['href']

            if link in known_links:
                # First already-stored review reached: stop scanning entirely.
                print(f'{len(titles)} new games added.')
                return links, titles

            links.append(link)
            titles.append(anchor['title'])

    return links, titles

In [None]:
'''
Import in this stage:
- vandal_platform(soup):
- vandal_genre(soup)
- vandal_company(soup)
- vandal_dict(links, titles)
'''

In [17]:
def vandal_dataframe(links, titles):
    """Turn scraped Vandal links and titles into a DataFrame.

    ``vandal_dict`` is not defined in this notebook and must already be in
    scope. It is called with ``links`` and ``titles``, and the mapping it
    returns becomes a frame with one row per top-level key.
    """
    return pd.DataFrame.from_dict(vandal_dict(links, titles), orient='index')

In [22]:
#test
# `data` must exist at notebook level before the call: when called this way,
# vandal_link_retrieve takes the CSV path from the notebook-level `data` variable.
data = f"../data/test2_2021.csv"
# Upper bound on the number of listing pages to request.
pages = 2
vandal_link_retrieve(pages)

24 new games added.


(['https://vandal.elespanol.com/analisis/iphone/world-of-demons/60074',
  'https://vandal.elespanol.com/analisis/ps5/disco-elysium-the-final-cut/93992',
  'https://vandal.elespanol.com/analisis/pc/before-your-eyes/81894',
  'https://vandal.elespanol.com/analisis/pc/qomp/88395',
  'https://vandal.elespanol.com/analisis/ps4/paradise-lost/97468',
  'https://vandal.elespanol.com/analisis/pc/spacebase-startopia/76637',
  'https://vandal.elespanol.com/analisis/pc/record-of-lodoss-wardeedlit-in-wonder-labyrinth/82317',
  'https://vandal.elespanol.com/analisis/pc/lost-words-beyond-the-page/50885',
  'https://vandal.elespanol.com/analisis/ps5/balan-wonderworld/88212',
  'https://vandal.elespanol.com/analisis/pc/evil-genius-2-world-domination/4196',
  'https://vandal.elespanol.com/analisis/pc/narita-boy/47106',
  'https://vandal.elespanol.com/analisis/ps4/doom-3-vr-edition/97172',
  'https://vandal.elespanol.com/analisis/pc/evil-inside/92122',
  'https://vandal.elespanol.com/analisis/pc/genesis-

## Gamereactor updater

In [1]:
def gamereactor_latest_games(data):
    """Return the game titles already stored for Gamereactor.

    Parameters
    ----------
    data : str or file-like
        Anything ``pd.read_csv`` accepts. The CSV must contain ``site`` and
        ``game`` columns.

    Returns
    -------
    list
        ``game`` values of the first 30 rows whose ``site`` equals
        ``'Gamereactor'``, in file order.
    """
    table = pd.read_csv(data)
    gamereactor_rows = table[table['site'].eq('Gamereactor')]
    first_thirty = gamereactor_rows.iloc[:30]
    return first_thirty['game'].tolist()

In [8]:
def gamereactor_link_retrieve(pages, data_path=None):
    """Collect links and titles of Gamereactor reviews that are not stored yet.

    Listing pages 1..``pages`` are requested in order. Scanning stops at the
    first article whose title is already stored, so only the leading run of
    unseen reviews is returned.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.
    data_path : str, optional
        CSV handed to ``gamereactor_latest_games``. When omitted, the
        notebook-level ``data`` variable is used, so existing
        ``gamereactor_link_retrieve(pages)`` calls keep working.

    Returns
    -------
    tuple of (list, list)
        ``(links, titles)``; both lists have the same length and order.
    """
    if data_path is None:
        # Backward compatible: fall back to the notebook-level `data` path.
        data_path = data

    # The original called `latest_games(data)`, a name no cell defines, which
    # raises NameError on a fresh kernel. A set gives O(1) membership tests.
    known_games = set(gamereactor_latest_games(data_path))

    links = []
    titles = []

    for page in range(1, pages + 1):
        url = f'https://www.gamereactor.es/analisis/?page={page}'

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find('section', {'id': 'textlist'}).find_all('article')

        for article in articles:
            game = article.find('h3').text

            if game in known_games:
                # First already-stored game reached: stop scanning entirely.
                print(f'{len(titles)} new games added.')
                return links, titles

            # The review URL is the href of the article's second anchor,
            # appended to the site root.
            links.append(f"https://www.gamereactor.es{article.find_all('a')[1]['href']}")
            titles.append(game)

    return links, titles

In [None]:
'''
Import in this stage:
- gamereactor_dict(links, titles)
'''

In [12]:
def gamereactor_dataframe(links, titles):
    """Turn scraped Gamereactor links and titles into a DataFrame.

    ``gamereactor_dict`` is not defined in this notebook and must already be
    in scope. The mapping it returns for ``links`` and ``titles`` becomes a
    frame with one row per top-level key.
    """
    scraped = gamereactor_dict(links, titles)
    return pd.DataFrame.from_dict(scraped, orient='index')

In [9]:
#test
# `data` must exist at notebook level before the call: when called this way,
# gamereactor_link_retrieve takes the CSV path from the notebook-level `data` variable.
data = f"../data/test2_2021.csv"
# Upper bound on the number of listing pages to request.
pages = 1
links, titles = gamereactor_link_retrieve(pages)

16 new games added.


## Revogamers updater

In [38]:
def revogamers_latest_links(data):
    """Return the review URLs already stored for Revogamers.

    Parameters
    ----------
    data : str or file-like
        Anything ``pd.read_csv`` accepts. The CSV must contain ``site`` and
        ``url_link`` columns.

    Returns
    -------
    list
        ``url_link`` values of the first 30 rows whose ``site`` equals
        ``'revogamers'`` (lowercase, exactly as compared here), in file order.
    """
    frame = pd.read_csv(data)
    revogamers_urls = frame.loc[frame['site'] == 'revogamers', 'url_link']
    return revogamers_urls.iloc[:30].tolist()

In [36]:
#Links retriever v2 - break to skip duplicates

def revogamers_link_retrieve(pages, data_path=None):
    """Collect links and titles of Revogamers reviews that are not stored yet.

    Listing pages 1..``pages`` are requested in order and every ``<h2>``
    heading is inspected. Scanning stops at the first heading whose link is
    already stored. Links that do not contain ``'analisis'`` are skipped.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.
    data_path : str, optional
        CSV handed to ``revogamers_latest_links``. When omitted, the
        notebook-level ``data`` variable is used, so existing
        ``revogamers_link_retrieve(pages)`` calls keep working.

    Returns
    -------
    tuple of (list, list)
        ``(links, titles)``; both lists have the same length and order.
    """
    if data_path is None:
        # Backward compatible: fall back to the notebook-level `data` path.
        data_path = data

    # A set gives O(1) membership tests for every heading.
    known_links = set(revogamers_latest_links(data_path))

    links = []
    titles = []

    for page in range(1, pages + 1):
        url = f'https://www.revogamers.net/analisis-w/page/{page}'

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        headings = soup.find_all('h2')

        for heading in headings:
            anchor = heading.find('a')
            if anchor is None:
                # <h2> is a generic tag: a heading without a link is not an article.
                continue

            link = anchor['href']

            if link in known_links:
                # First already-stored review reached: stop scanning entirely.
                print(f'{len(titles)} new games added.')
                return links, titles

            # Drop non-review links here, before anything is appended. The
            # original filtered afterwards with `links.remove()` inside a loop
            # over `links`, which skips elements and never removed the matching
            # title, leaving `links` and `titles` misaligned.
            if 'analisis' not in link:
                continue

            links.append(link)
            titles.append(anchor['title'])

    return links, titles

In [None]:
'''
Import in this stage:
- revogamers_dict(links, titles)
'''

In [23]:
def revogamers_dataframe(links, titles):
    """Turn scraped Revogamers links and titles into a DataFrame.

    ``revogamers_dict`` is not defined in this notebook and must already be in
    scope. The mapping it returns for ``links`` and ``titles`` becomes a frame
    with one row per top-level key.
    """
    records_by_key = revogamers_dict(links, titles)
    frame = pd.DataFrame.from_dict(records_by_key, orient='index')
    return frame

In [39]:
#test
# `data` must exist at notebook level before the call: when called this way,
# revogamers_link_retrieve takes the CSV path from the notebook-level `data` variable.
data = f"../data/test2_2021.csv"
# Upper bound on the number of listing pages to request.
pages = 2
links, titles = revogamers_link_retrieve(pages)

16 new games added.


In [40]:
# Inspect the titles returned by the revogamers_link_retrieve call above.
titles

['Análisis de Say No! More',
 'Análisis de Plants vs. Zombies: La Batalla de Neighborville Edición Completa',
 'Análisis de Balan Wonderworld',
 'Análisis de Genesis Noir',
 'Análisis de Cozy Grove',
 'Análisis de Lost Words: Beyond the Page',
 'Análisis de Overcooked! All You Can Eat',
 'Análisis de NEOGEO POCKET COLOR SELECTION Vol.1',
 'Análisis de Darq: Complete Edition',
 'Análisis de Dreaming Sarah',
 'Análisis de Stubbs the Zombie in Rebel Without a Pulse',
 'Análisis de Monster Hunter Rise',
 'Análisis de Thunderflash',
 'Análisis de Crash Bandicoot 4: It’s About Time',
 'Análisis de Harvest Moon: Un Mundo Único',
 'Análisis de Cyanide & Happiness – Freakpocalypse']

In [41]:
# Show the stored Revogamers links that newly scraped links are compared against.
revogamers_latest_links(data)

['https://www.revogamers.net/analisis-w/analisis-de-capcom-arcade-stadium-65689/',
 'https://www.revogamers.net/analisis-w/analisis-de-bravely-default-ii-65434/',
 'https://www.revogamers.net/analisis-w/analisis-de-little-nightmares-2-65238/',
 'https://www.revogamers.net/analisis-w/analisis-de-steamroll-65468/',
 'https://www.revogamers.net/analisis-w/analisis-kingdoms-of-amalur-re-reckoning-65899/',
 'https://www.revogamers.net/analisis-w/analisis-de-a-trainall-aboardtourism-65805/',
 'https://www.revogamers.net/analisis-w/analisis-de-deemo-reborn-65444/',
 'https://www.revogamers.net/analisis-w/analisis-de-persona-5-strikers-65263/',
 'https://www.revogamers.net/analisis-w/analisis-de-littlewood-65648/',
 'https://www.revogamers.net/analisis-w/analisis-de-ghosts-n-goblins-resurrection-65451/',
 'https://www.revogamers.net/analisis-w/analisis-de-mail-mole-65758/',
 'https://www.revogamers.net/analisis-w/analisis-de-romance-of-the-three-kingdoms-xiv-65449/',
 'https://www.revogamers.n