# Dataframe updater with new articles (sketch)

In [3]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

In [None]:
# Sketch of the final merge step: load the stored dataset and append newly scraped rows.
# NOTE(review): neither `data` nor `up_vandal_df` is assigned above this cell in the
# visible notebook, so it only runs if both already exist in the kernel — confirm the
# intended cell order before relying on it.
all_sites = pd.read_csv(data)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; sort=False keeps the column order.
new_all = pd.concat([all_sites, up_vandal_df], ignore_index=True, sort=False)

## Vandal updater

In [15]:
def vandal_latest_links(data):
    """Return the stored review links for the site 'Vandal'.

    Reads the CSV at `data`, keeps the rows whose 'site' column equals
    'Vandal', and returns the 'url_link' values of the first 30 of them
    as a plain list.
    """
    reviews = pd.read_csv(data)
    is_vandal = reviews['site'] == 'Vandal'
    return reviews.loc[is_vandal, 'url_link'].head(30).tolist()

In [20]:
#Links retriever v2 - break to skip duplicates

def vandal_link_retrieve(pages):
    """Collect links and titles of Vandal reviews that are not stored yet.

    Requests up to `pages` listing pages (the URL offset grows by 45 per
    page) and walks the articles in listing order. Scraping stops at the
    first article whose link is already stored.

    The stored links come from ``vandal_latest_links(data)``, where ``data``
    is the module-level variable holding the CSV path; it must be set
    before calling this function.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.

    Returns
    -------
    tuple[list, list]
        ``(links, titles)`` for the new articles, index-aligned.
    """
    links = []
    titles = []
    # Use the Vandal-specific helper defined in this notebook; a set makes
    # the per-article membership test constant time.
    old_links = set(vandal_latest_links(data))

    for i in range(pages):
        url = f"https://vandal.elespanol.com/analisis/videojuegos/inicio/{i*45}"

        # Build the soup for this listing page.
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('div', {'class': 'caja300 afterclearer'})

        for a in articles:
            anchor = a.find('a')
            link = anchor['href']

            if link in old_links:
                # First known article reached: stop adding rows.
                print(f'{len(titles)} new games added.')
                return links, titles

            links.append(link)
            titles.append(anchor['title'])

    return links, titles

In [None]:
'''
Import in this stage:
- vandal_platform(soup)
- vandal_genre(soup)
- vandal_company(soup)
- vandal_dict(links, titles)
'''

In [17]:
def vandal_dataframe(links, titles):
    """Turn the result of ``vandal_dict(links, titles)`` into a DataFrame.

    The dict is loaded with ``orient='index'``, so each of its keys becomes
    a row label.
    """
    return pd.DataFrame.from_dict(vandal_dict(links, titles), orient='index')

In [22]:
#test
# `data` is read as a module-level global inside vandal_link_retrieve,
# so it has to be assigned before the call below.
data = f"../data/test2_2021.csv"
pages = 2
# Last expression of the cell: the returned (links, titles) tuple is displayed.
vandal_link_retrieve(pages)

24 new games added.


(['https://vandal.elespanol.com/analisis/iphone/world-of-demons/60074',
  'https://vandal.elespanol.com/analisis/ps5/disco-elysium-the-final-cut/93992',
  'https://vandal.elespanol.com/analisis/pc/before-your-eyes/81894',
  'https://vandal.elespanol.com/analisis/pc/qomp/88395',
  'https://vandal.elespanol.com/analisis/ps4/paradise-lost/97468',
  'https://vandal.elespanol.com/analisis/pc/spacebase-startopia/76637',
  'https://vandal.elespanol.com/analisis/pc/record-of-lodoss-wardeedlit-in-wonder-labyrinth/82317',
  'https://vandal.elespanol.com/analisis/pc/lost-words-beyond-the-page/50885',
  'https://vandal.elespanol.com/analisis/ps5/balan-wonderworld/88212',
  'https://vandal.elespanol.com/analisis/pc/evil-genius-2-world-domination/4196',
  'https://vandal.elespanol.com/analisis/pc/narita-boy/47106',
  'https://vandal.elespanol.com/analisis/ps4/doom-3-vr-edition/97172',
  'https://vandal.elespanol.com/analisis/pc/evil-inside/92122',
  'https://vandal.elespanol.com/analisis/pc/genesis-

## Gamereactor updater

In [1]:
def gamereactor_latest_games(data):
    """Return the stored game titles for the site 'Gamereactor'.

    Reads the CSV at `data`, keeps the rows whose 'site' column equals
    'Gamereactor', and returns the 'game' values of the first 30 of them
    as a plain list.
    """
    reviews = pd.read_csv(data)
    is_gamereactor = reviews['site'] == 'Gamereactor'
    return reviews.loc[is_gamereactor, 'game'].head(30).tolist()

In [8]:
def gamereactor_link_retrieve(pages):
    """Collect links and titles of Gamereactor reviews that are not stored yet.

    Requests up to `pages` listing pages (numbered from 1) and walks the
    ``<article>`` elements inside ``section#textlist`` in listing order.
    Scraping stops at the first article whose game title is already stored.

    The stored titles come from ``gamereactor_latest_games(data)``, where
    ``data`` is the module-level variable holding the CSV path; it must be
    set before calling this function.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.

    Returns
    -------
    tuple[list, list]
        ``(links, titles)`` for the new articles, index-aligned.
    """
    links = []
    titles = []
    # Use the Gamereactor-specific helper defined in this notebook; a set
    # makes the per-article membership test constant time.
    old_games = set(gamereactor_latest_games(data))

    for i in range(pages):
        url = f'https://www.gamereactor.es/analisis/?page={i + 1}'

        # Build the soup for this listing page.
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        section = soup.find('section', {'id': 'textlist'})
        if section is None:
            # No review list on this page: nothing more to parse.
            break

        for article in section.find_all('article'):
            game = article.find('h3').text

            if game in old_games:
                # First known game reached: stop adding rows.
                print(f'{len(titles)} new games added.')
                return links, titles

            # The review URL is taken from the second anchor of the article.
            links.append(f"https://www.gamereactor.es{article.find_all('a')[1]['href']}")
            titles.append(game)

    return links, titles

In [None]:
'''
Import in this stage:
- gamereactor_dict(links, titles)
'''

In [12]:
def gamereactor_dataframe(links, titles):
    """Turn the result of ``gamereactor_dict(links, titles)`` into a DataFrame.

    The dict is loaded with ``orient='index'``, so each of its keys becomes
    a row label.
    """
    return pd.DataFrame.from_dict(gamereactor_dict(links, titles), orient='index')

In [9]:
#test
# `data` is read as a module-level global inside gamereactor_link_retrieve,
# so it has to be assigned before the call below.
data = f"../data/test2_2021.csv"
pages = 1
links, titles = gamereactor_link_retrieve(pages)

16 new games added.


## Revogamers updater

In [38]:
def revogamers_latest_links(data):
    """Return the stored review links for the site 'revogamers'.

    Reads the CSV at `data`, keeps the rows whose 'site' column equals
    'revogamers', and returns the 'url_link' values of the first 30 of them
    as a plain list.
    """
    reviews = pd.read_csv(data)
    is_revogamers = reviews['site'] == 'revogamers'
    return reviews.loc[is_revogamers, 'url_link'].head(30).tolist()

In [36]:
#Links retriever v2 - break to skip duplicates

def revogamers_link_retrieve(pages):
    """Collect links and titles of Revogamers reviews that are not stored yet.

    Requests up to `pages` listing pages (numbered from 1) and inspects the
    anchor inside every ``<h2>`` in listing order. Scraping stops at the
    first link that is already stored. Links that do not contain
    ``'analisis'`` are treated as non-review links and dropped together with
    their title, so the two returned lists stay index-aligned and the
    printed count only covers reviews.

    The stored links come from ``revogamers_latest_links(data)``, where
    ``data`` is the module-level variable holding the CSV path; it must be
    set before calling this function.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.

    Returns
    -------
    tuple[list, list]
        ``(links, titles)`` for the new review articles, index-aligned.
    """
    links = []
    titles = []
    # A set makes the per-article membership test constant time.
    old_links = set(revogamers_latest_links(data))

    for i in range(pages):
        url = f'https://www.revogamers.net/analisis-w/page/{i + 1}'

        # Build the soup for this listing page.
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')

        for heading in soup.find_all('h2'):
            anchor = heading.find('a')
            if anchor is None:
                # Heading without a link (e.g. a section title): nothing to collect.
                continue

            link = anchor['href']

            if link in old_links:
                # First known article reached: stop adding rows.
                print(f'{len(titles)} new games added.')
                return links, titles

            # Filter while collecting rather than calling links.remove() in a
            # loop over `links` afterwards: removing during iteration skips
            # the element after each removal and left `titles` unfiltered.
            if 'analisis' not in link:
                continue

            links.append(link)
            titles.append(anchor['title'])

    return links, titles

In [None]:
'''
Import in this stage:
- revogamers_dict(links, titles)
'''

In [23]:
def revogamers_dataframe(links, titles):
    """Turn the result of ``revogamers_dict(links, titles)`` into a DataFrame.

    The dict is loaded with ``orient='index'``, so each of its keys becomes
    a row label.
    """
    return pd.DataFrame.from_dict(revogamers_dict(links, titles), orient='index')

In [39]:
#test
# `data` is read as a module-level global inside revogamers_link_retrieve,
# so it has to be assigned before the call below.
data = f"../data/test2_2021.csv"
pages = 2
links, titles = revogamers_link_retrieve(pages)

16 new games added.


## meristation updater

In [43]:
def meristation_latest_links(data):
    """Return the stored review links for the site 'meristation'.

    Reads the CSV at `data`, keeps the rows whose 'site' column equals
    'meristation', and returns the 'url_link' values of the first 30 of
    them as a plain list.
    """
    reviews = pd.read_csv(data)
    is_meristation = reviews['site'] == 'meristation'
    return reviews.loc[is_meristation, 'url_link'].head(30).tolist()

In [56]:
def meristation_link_retrieve(pages, start_page=337):
    """Collect links of Meristation reviews that are not stored yet.

    Requests up to `pages` listing pages, starting at page number
    `start_page` and counting downwards, and inspects the anchor inside
    every ``<h2>`` in listing order. Scraping stops at the first link that
    is already stored. Links that do not contain ``'analisis'`` are dropped
    before returning.

    The stored links come from ``meristation_latest_links(data)``, where
    ``data`` is the module-level variable holding the CSV path; it must be
    set before calling this function.

    Parameters
    ----------
    pages : int
        Maximum number of listing pages to request.
    start_page : int, optional
        Page number used in the URL of the first request; each further
        request uses the previous number minus one. Defaults to 337, the
        value that used to be hard-coded.
        NOTE(review): presumably the newest listing page when the notebook
        was written — verify against the site before running.

    Returns
    -------
    list
        Links of the new review articles, in listing order.
    """
    links = []
    # A set makes the per-article membership test constant time.
    old_links = set(meristation_latest_links(data))
    repeated = False

    for i in range(pages):
        if repeated:
            break

        url = f'https://as.com/meristation/analisis/{start_page - i}'

        # Build the soup for this listing page.
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')

        for heading in soup.find_all('h2'):
            anchor = heading.find('a')
            if anchor is None:
                # Heading without a link: nothing to collect.
                continue

            link = anchor['href']
            if link in old_links:
                # First known article reached: stop adding rows.
                repeated = True
                break
            links.append(link)

    # Drop non-review links. Build a new list rather than calling
    # links.remove() while iterating, which skips the element that follows
    # each removal.
    links = [link for link in links if 'analisis' in link]

    print(links, f'{len(links)} new games added.')
    return links

In [51]:
'''
Import in this stage:
- meri_author(soup)
- meri_score(soup)
- meristation_dict(links)
'''

'\nImport in this stage:\n- meri_author(soup)\n- meri_score(soup)\n- meristation_dict(links)\n'

In [52]:
def meristation_dataframe(links):
    """Turn the result of ``meristation_dict(links)`` into a DataFrame.

    The dict is loaded with ``orient='index'``, so each of its keys becomes
    a row label.
    """
    return pd.DataFrame.from_dict(meristation_dict(links), orient='index')

In [57]:
#test
# `data` is read as a module-level global inside meristation_link_retrieve,
# so it has to be assigned before the call below.
data = f"../data/test2_2021.csv"
pages = 2
links = meristation_link_retrieve(pages)

['https://as.com/meristation/2021/04/08/analisis/1617889421_063525.html', 'https://as.com/meristation/2021/04/07/analisis/1617809548_880093.html', 'https://as.com/meristation/2021/04/06/analisis/1617694738_423461.html', 'https://as.com/meristation/2021/04/05/analisis/1617623116_949416.html', 'https://as.com/meristation/2021/04/04/analisis/1617519009_342820.html', 'https://as.com/meristation/2021/04/02/analisis/1617351377_395107.html', 'https://as.com/meristation/2021/04/01/analisis/1617273286_559321.html', 'https://as.com/meristation/2021/04/01/analisis/1617261789_188289.html', 'https://as.com/meristation/2021/03/31/analisis/1617182479_101654.html', 'https://as.com/meristation/2021/03/31/analisis/1617174009_601566.html', 'https://as.com/meristation/2021/03/30/analisis/1617120558_078339.html', 'https://as.com/meristation/2021/03/30/analisis/1617120313_978999.html', 'https://as.com/meristation/2021/03/30/analisis/1617082693_230913.html', 'https://as.com/meristation/2021/03/29/analisis/16

In [49]:
# Show the stored meristation links that meristation_link_retrieve compares new links against.
meristation_latest_links(data)

['https://as.com/meristation/2020/09/13/analisis/1599985057_505947.html',
 'https://as.com/meristation/2020/09/19/analisis/1600499631_945194.html',
 'https://as.com/meristation/2020/09/30/analisis/1601454855_979194.html',
 'https://as.com/meristation/2020/09/26/analisis/1601109660_140327.html',
 'https://as.com/meristation/2020/09/22/analisis/1600760878_562894.html',
 'https://as.com/meristation/2020/10/03/analisis/1601701228_240717.html',
 'https://as.com/meristation/2020/10/01/analisis/1601557244_384472.html',
 'https://as.com/meristation/2020/09/18/analisis/1600418364_262891.html',
 'https://as.com/meristation/2020/09/26/analisis/1601118163_185827.html',
 'https://as.com/meristation/2020/09/20/analisis/1600588999_099915.html',
 'https://as.com/meristation/2020/09/16/analisis/1600261344_550903.html',
 'https://as.com/meristation/2020/09/11/analisis/1599814324_095879.html',
 'https://as.com/meristation/2020/09/15/analisis/1600166091_128370.html',
 'https://as.com/meristation/2020/09/2