# Dataframe updater with new articles (sketch)

In [26]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

## Vandal updater

In [35]:
def vandal_latest_links(data):
    # data via parser
    all_sites = pd.read_csv(data)
    filtered_df = all_sites[all_sites['site'] == 'Vandal'].head(40)
    return filtered_df['url_link'].tolist()

In [36]:
#Links retriever v2 - break to skip duplicates

def vandal_link_retrieve(pages):
    
    links = []
    titles = []
    old_links = vandal_latest_links(data)
    repeated = False  
    
    # pages parser
    for i in range(pages):
        if repeated == True:
            break
            
        url = f"https://vandal.elespanol.com/analisis/videojuegos/inicio/{i*45}"

        # building soup    
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('div', {'class': 'caja300 afterclearer'})
        
        # links & game titles retrieve. New: checking if old to stop adding raws.
        for a in articles:
            link = a.find('a')['href']
            
            if link not in old_links:     
                links.append(link) 
                titles.append(a.find('a')['title'])
            else:
                print(f'{len(titles)} new games added.')
                repeated = True
                break
                
    return links, titles

In [37]:
'''
Import in this stage:
- vandal_platform(soup):
- vandal_genre(soup)
- vandal_company(soup)
- vandal_dict(links, titles)
'''

'\nImport in this stage:\n- vandal_platform(soup):\n- vandal_genre(soup)\n- vandal_company(soup)\n- vandal_dict(links, titles)\n'

In [38]:
def vandal_dataframe(links, titles):
    result_vandal = vandal_dict(links, titles)
    up_vandal_df = pd.DataFrame.from_dict(result_vandal, orient='index')
    return up_vandal_df

In [53]:
#test
data = f"../data/up_to_date_all.csv"
pages = 1
vandal_link_retrieve(pages)

1 new games added.


(['https://vandal.elespanol.com/analisis/pc/cozy-grove/92256'],
 ['Cozy Grove - Análisis'])

## Gamereactor updater

In [41]:
def gamereactor_latest_games(data):
    # data via parser
    all_sites = pd.read_csv(data)
    filtered_df = all_sites[all_sites['site'] == 'Gamereactor'].head(30)
    return filtered_df['game'].tolist()

In [46]:
def gamereactor_link_retrieve(pages):
    links = []
    titles = []
    old_games = gamereactor_latest_games(data)
    repeated = False  

    # pages parser
    for i in range(pages):
        if repeated == True:
            break

        url = f'https://www.gamereactor.es/analisis/?page={i + 1}'

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find('section', {'id': 'textlist'}).find_all('article')

        # links & game titles retrieve. New: checking if old to stop adding raws.
        for article in articles:
            game = article.find('h3').text

            if game not in old_games:
                links.append(f"https://www.gamereactor.es{article.find_all('a')[1]['href']}") 
                titles.append(game)
            else:
                print(f'{len(titles)} new games added.')
                repeated = True
                break

    return links, titles

In [43]:
'''
Import in this stage:
- gamereactor_dict(links, titles)
'''

'\nImport in this stage:\n- gamereactor_dict(links, titles)\n'

In [44]:
def gamereactor_dataframe(links, titles):
    result_gamereactor = gamereactor_dict(links, titles)
    up_gamereactor_df = pd.DataFrame.from_dict(result_gamereactor, orient='index')

    return up_gamereactor_df

In [51]:
#test
#data = f"../data/test2_2021.csv"
pages = 1
links, titles = gamereactor_link_retrieve(pages)

18 new games added.


## Revogamers updater

In [38]:
def revogamers_latest_links(data):
    # data via parser
    all_sites = pd.read_csv(data)
    filtered_df = all_sites[all_sites['site'] == 'revogamers'].head(30)
    return filtered_df['url_link'].tolist()

In [36]:
#Links retriever v2 - break to skip duplicates

def revogamers_link_retrieve(pages):
    links = []
    titles = []
    old_links = revogamers_latest_links(data)
    repeated = False  

    # pages parser
    for i in range(pages):
        if repeated == True:
            break
            
        url = f'https://www.revogamers.net/analisis-w/page/{i + 1}'

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('h2')

        # links retrieve
        for a in articles:
            link = a.find('a')['href']
            
            if link not in old_links:     
                links.append(link) 
                titles.append(a.find('a')['title'])
            else:
                print(f'{len(titles)} new games added.')
                repeated = True
                break
            
    # Deleting non review links
    for link in links:
        if 'analisis' not in link:
            links.remove(link)

    return links, titles

In [None]:
'''
Import in this stage:
- revogamers_dict(links, titles)
'''

In [23]:
def revogamers_dataframe(links, titles):
    result_revo = revogamers_dict(links, titles)
    up_revogamers_df = pd.DataFrame.from_dict(result_revo, orient='index')
    return up_revogamers_df

In [39]:
#test
data = f"../data/test2_2021.csv"
pages = 2
links, titles = revogamers_link_retrieve(pages)

16 new games added.


## meristation updater

In [43]:
def meristation_latest_links(data):
    # data via parser
    all_sites = pd.read_csv(data)
    filtered_df = all_sites[all_sites['site'] == 'meristation'].head(30)
    return filtered_df['url_link'].tolist()

In [56]:
def meristation_link_retrieve(pages):
    links = []
    old_links = meristation_latest_links(data)
    repeated = False 

    # pages parser
    for i in range(pages):
        if repeated == True:
            break

        url = f'https://as.com/meristation/analisis/{337 - i}'

        # building soup
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'lxml')
        articles = soup.find_all('h2')

        # links & game titles retrieve. New: checking if old to stop adding raws.
        for a in articles:
                                  
            if a.find('a') == None:
                pass
            else:
                link = a.find('a')['href']
                
                if link not in old_links:
                    links.append(link)
                else:
                    repeated = True
                    break
                
    # Deleting non review links
    for link in links:
        if 'analisis' not in link:
            links.remove(link)

    print(f'{len(links)} new games added.')
    return links

In [51]:
'''
Import in this stage:
- meri_author(soup)
- meri_score(soup)
- meristation_dict(links)
'''

'\nImport in this stage:\n- meri_author(soup)\n- meri_score(soup)\n- meristation_dict(links)\n'

In [52]:
def meristation_dataframe(links):
    result_meri = meristation_dict(links)
    up_meristation_df = pd.DataFrame.from_dict(result_meri, orient='index')
    return up_meristation_df

In [57]:
#test
data = f"../data/test2_2021.csv"
pages = 2
links = meristation_link_retrieve(pages)

['https://as.com/meristation/2021/04/08/analisis/1617889421_063525.html', 'https://as.com/meristation/2021/04/07/analisis/1617809548_880093.html', 'https://as.com/meristation/2021/04/06/analisis/1617694738_423461.html', 'https://as.com/meristation/2021/04/05/analisis/1617623116_949416.html', 'https://as.com/meristation/2021/04/04/analisis/1617519009_342820.html', 'https://as.com/meristation/2021/04/02/analisis/1617351377_395107.html', 'https://as.com/meristation/2021/04/01/analisis/1617273286_559321.html', 'https://as.com/meristation/2021/04/01/analisis/1617261789_188289.html', 'https://as.com/meristation/2021/03/31/analisis/1617182479_101654.html', 'https://as.com/meristation/2021/03/31/analisis/1617174009_601566.html', 'https://as.com/meristation/2021/03/30/analisis/1617120558_078339.html', 'https://as.com/meristation/2021/03/30/analisis/1617120313_978999.html', 'https://as.com/meristation/2021/03/30/analisis/1617082693_230913.html', 'https://as.com/meristation/2021/03/29/analisis/16

## Database global updater

In [None]:
def database_updater(data):
    
all_sites = pd.read_csv(data)

new_all = pd.concat([all_sites, up_vandal_df, up_gamereactor_df, up_revogamers_df, up_meristation_df], ignore_index=True, sort=False)

In [16]:
data = f"../data/test2_2021.csv"
new_data = f"../data/up_to_date_all.csv"

In [17]:
all_sites = pd.read_csv(data)

In [18]:
new = pd.read_csv(new_data)

In [54]:
len(all_sites), len(new)

(158, 250)

In [23]:
new[new['site'] =='Vandal']

Unnamed: 0,site,url_link,author,game,company,genre,platform,text,score
12,Vandal,https://vandal.elespanol.com/analisis/pc/olija...,Juan Rubio,Olija - Análisis,Devolver Digital,Aventura de acción,PC Switch PS4 Xbox One,"Para gustos los colores, dicen, pero hay que r...",7.5
23,Vandal,https://vandal.elespanol.com/analisis/ps4/scot...,Juan Rubio,Scott Pilgrim vs. The World: The Game - Comple...,Ubisoft,Acción Beat'em up,PS4 Switch Xbox One PC,El formato digital es cómodo. Muy cómodo. Pode...,7.8
25,Vandal,https://vandal.elespanol.com/analisis/ps5/maqu...,Juan Rubio,Maquette - Análisis,Annapurna Interactive,Aventura de exploración y puzles Puzle,PS5 PC PS4,"Quizás a rebufo de la saga Portal, los juegos ...",7.2
27,Vandal,https://vandal.elespanol.com/analisis/ps5/hitm...,Saúl González,Hitman 3 - Análisis,IO Interactive,Aventura de acción Sigilo,PS5 Xbox Series X/S PC PS4 Xbox One Switch,"Hitman es una saga de videojuegos veterana, un...",8.2
34,Vandal,https://vandal.elespanol.com/analisis/switch/s...,Carlos Leiva,Super Mario 3D World + Bowser's Fury - Análisis,Nintendo,Plataformas 3D Plataformas,Switch,Puede que Wii U no tuviese el éxito comercial ...,9.2
39,Vandal,https://vandal.elespanol.com/analisis/ps5/over...,Sergio Martín,Override 2: Super Mech League - Análisis,Modus Games,Lucha 3D Multijugador Online,PS5 PC Xbox Series X/S Xbox One PS4 Switch,Han pasado ya un par de años desde que Modus G...,7.0
41,Vandal,https://vandal.elespanol.com/analisis/switch/s...,Sergio Martín,Super Meat Boy Forever - Análisis,Team Meat,Plataformas 2D Plataformas,Switch PC Android iPhone PS4 Xbox One,Aquellos que disfrutamos con Super Meat Boy pr...,7.8
44,Vandal,https://vandal.elespanol.com/analisis/ps4/ys-i...,Ramón Varela,Ys IX: Monstrum Nox - Análisis,NIS America,JRPG Rol,PS4 Switch PC,El excelente Ys VIII: Lacrimosa of Dana subió ...,8.3
53,Vandal,https://vandal.elespanol.com/analisis/switch/b...,Carlos Leiva,Bravely Default 2 - Análisis,Nintendo,JRPG Rol,Switch,"Poco a poco, Nintendo Switch está construyendo...",8.2
57,Vandal,https://vandal.elespanol.com/analisis/pc/the-c...,Juan Rubio,The Climb 2 - Análisis,Crytek,Otros deportes Realidad Virtual,PC,Recientemente recibíamos la noticia de que Ocu...,7.0


In [9]:
for row in range(len(new)):
    print((new.iloc[row, ].values == all_sites.values).all())

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals