In [2]:
# Objective: collect review score for games in a list and add those reviews to this list
import requests
from bs4 import BeautifulSoup

# Most websites refuse GET requests from python, so we change the header to pretend we're a browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

# get webpage
page = requests.get("https://ca.ign.com/search?q=Super%20Mario%20Kart&page=0&count=10&", headers = headers)

# print status. 200 is successful, without the headers we would get 403 forbidden 
page

<Response [200]>

In [3]:
# get content
content = page.content

# show first 2000 letters of the content
str(page.content)[:2000:]

'b\'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\\n<html xmlns="https://www.w3.org/1999/xhtml" xml:lang="en" lang="en"\\n      xmlns:og="https://ogp.me/ns#"\\n      xmlns:fb="https://www.facebook.com/2008/fbml">\\n<head>\\n        <title>Search Results for &quot;Super Mario Kart&quot; - IGN</title>\\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\\n<meta name="description" content="Product search results for &amp;quot;Super Mario Kart&amp;quot; on IGN" />\\n<meta name="robots" content="noodp, noydir" />\\n<meta name="copyright" content="IGN Entertainment, Inc." />\\n<link rel="canonical" href="https://ca.ign.com/search" />\\n<link rel="alternate" hreflang="en-au" href="https://au.ign.com/search" />\\n<link rel="alternate" hreflang="en-ca" href="https://ca.ign.com/search" />\\n<link rel="alternate" hreflang="en-ie" href="https://ie.ign.com/search" />\\n<link rel="alternate" hreflang="en-

In [4]:
# Use BeautifulSoup to parse the HTML and make the content more readable

soup = BeautifulSoup(page.content, 'html.parser')
print(soup.head.prettify()) # display the <head> tag
# print(soup.prettify()) # display entire webpage

<head>
 <title>
  Search Results for "Super Mario Kart" - IGN
 </title>
 <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
 <meta content="Product search results for &amp;quot;Super Mario Kart&amp;quot; on IGN" name="description"/>
 <meta content="noodp, noydir" name="robots"/>
 <meta content="IGN Entertainment, Inc." name="copyright"/>
 <link href="https://ca.ign.com/search" rel="canonical"/>
 <link href="https://au.ign.com/search" hreflang="en-au" rel="alternate"/>
 <link href="https://ca.ign.com/search" hreflang="en-ca" rel="alternate"/>
 <link href="https://ie.ign.com/search" hreflang="en-ie" rel="alternate"/>
 <link href="https://uk.ign.com/search" hreflang="en-gb" rel="alternate"/>
 <link href="https://www.ign.com/search" hreflang="en" rel="alternate"/>
 <link href="https://www.ign.com/search" hreflang="x-default" rel="alternate"/>
 <meta content="always" name="referrer"/>
 <meta content="0ebc575d017749f715cfbd45208159af" name="cpid">
  <meta content="https://o

In [5]:
# create a list with the web content separated into childs
listOne = list(soup.children)

# print type of items in the list
[type(item) for item in listOne]

[bs4.element.Doctype,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Comment]

In [6]:
# Looking which item contains the Review, apparently it's the third
# print(listOne[2])

# check the third item and assign it to a variable
soup = listOne[2]
listTwo = list(soup.children)

# print type of items in the list
[type(item) for item in listTwo]

[bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [7]:
# Repeating... look for review tag, get item to a list, list children, repeat
soup = listTwo[3]
myList = list(soup.children)

[type(item) for item in myList]

[bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [8]:
# Realize this is boring and should be automated...
# Use fnmatch to get the index of the item containing 'review-score'
import fnmatch

# for each item in the list, convert the item to a string, search the term and print the index
for idx, item in enumerate(myList):
    tagData = str(myList[idx])
    if fnmatch.fnmatch(tagData, '*review-score*'):
        print(idx)
        
# this is better but it's still taking too long

6


In [9]:
# define function to get the index just like before and return it
def getIndex(myList, search):
    for idx, item in enumerate(myList):
        tagData = str(myList[idx])
        if fnmatch.fnmatch(tagData, search):
            return idx
        
# define search term
search = '*review-score*'

# repeat the previous processes until our soup have only one value
while len(myList) > 1:
    soup = myList[getIndex(myList, search)]
    myList = list(soup.children) 

In [10]:
# get the string
text = str(myList)
print(text)

# format the number 
text = text[4:28:]
text.strip()

['\n            9.5          ']


'9.5'

In [11]:
# I don't want to keep searching for the games and copy-pasting urls, so I tried selenium for this process

import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

url = "https://ca.ign.com/search?q=Super%20Mario%20Kart&page=0&count=10&"

# Chromium driver -- http://chromedriver.chromium.org/
# from selenium.webdriver.common.keys import Keys
# driver = webdriver.Chrome(executable_path="chromedriver.exe")

# Gecko driver marionetting Mozilla Firefox -- https://github.com/mozilla/geckodriver/releases
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(capabilities=cap, executable_path="geckodriver.exe")

# Webpage takes too long to open, so we just load it for 8 seconds and stop it, this should be enough
# Define time and timeout
t = time.time()
driver.set_page_load_timeout(8)

# try loading the page, catch the timeout exception and stop loading
try:
    driver.get(url)
except TimeoutException:
    driver.execute_script("window.stop();")
print('Time consuming:', time.time() - t)

# get the search block, clear, input new search and hit Enter
elem = driver.find_element_by_id("query-input")
elem.clear()
elem.send_keys("Red Dead Redemption")
elem.send_keys(Keys.RETURN)
#search_button.click()

# get new url
print(driver.current_url)
# quit
driver.quit()

Time consuming: 8.00308609008789
https://ca.ign.com/search?q=Red%20Dead%20Redemption&page=0&count=10&


In [12]:
# Now let's try to pack up the knowledge and build the solution

In [13]:
import time
import fnmatch
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

t1 = time.time()

# SELENIUM
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(capabilities=cap, executable_path="geckodriver.exe")

# load .csv to pandas dataframe
dataFrame = pd.read_csv('Video_Games_Sales_2016.csv', encoding='utf-8')
gameTitles = dataFrame['Name']
urlList = []

# Search a term and return the resulting url
def getUrl(search):
    # get the search block, clear, input new search and hit Enter
    elem = driver.find_element_by_id("query-input")
    elem.clear()
    elem.send_keys(search)
    elem.send_keys(Keys.RETURN)
    time.sleep(1)
    # return search result url
    return driver.current_url

# Go to page, try loading it for 8 seconds and stop
def goTo(url):
    driver.set_page_load_timeout(8)
    try:
        driver.get(url)
    except TimeoutException:
        driver.execute_script("window.stop();")

# go to a webpage to start
goTo("https://ca.ign.com/search?q=Super%20Mario%20Kart&page=0&count=10&")

# search the games from the .csv and save the result url to a list
for item in gameTitles:
    urlList.append(str(getUrl(item)))
    
# quit
driver.quit()

# measure checkpoint 1
t2 = time.time()
print('time to get urls: ',t2 - t1)

#BEAUTIFUL SOUP

# get the content and return a list
def getContent(url):
    # Most websites refuse GET requests from python, so we change the header to pretend we're a browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}
    page = requests.get(url, headers = headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    return list(soup.children)

# Search a list and return the first matching index    
def getIndex(myList, search):
    for idx, item in enumerate(myList):
        tagData = str(myList[idx])
        if fnmatch.fnmatch(tagData, search):
            return idx

# refine by seeking a term until your list reach the defined limit of items
def lookup(myList, desiredSize):
    try:
        while len(myList) > desiredSize:
            soup = myList[getIndex(myList, '*review-score*')]
            myList = list(soup.children)
    except:
        myList = []
    return myList

# look for attribute href in the anchors and append the found link to the webpages list
def getAttr(anchors):
    attr = {'webpage':'','game':''}
    for a in anchors:
        if a.has_attr('href'):
            attr['webpage']=a['href']
            attr['game']=str(a['href']).split("/")[4]
            break
    return attr


# Define the lists where the results will be saved
scores = []
webpages = []
games = []

# Check every url of the list created with selenium
for url in urlList:
    # get content and refine list to 3 items
    time.sleep(1)
    myList = getContent(url)
    myList = lookup(myList, 3)
    # try getting the scores, if not found record as null 
    try:
        # get the item containing the href and list all the anchors
        soup = myList[getIndex(myList, '*https://ca.ign.com/games/*')]
        anchors = soup.findAll('a')
        # get the link and the game name from the anchor and append them to their lists
        attributes = getAttr(anchors)
        webpages.append(attributes['webpage'])
        games.append(attributes['game'])

        # get the review and format it properlly
        myList = lookup(myList, 1)    
        text = str(myList)[5:27:]
        scores.append(text.strip())  
    except:
        webpages.append(None)
        games.append(None)
        scores.append(None)
        
# print performance
t3 = time.time()
print('time to get scores: ',t3-t2)
print('total time: ',t3-t1)

# add all lists to dataframe and display headers
dataFrame['IGN_title'] = games
dataFrame['IGN_source'] = webpages
dataFrame['IGN_score'] = scores
dataFrame.head()

time to get urls:  120.33420062065125
time to get scores:  153.2741870880127
total time:  273.60838770866394


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,IGN_title,IGN_source,IGN_score
0,Wii Sports,Wii,2006,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E,wii-sports,https://ca.ign.com/games/wii-sports/wii-826987,7.5
1,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,,new-super-mario-bros,https://ca.ign.com/games/new-super-mario-bros/...,9.5
2,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E,mario-kart-wii,https://ca.ign.com/games/mario-kart-wii/wii-94...,8.5
3,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E,wii-sports-resort,https://ca.ign.com/games/wii-sports-resort/wii...,7.7
4,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,,pokemon-blue-version,https://ca.ign.com/games/pokemon-blue-version/...,10.0


In [14]:
# dataFrame[['Name','IGN_title','IGN_source']]

In [16]:
# The Data was collected properlly but if we take a proper look some of the records don't match
# On the second row for example Super Mario Bros for NES brought the score of New Super Mario Bros for DS
# So we need some cleaning



def remove_from_text(text, cList):
    for c in cList :
        cleanText = text.replace(c, '')
    return  cleanText

# remove spaces, dots and dashs and then compare them 
def check(row):
    chars = ['.',':','!']
    # check if it wasn't removed already
    if(row['IGN_title'] != None):
        # cleaning
        original = row['Name'].replace(' ','')
        original = original.replace('.','')
        original = remove_from_text(original, chars)
        original = original.lower()
        new = row['IGN_title'].replace('-','')
        # comparing
        if (original != new):
            print("values on index", row.name, "don't match")
            print(row['Name'], " differs from ", row['IGN_title'], '\n')

# executing        
newDF = dataFrame.apply(check, axis=1)

# Some games as wii play and pokemon are the same but don't have the same name
# We can check if the consoles match before listing as unmatched 
# maybe for a bigger dataset this solution might be a problem, but for this it would work

values on index 1 don't match
Super Mario Bros.  differs from  new-super-mario-bros 

values on index 4 don't match
Pokemon Red/Pokemon Blue  differs from  pokemon-blue-version 

values on index 5 don't match
Tetris  differs from  tetris-99 

values on index 7 don't match
Wii Play  differs from  wii-play-motion 

values on index 18 don't match
Super Mario World  differs from  super-mario-wii-u 

values on index 19 don't match
Brain Age: Train Your Brain in Minutes a Day  differs from  brain-age-train-your-brain-in-minutes-a-day 

values on index 20 don't match
Pokemon Diamond/Pokemon Pearl  differs from  pokemon-pearl-version 

values on index 22 don't match
Super Mario Bros. 3  differs from  new-super-mario-bros-u-deluxe 

values on index 24 don't match
Grand Theft Auto: Vice City  differs from  grand-theft-auto-vice-city-stories 

values on index 25 don't match
Pokemon Ruby/Pokemon Sapphire  differs from  pokemon-sapphire-version 

values on index 26 don't match
Brain Age 2: More Tra

In [17]:
# Applying and improving

# remove dots, replace dashs for spaces and make it lower case
def plainText(text):
    text = text.replace('.', r'').replace('-', r' ').lower()
    return text

# replace console names to match the original standards and apply lower case
def consoleName(text):
    text = text.replace('nds',r'ds').replace('nintendo',r'switch').lower()
    return text
    
# clean game titles and compare them, if they're different compare the console, if still different save index to a list
def clean(row, noMatch):
    # check if it wasn't removed already
    if(row['IGN_title'] != None):
        # clean names
        original = plainText(row['Name'])
        new = plainText(row['IGN_title'])
        # compare names
        if (original != new):
            console = row['IGN_source'].split('/')[5].split('-')[0]
            # compare console and remove unmatched values
            if (consoleName(console) != consoleName(row['Platform'])):
                noMatch.append(row.name)
                row['IGN_title'] = None
                row['IGN_source'] = None
                row['IGN_score'] = None

# list that'll contain the unmatched values
noMatch = []
# clean names,print unmatched indexes
dataFrame.apply(clean, args = (noMatch,), axis=1)
print(noMatch)

# remove values
for idx in noMatch:
    dataFrame.loc[idx,'IGN_title'] = None
    dataFrame.loc[idx,'IGN_source'] = None
    dataFrame.loc[idx,'IGN_score'] = None
    
dataFrame.to_csv('ign.csv', index=False)

[1, 5, 14, 18, 22, 24, 32, 34, 35, 36, 45, 55, 56, 60, 64, 69, 70, 75, 76, 84, 87, 92, 96, 98]
