# Web scraping the Wikipedia page for Nintendo games

In [130]:
import numpy as np
import pandas as pd
import scipy as sp
import requests
import sqlite3
import lxml.html
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
py.offline.init_notebook_mode(connected=True)

In [131]:
# Download the Wikipedia page. The timeout stops the cell from hanging forever
# on a stalled connection, and raise_for_status() fails right here on an HTTP
# error instead of letting an error page reach the parser further down.
r = requests.get(
    "https://en.wikipedia.org/wiki/List_of_Nintendo_Entertainment_System_games",
    timeout=30,
)
r.raise_for_status()

In [132]:
# Show the response object; its repr reports the HTTP status code.
r

<Response [200]>

#We can get the entire HTML source code by running r.text

In [133]:
# r.text holds the entire HTML source of the page; show only the first 500
# characters so the notebook output stays readable.
r.text[:500]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of Nintendo Entertainment System games - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_Nintendo_Entertainment_System_games","wgTitle":"List of Nintendo Entertainment System games","wgCurRevisionId":871852078,"wgRevisionId":871852078,"wgArticleId":142577,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: BOT: original-url status unknown","Nintendo-related lists","Video game lists by platform","Nintendo Entertainment System games"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSep

#HTML code has a hierarchy of parent and child elements. We parse the page into a new variable, sample, so that we can walk down to the rows of the table

In [134]:
# Parse the downloaded HTML text into an lxml element.
sample = lxml.html.fromstring(r.text)

In [135]:
# NOTE(review): prints the complete help text of the parsed element's class,
# which is very long — consider clearing this output before sharing the notebook.
help(sample)

Help on HtmlElement in module lxml.html object:

class HtmlElement(lxml.etree.ElementBase, HtmlMixin)
 |  ElementBase(*children, attrib=None, nsmap=None, **_extra)
 |  
 |  The public Element class.  All custom Element classes must inherit
 |  from this one.  To create an Element, use the `Element()` factory.
 |  
 |  __new__ as it is absolutely undefined when these objects will be
 |  created or destroyed.  All persistent state of Elements must be
 |  stored in the underlying XML.  If you really need to initialize
 |  the object after creation, you can implement an ``_init(self)``
 |  method that will be called directly after object creation.
 |  
 |  Subclasses of this class can be instantiated to create a new
 |  Element.  By default, the tag name will be the class name and the
 |  namespace will be empty.  You can modify this with the following
 |  class attributes:
 |  
 |  * TAG - the tag name, possibly containing a namespace in Clark
 |    notation
 |  
 |  * NAMESPACE - the def

In [136]:
# Show the element returned by the parser.
sample

<Element html at 0x1ff9a4b4638>

#On the web page, the table we are looking for has the id "softwarelist"

In [137]:
# Look up the element whose id attribute is "softwarelist".
software_list_table = sample.get_element_by_id("softwarelist")

#Let's check what is in software_list_table - it is a table element.

In [138]:
# Show the element that was found by the id lookup.
software_list_table

<Element table at 0x1ff9a4b69f8>

#Let's get the children (contents) of the table. The only child is the table body (<tbody>), and the rows of the table are the children of that table body. To reach the rows we therefore take the first child of the table and assign it to table_body.

In [139]:
# List the direct children of the table element.
list(software_list_table)

[<Element tbody at 0x1ff9a4b6bd8>]

In [140]:
# Keep the table's first child element for the row-level work that follows.
table_body = software_list_table[0]

In [141]:
# Count the child elements of table_body.
len(table_body)

714

There are 714 rows in total in the table. Looking at the table on the website, the first 2 are header rows, so 712 items are listed.
Note: The Wikipedia page says - There are a total of 712 known licensed game titles, of which 677 were released in North America. 35 were exclusively released outside North America, fully listed below. This is different from the 714 that we saw in the videos. Has the page been updated since?

#Now we can pick the first row and get its text content - that shows the text in the first row. The text relates to the 4 columns: Title, Release date, Publisher and Developer

In [142]:
# Text content of the first row of the table body.
table_body[0].text_content()

'\nTitle[3][4]\n\nRelease date\n\nPublisher(s)[3][4][5]\n\nDeveloper(s)\n'

#Now we can pick the first row and get its children - that shows the 4 columns in the first row. These are table headers, and therefore the result shows a 'th' tag for each.


In [143]:
# Child elements (cells) of the first row.
list(table_body[0])

[<Element th at 0x1ff9a4c9cc8>,
 <Element th at 0x1ff9a4c9d18>,
 <Element th at 0x1ff9a4c9d68>,
 <Element th at 0x1ff9a4c9db8>]

On the website, the second row has 2 columns (NA and PAL). Let's check the second row and get its children - that shows the 2 columns. These are also table headers, and therefore the result shows a 'th' tag for each.

In [144]:
# Text content of the second row of the table body.
table_body[1].text_content()

'\nNA[3][4][5]\n\nPAL[6]\n'

In [145]:
# Child elements (cells) of the second row.
list(table_body[1])

[<Element th at 0x1ff9a4c99f8>, <Element th at 0x1ff9a4c9f48>]

The actual content starts in the 3rd row. Let's check.

In [146]:
# Text content of the third row of the table body.
table_body[2].text_content()

'\n10-Yard Fight\nOctober 18, 1985\nDecember 6, 1986\nNintendo\nIrem\n'

In [147]:
# Child elements (cells) of the third row.
list(table_body[2])

[<Element td at 0x1ff9a4c9408>,
 <Element td at 0x1ff9a4c9ef8>,
 <Element td at 0x1ff9a4c9098>,
 <Element td at 0x1ff9a4c9458>,
 <Element td at 0x1ff9a4c94a8>]

By reading the text content of the 3rd row (as above) we can see all the text in the row, but it also shows all the newline characters. To see the data clearly, we can print the data in each cell as below:

In [148]:
# Print the text of the five cells of the third row, one cell per line.
first_game_cells = table_body.getchildren()[2].getchildren()
for position in range(5):
    print(first_game_cells[position].text_content())

10-Yard Fight
October 18, 1985
December 6, 1986
Nintendo
Irem



At this point we have scraped the table data from the Wikipedia website and it is ready to be parsed using pandas.
Let's define a pandas DataFrame from a dictionary whose keys correspond to the table headers and whose values are empty lists to be filled in later while looping over the rows. The dictionary will have 5 keys: title, release_date_NA, release_date_PAL, publishers, developers.

In [149]:
# One empty list per output column; the row loops append to these lists.
data = {
    column: []
    for column in (
        "title",
        "release_date_NA",
        "release_date_PAL",
        "publishers",
        "developers",
    )
}

Now that the dictionary is set up, we loop over each row in table_body, starting from row 3 (index 2), with a for loop and append each cell's text to the corresponding list. strip() removes the surrounding newline characters. Try with the first few rows first: the slice [2:10] covers rows 3 to 10, i.e. 8 games.

In [150]:
# For each row in the slice, append the stripped text of cells 0-4 to the
# matching column list, in column order.
for row in table_body.getchildren()[2:10]:
    cells = row.getchildren()
    for position, column in enumerate(
        ("title", "release_date_NA", "release_date_PAL", "publishers", "developers")
    ):
        data[column].append(cells[position].text_content().strip())

In [151]:
# Build a DataFrame with one column per dictionary key.
df = pd.DataFrame.from_dict(data)

In [152]:
# Display the DataFrame built from the rows collected so far.
df

Unnamed: 0,title,release_date_NA,release_date_PAL,publishers,developers
0,10-Yard Fight,"October 18, 1985","December 6, 1986",Nintendo,Irem
1,1942,November 1986,Unreleased,Capcom,Micronics
2,1943: The Battle of Midway,October 1988,Unreleased,Capcom,Capcom
3,The 3-D Battles of WorldRunner,September 1987,Unreleased,Acclaim Entertainment,Square
4,720°,November 1989,Unreleased,Mindscape,Tengen
5,8 Eyes,January 1990,Unreleased,Taxan,Thinking Rabbit
6,Abadox,March 1990,Unreleased,Milton Bradley Company,Natsume
7,The Addams Family,January 1992,December 1992,Ocean Software,Ocean Software


Now extend the code to all data / rows in the table

In [153]:
# Demonstration of the failure: reading the fifth cell (index 4) of every row
# raises IndexError on the first row that has fewer than five cells. The error
# is caught and printed so the notebook still runs top to bottom.
# Note that `data` is left partially filled here; it is not usable until it is
# re-created with empty lists.
try:
    for row in table_body.getchildren()[2:]:
        cells = row.getchildren()
        data["title"].append(cells[0].text_content().strip())
        data["release_date_NA"].append(cells[1].text_content().strip())
        data["release_date_PAL"].append(cells[2].text_content().strip())
        data["publishers"].append(cells[3].text_content().strip())
        data["developers"].append(cells[4].text_content().strip())
except IndexError as err:
    print("IndexError:", err)

IndexError: list index out of range

This is showing errors. The error is in the Developers column (the 5th and last column): the index is out of range, so index 4 is incorrect for some rows. This can happen if there are fewer than 5 items in some rows. We can handle this with a logical if-else statement for the 5th column.
Note: If we want to add the if-else statement, the entire code needs to be copied and pasted, including creating the dictionary with empty lists. We cannot reuse the dictionary created earlier, because the earlier loops have already appended rows to it. Errors will occur if the dictionary with empty lists is not created again.

In [156]:
# Rebuild the column lists from scratch so that rows appended by earlier cells
# are not carried over.
columns = ("title", "release_date_NA", "release_date_PAL", "publishers", "developers")
data = {column: [] for column in columns}

# Skip the two header rows and collect one value per column for every game row.
for row in table_body.getchildren()[2:]:
    cells = row.getchildren()  # fetch the row's cells once instead of per column
    for position, column in enumerate(columns):
        if position < len(cells):
            data[column].append(cells[position].text_content().strip())
        else:
            # Some rows have fewer than five cells. Record a real missing
            # value rather than a placeholder string, so it cannot be
            # mistaken for a developer name later on.
            data[column].append(None)

In [157]:
# Build the full DataFrame with one column per dictionary key.
df = pd.DataFrame.from_dict(data)

In [158]:
# Preview the first 10 rows instead of dumping the whole DataFrame.
df.head(10)

Unnamed: 0,title,release_date_NA,release_date_PAL,publishers,developers
0,10-Yard Fight,"October 18, 1985","December 6, 1986",Nintendo,Irem
1,1942,November 1986,Unreleased,Capcom,Micronics
2,1943: The Battle of Midway,October 1988,Unreleased,Capcom,Capcom
3,The 3-D Battles of WorldRunner,September 1987,Unreleased,Acclaim Entertainment,Square
4,720°,November 1989,Unreleased,Mindscape,Tengen
5,8 Eyes,January 1990,Unreleased,Taxan,Thinking Rabbit
6,Abadox,March 1990,Unreleased,Milton Bradley Company,Natsume
7,The Addams Family,January 1992,December 1992,Ocean Software,Ocean Software
8,The Addams Family: Pugsley's Scavenger Hunt,August 1993,1992,Ocean Software,Ocean Software
9,Advanced Dungeons & Dragons: DragonStrike,July 1992,Unreleased,FCI,Westwood Studios


In [159]:
# Number of rows in the DataFrame.
len(df)

712

This data frame (df) has 712 rows and is now ready for further exploration using pandas or to be loaded into the sqlite3 database.

In [165]:
# Keep only the games whose publisher is exactly "Konami" or "Capcom".
is_konami_or_capcom = df["publishers"].isin(["Konami", "Capcom"])
trial = df.loc[is_konami_or_capcom]

In [166]:
# Preview the first 10 matching rows instead of dumping the whole subset.
trial.head(10)

Unnamed: 0,title,release_date_NA,release_date_PAL,publishers,developers
1,1942,November 1986,Unreleased,Capcom,Micronics
2,1943: The Battle of Midway,October 1988,Unreleased,Capcom,Capcom
16,Adventures in the Magic Kingdom,June 1990,"December 10, 1992",Capcom,Capcom
17,The Adventures of Bayou Billy,June 1989,"January 24, 1991",Konami,Konami
68,Batman Returns,January 1993,1993,Konami,Konami
80,Bill Elliott's NASCAR Challenge,December 1991,Unreleased,Konami,Distinctive Software
81,Bionic Commando,December 1988,"October 26, 1990",Capcom,Capcom
107,Capcom's Gold Medal Challenge '92,August 1992,"June 17, 1993",Capcom,Make Software
116,Castlevania,May 1987,"December 19, 1988",Konami,Konami
117,Castlevania II: Simon's Quest,December 1988,"April 27, 1990",Konami,Konami


In [167]:
# Number of rows in the filtered DataFrame.
len(trial)

63

Let us load the new data frame trial into the sqlite3 database.

The database can either be stored as a file on the computer by giving a file name, or be kept in the process memory, where it is available only as long as the Notebook kernel is running. The in-memory option is temporary and requires a special syntax: db = sqlite3.connect(":memory:").
In this example, we will save the database on the computer.

In [168]:
# Connect to the database file 'webscraping.db' (a relative path, so it is
# resolved against the kernel's working directory) and get a cursor for
# executing SQL statements.
conn = sqlite3.connect('webscraping.db')
c = conn.cursor()

Let's create a new table called Example in the new database called webscraping.db

In [176]:
# The database lives in a file, so the table survives kernel restarts. Drop any
# previous copy first so this cell can be re-run without a "table Example
# already exists" error and without keeping rows from an earlier run.
c.execute("DROP TABLE IF EXISTS Example")
c.execute("""
    CREATE TABLE Example(Title, NA, PAL, Publishers, Developers)
""")

<sqlite3.Cursor at 0x1ff9b5e1f10>

In [177]:
# The INSERT statement is identical for every row, so build it once and let
# executemany() bind each DataFrame row to the five placeholders.
insert_table = """
    INSERT INTO Example(Title, NA, PAL, Publishers, Developers)
    VALUES (?,?,?,?,?)
"""
# index=False leaves out the DataFrame index; name=None yields plain tuples
# holding just the five column values.
c.executemany(insert_table, trial.itertuples(index=False, name=None))
conn.commit()

A new database called webscraping.db has been created, with a new table called Example holding 63 rows of data.
We can now retrieve data from the new database.
Let's query for all games from the developer called Micronics

In [178]:
# Bind the developer name as a query parameter. In SQL, double quotes denote
# identifiers (column names), so a string value should not be written as
# "Micronics" inside the statement.
micronics_query = """
    SELECT *
    FROM Example
    WHERE Developers = ?
"""
for row in c.execute(micronics_query, ("Micronics",)):
    print(row)

('1942', 'November 1986', 'Unreleased', 'Capcom', 'Micronics')
("Ghosts 'n Goblins", 'November 1986', 'March 23, 1989', 'Capcom', 'Micronics')


Let's query for all games whose publisher is Konami

In [181]:
# Bind the publisher name as a query parameter. In SQL, double quotes denote
# identifiers (column names), so a string value should not be written as
# "Konami" inside the statement.
konami_query = """
    SELECT *
    FROM Example
    WHERE Publishers = ?
"""
for row in c.execute(konami_query, ("Konami",)):
    print(row)

('The Adventures of Bayou Billy', 'June 1989', 'January 24, 1991', 'Konami', 'Konami')
('Batman Returns', 'January 1993', '1993', 'Konami', 'Konami')
("Bill Elliott's NASCAR Challenge", 'December 1991', 'Unreleased', 'Konami', 'Distinctive Software')
('Castlevania', 'May 1987', 'December 19, 1988', 'Konami', 'Konami')
("Castlevania II: Simon's Quest", 'December 1988', 'April 27, 1990', 'Konami', 'Konami')
('Contra\xa0(NA)Probotector\xa0(EU)', 'February 1988', 'December 28, 1990', 'Konami', 'Konami')
('Contra Force', 'September 1992', 'Unreleased', 'Konami', 'Konami')
('Double Dribble', 'September 1987', '1988', 'Konami', 'Konami')
('The Goonies II', 'November 1987', 'December 19, 1988', 'Konami', 'Konami')
('Gradius', 'December 1986', 'November 30, 1988', 'Konami', 'Konami')
("Jack Nicklaus' Greatest 18 Holes of Major Championship Golf", 'March 1990', 'June 27, 1991', 'Konami', 'Sculptured Software')
('Jackal', 'September 1988', 'January 1, 1989', 'Konami', 'Konami')
("King's Quest V: 

Let's make the data more readable by printing each field of every row on its own line, using a for loop

In [184]:
# Fetch the Konami games again, binding the publisher name as a parameter
# (double quotes in SQL denote identifiers, not string values).
# NOTE(review): `data` now holds the query cursor, replacing the dictionary of
# column lists that was built earlier under the same name.
data = c.execute(
    """SELECT *
       FROM Example
       WHERE Publishers = ?
    """,
    ("Konami",),
)
# Print one labelled field per line, with a blank line after each game.
for row in data:
    print("title = ", row[0])
    print("NA = ", row[1])
    print("PAL = ", row[2])
    print("publishers = ", row[3])
    print("developers= ", row[4], "\n")
# Close the connection; `c` and `conn` cannot be used for queries after this.
conn.close()

title =  The Adventures of Bayou Billy
NA =  June 1989
PAL =  January 24, 1991
publishers =  Konami
developers=  Konami 

title =  Batman Returns
NA =  January 1993
PAL =  1993
publishers =  Konami
developers=  Konami 

title =  Bill Elliott's NASCAR Challenge
NA =  December 1991
PAL =  Unreleased
publishers =  Konami
developers=  Distinctive Software 

title =  Castlevania
NA =  May 1987
PAL =  December 19, 1988
publishers =  Konami
developers=  Konami 

title =  Castlevania II: Simon's Quest
NA =  December 1988
PAL =  April 27, 1990
publishers =  Konami
developers=  Konami 

title =  Contra (NA)Probotector (EU)
NA =  February 1988
PAL =  December 28, 1990
publishers =  Konami
developers=  Konami 

title =  Contra Force
NA =  September 1992
PAL =  Unreleased
publishers =  Konami
developers=  Konami 

title =  Double Dribble
NA =  September 1987
PAL =  1988
publishers =  Konami
developers=  Konami 

title =  The Goonies II
NA =  November 1987
PAL =  December 19, 1988
publishers =  Kona