# Web Scrape Video Games
> <a href=#imports>Import Libraries</a>        

> <a href=#webscrabing>Web Scraping</a>

> <a href=#merge>Merging Data</a>

> <a href=#EDA>Explratory Data Analysis (EDA)</a>



# <a name=imports> Import Libraries</a> 


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# <a name=webscrabing>Web Scraping</a>

In [2]:
url = "https://www.imdb.com/search/title/?title_type=video_game"

response = requests.get(url)
response.status_code
page = response.text
soup = BeautifulSoup(page, "lxml")

In [3]:
def clean_num(s):
    """cleans text scraped from website
    
    Parameters:
    s (String): raw text

    Returns:
    String: cleaned digit text

    """ 

    if s is None:
        return s
    return "".join(re.findall('\d*\.?\d+',s))

def get_data(block, tag, class_text):
    """fetch data from tag and its associated class
    
    Parameters:
    block (int): Book item 
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: data text fetched

    """ 
    try:
        return block.find(tag, class_ = class_text).text.replace('(','').replace(')','')
    except AttributeError:
        return ""
    return

def get_data_a_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).a.text
    except AttributeError:
        return ""
    return
def get_data_strong_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).strong.text
    except AttributeError:
        return ""
    return
def get_data_descr(i, tag, class_text):    
    try:
        return i.find_all(tag, class_ = class_text)[1].text.strip()
    except AttributeError:
        return ""
    return


def get_data_votes(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).text.split()[1].replace(',','')
    except AttributeError:
        return ""
    return

In [4]:
    
Vgame_list = []

for i in (soup.find_all('div','lister-item mode-advanced')):
    Name = get_data_a_tag(i, "h3","lister-item-header")
    year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
    Gtype = get_data(i, "p", "text-muted").strip()
    rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
    votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
    headers = ['Name', 'year', 'game_type', 'rate','votes']
    Vgame_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
    Vgame_list.append(Vgame_dict)   

In [5]:
vedioGame = pd.DataFrame(Vgame_list) #transpose
vedioGame

Unnamed: 0,Name,year,game_type,rate,votes
0,Marvel's Guardians of the Galaxy,2021,"Action, Adventure, Comedy",9.1,925.0
1,Grand Theft Auto: San Andreas,2004,"Action, Crime",9.4,37733.0
2,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28865.0
3,Call of Duty: Vanguard,2021,"Action, Adventure, History",6.4,496.0
4,Halo Infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
5,Grand Theft Auto V,2013,"Action, Crime, Drama",9.5,55637.0
6,Far Cry 6,2021,"Action, Adventure",8.1,812.0
7,Death Stranding,2019,"Action, Adventure, Drama",8.9,6982.0
8,The Last of Us: Part II,2020,"banned\n|\n\nAction, Adventure, Drama",8.3,26167.0
9,Ghostbusters,2009,"Action, Adventure, Comedy",8.5,2870.0


In [6]:
def get_movie_dict(link):

    #Create full url to scrape
    url =link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
  
    
    for i in (soup.find_all('div','lister-item mode-advanced')):        
        Name = get_data_a_tag(i, "h3","lister-item-header")
        year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
        Gtype = get_data(i, "p", "text-muted").strip()
        rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
        votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
        headers = ['Name', 'year', 'game_type', 'rate','votes']
        #Create game vedio dictionary and return
        movie_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
        Vgame_list.append(movie_dict) 

    return movie_dict

In [7]:
get_movie_dict("https://www.imdb.com/search/title/?title_type=video_game&start=51&ref_=adv_nxt")

{'Name': 'Teiruzu obu Araizu',
 'year': '2021',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '8.8',
 'votes': '146'}

In [8]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=101&ref_=adv_nxt')

{'Name': "Assassin's Creed II",
 'year': '2009',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '9.1',
 'votes': '21858'}

In [9]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=151&ref_=adv_nxt')

{'Name': 'Lost Judgment',
 'year': '2021',
 'game_type': 'Action, Adventure, Crime',
 'rate': '8.8',
 'votes': '92'}

In [10]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=201&ref_=adv_nxt')

{'Name': 'Devil May Cry 5',
 'year': '2019',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '8.6',
 'votes': '1725'}

In [11]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=251&ref_=adv_nxt')

{'Name': 'Outlast 2',
 'year': '2017',
 'game_type': 'Horror, Mystery, Thriller',
 'rate': '7.6',
 'votes': '1940'}

In [12]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=301&ref_=adv_nxt')

{'Name': 'Faiâ emuburemu: Fuuka setsugetsu',
 'year': '2019',
 'game_type': 'Adventure, Drama, Fantasy',
 'rate': '8.9',
 'votes': '541'}

In [13]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=351&ref_=adv_nxt')

{'Name': 'NBA 2K21',
 'year': '2020',
 'game_type': 'Sport',
 'rate': '4.5',
 'votes': '292'}

In [14]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=401&ref_=adv_nxt')

{'Name': 'Ultimate Spider-Man',
 'year': '2005',
 'game_type': 'Action, Adventure, Crime',
 'rate': '7.7',
 'votes': '1251'}

In [15]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=451&ref_=adv_nxt')

{'Name': 'Sally Face',
 'year': '2016',
 'game_type': 'Adventure, Drama, Horror',
 'rate': '8.5',
 'votes': '52'}

In [16]:
vedioGamePages = pd.DataFrame(Vgame_list) #transpose
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,Marvel's Guardians of the Galaxy,2021,"Action, Adventure, Comedy",9.1,925
1,Grand Theft Auto: San Andreas,2004,"Action, Crime",9.4,37733
2,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28865
3,Call of Duty: Vanguard,2021,"Action, Adventure, History",6.4,496
4,Halo Infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
...,...,...,...,...,...
495,Alice: Madness Returns,2011,"Action, Adventure, Fantasy",8.3,1219
496,GreedFall,2019,"Action, Adventure, Fantasy",7.1,384
497,Guardians of the Galaxy: The Telltale Series,2017,"Action, Adventure, Comedy",7.8,789
498,Dungeons & Dragons: Dark Alliance,2021,"Action, Adventure, Fantasy",4.4,38


In [17]:
vedioGamePages.to_csv('videogame_webscraping.csv')

In [18]:
vedioGamePages.Name=vedioGamePages.Name.str.lower()
vedioGamePages.head()

Unnamed: 0,Name,year,game_type,rate,votes
0,marvel's guardians of the galaxy,2021,"Action, Adventure, Comedy",9.1,925.0
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733.0
2,red dead redemption ii,2018,"Action, Adventure, Crime",9.7,28865.0
3,call of duty: vanguard,2021,"Action, Adventure, History",6.4,496.0
4,halo infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,


In [19]:
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,marvel's guardians of the galaxy,2021,"Action, Adventure, Comedy",9.1,925
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733
2,red dead redemption ii,2018,"Action, Adventure, Crime",9.7,28865
3,call of duty: vanguard,2021,"Action, Adventure, History",6.4,496
4,halo infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
...,...,...,...,...,...
495,alice: madness returns,2011,"Action, Adventure, Fantasy",8.3,1219
496,greedfall,2019,"Action, Adventure, Fantasy",7.1,384
497,guardians of the galaxy: the telltale series,2017,"Action, Adventure, Comedy",7.8,789
498,dungeons & dragons: dark alliance,2021,"Action, Adventure, Fantasy",4.4,38


In [20]:
df=pd.read_csv('vgsales.csv') #data from kaggle 

In [21]:
df.Name=df.Name.str.lower()
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,wii sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,super mario bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,mario kart wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,wii sports resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,pokemon red/pokemon blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [23]:
df.drop([ 'Year', 'Genre','Publisher'], axis=1, inplace=True)

# <a name=merge>Merging Data</a> 

In [24]:
mer_vidgame=vedioGamePages.merge(df, on='Name')

In [25]:
mer_vidgame.head()

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,18,PS2,9.43,0.4,0.41,10.57,20.81
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,875,XB,1.26,0.61,0.0,0.09,1.95
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,2122,PC,0.0,0.92,0.0,0.05,0.98
3,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,9829,X360,0.08,0.03,0.0,0.01,0.12
4,grand theft auto v,2013,"Action, Crime, Drama",9.5,55637,17,PS3,7.01,9.27,0.97,4.14,21.4


In [26]:
df1=pd.DataFrame(mer_vidgame)

# <a name=EDA>Explratory data analysis(EDA)</a>

In [27]:
df1.shape

(658, 12)

In [28]:
df1['game_type'].tolist()

['Action, Crime',
 'Action, Crime',
 'Action, Crime',
 'Action, Crime',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Comedy',
 'Action, Adventure, Comedy',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Drama',
 'Action, Adventure, Drama',
 'Horror, Mystery, Thriller',
 'Action, Adventure, Drama',
 'banned\n|\n\nAction, Adventure, Fantasy',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Crime',
 'Action, Ad

In [29]:
df1['game_type']=df1['game_type'].str.replace('banned\n|\n\n','')
df1['game_type']=df1['game_type'].str.replace('\n|\nAnnounced','')
df1['game_type']=df1['game_type'].str.replace('|','')
df1['game_type']=df1['game_type'].str.replace('Announced','')
df1['game_type']=df1['game_type'].str.replace('130 min\n|\n\n','')

  df1['game_type']=df1['game_type'].str.replace('banned\n|\n\n','')
  df1['game_type']=df1['game_type'].str.replace('\n|\nAnnounced','')
  df1['game_type']=df1['game_type'].str.replace('|','')
  df1['game_type']=df1['game_type'].str.replace('130 min\n|\n\n','')


In [32]:
df1

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,18,PS2,9.43,0.40,0.41,10.57,20.81
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,875,XB,1.26,0.61,0.00,0.09,1.95
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,2122,PC,0.00,0.92,0.00,0.05,0.98
3,grand theft auto: san andreas,2004,"Action, Crime",9.4,37733,9829,X360,0.08,0.03,0.00,0.01,0.12
4,grand theft auto v,2013,"Action, Crime, Drama",9.5,55637,17,PS3,7.01,9.27,0.97,4.14,21.40
...,...,...,...,...,...,...,...,...,...,...,...,...
653,prince of persia: warrior within,2004,"Action, Adventure, Fantasy",8.6,3684,7718,GC,0.15,0.04,0.00,0.01,0.20
654,ratchet & clank,2016,"Action, Adventure, Comedy",8.1,2177,405,PS2,1.44,1.01,0.57,0.30,3.33
655,alice: madness returns,2011,"Action, Adventure, Fantasy",8.3,1219,3352,PS3,0.22,0.25,0.04,0.09,0.60
656,alice: madness returns,2011,"Action, Adventure, Fantasy",8.3,1219,3956,X360,0.27,0.15,0.04,0.04,0.50


In [33]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 658 entries, 0 to 657
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          658 non-null    object 
 1   year          658 non-null    object 
 2   game_type     658 non-null    object 
 3   rate          658 non-null    object 
 4   votes         658 non-null    object 
 5   Rank          658 non-null    int64  
 6   Platform      658 non-null    object 
 7   NA_Sales      658 non-null    float64
 8   EU_Sales      658 non-null    float64
 9   JP_Sales      658 non-null    float64
 10  Other_Sales   658 non-null    float64
 11  Global_Sales  658 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 66.8+ KB


In [34]:
df1['year'] = pd.to_numeric(df1['year'], errors='coerce')

In [35]:
df1['rate'] = pd.to_numeric(df1['rate'], errors='coerce')

In [36]:
df1['votes'] = pd.to_numeric(df1['votes'], errors='coerce')

In [37]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 658 entries, 0 to 657
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          658 non-null    object 
 1   year          652 non-null    float64
 2   game_type     658 non-null    object 
 3   rate          652 non-null    float64
 4   votes         652 non-null    float64
 5   Rank          658 non-null    int64  
 6   Platform      658 non-null    object 
 7   NA_Sales      658 non-null    float64
 8   EU_Sales      658 non-null    float64
 9   JP_Sales      658 non-null    float64
 10  Other_Sales   658 non-null    float64
 11  Global_Sales  658 non-null    float64
dtypes: float64(8), int64(1), object(3)
memory usage: 66.8+ KB


In [38]:
df1.describe()

Unnamed: 0,year,rate,votes,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,652.0,652.0,652.0,658.0,658.0,658.0,658.0,658.0,658.0
mean,2009.91411,8.150613,7766.173313,3775.724924,1.044635,0.657979,0.06848,0.236429,2.007933
std,4.941935,0.895928,8413.975509,3914.957989,1.885704,0.975303,0.319265,0.543974,3.203586
min,1985.0,3.1,76.0,2.0,0.0,0.0,0.0,0.0,0.01
25%,2007.0,7.8,2340.0,688.25,0.1325,0.09,0.0,0.0225,0.32
50%,2011.0,8.2,5230.0,2355.0,0.43,0.295,0.0,0.085,0.885
75%,2013.0,8.7,9715.0,5605.25,1.1775,0.8175,0.04,0.25,2.3075
max,2023.0,9.7,55838.0,16375.0,29.08,9.27,6.81,10.57,40.24


In [40]:
df1

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37733.0,18,PS2,9.43,0.40,0.41,10.57,20.81
1,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37733.0,875,XB,1.26,0.61,0.00,0.09,1.95
2,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37733.0,2122,PC,0.00,0.92,0.00,0.05,0.98
3,grand theft auto: san andreas,2004.0,"Action, Crime",9.4,37733.0,9829,X360,0.08,0.03,0.00,0.01,0.12
4,grand theft auto v,2013.0,"Action, Crime, Drama",9.5,55637.0,17,PS3,7.01,9.27,0.97,4.14,21.40
...,...,...,...,...,...,...,...,...,...,...,...,...
653,prince of persia: warrior within,2004.0,"Action, Adventure, Fantasy",8.6,3684.0,7718,GC,0.15,0.04,0.00,0.01,0.20
654,ratchet & clank,2016.0,"Action, Adventure, Comedy",8.1,2177.0,405,PS2,1.44,1.01,0.57,0.30,3.33
655,alice: madness returns,2011.0,"Action, Adventure, Fantasy",8.3,1219.0,3352,PS3,0.22,0.25,0.04,0.09,0.60
656,alice: madness returns,2011.0,"Action, Adventure, Fantasy",8.3,1219.0,3956,X360,0.27,0.15,0.04,0.04,0.50
