In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
url = "https://www.imdb.com/search/title/?title_type=video_game&num_votes=,5000,&sort=user_rating,desc&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=87cca6a7-a16d-42d9-b9de-6aace99ec40a&pf_rd_r=ERFX36S273PQKZHMN3NF&pf_rd_s=center-6&pf_rd_t=60601&pf_rd_i=video-games&ref_=fea_vg_scg_ats_toprated_hd"

response = requests.get(url)
response.status_code
page = response.text
soup = BeautifulSoup(page, "lxml")

In [3]:
def clean_num(s):
    """cleans text scraped from website
    
    Parameters:
    s (String): raw text

    Returns:
    String: cleaned digit text

    """ 

    if s is None:
        return s
    return "".join(re.findall('\d*\.?\d+',s))

def get_data(block, tag, class_text):
    """fetch data from tag and its associated class
    
    Parameters:
    block (int): Book item 
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: data text fetched

    """ 
    try:
        return block.find(tag, class_ = class_text).text.replace('(','').replace(')','')
    except AttributeError:
        return ""
    return

def get_data_a_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).a.text
    except AttributeError:
        return ""
    return
def get_data_strong_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).strong.text
    except AttributeError:
        return ""
    return
def get_data_descr(i, tag, class_text):    
    try:
        return i.find_all(tag, class_ = class_text)[1].text.strip()
    except AttributeError:
        return ""
    return


def get_data_votes(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).text.split()[1].replace(',','')
    except AttributeError:
        return ""
    return

In [25]:
    
Vgame_list = []

for i in (soup.find_all('div','lister-item mode-advanced')):
    Name = get_data_a_tag(i, "h3","lister-item-header")
    year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
    Gtype = get_data(i, "p", "text-muted").strip()
    rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
    votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
    headers = ['Name', 'year', 'game_type', 'rate','votes']
    Vgame_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
    Vgame_list.append(Vgame_dict)    

In [26]:
vedioGame = pd.DataFrame(Vgame_list) #transpose
vedioGame

Unnamed: 0,Name,year,game_type,rate,votes
0,Wiedzmin 3: Dziki Gon Krew i wino,2016,"Action, Adventure, Drama",9.8,6822
1,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28770
2,Wiedzmin 3: Dziki Gon,2015,"banned\n|\n\nAction, Adventure, Drama",9.7,23058
3,The Last of Us,2013,"Action, Adventure, Drama",9.7,55759
4,Mass Effect: Legendary Edition,2021,Sci-Fi,9.7,725
5,God of War,2018,"Action, Adventure, Drama",9.6,21348
6,Perusona 5: Za roiyaru,2019,"Action, Adventure, Fantasy",9.6,801
7,Zeruda no densetsu: Toki no okarina,1998,"Action, Adventure, Fantasy",9.6,9007
8,Metal Gear Solid,1998,"Action, Adventure, Drama",9.6,11332
9,Wiedzmin 3: Dziki Gon - Serca z kamienia,2015,"Action, Adventure, Drama",9.5,5688


In [33]:
def get_movie_dict(link):

    #Create full url to scrape
    url =link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
  
    
    for i in (soup.find_all('div','lister-item mode-advanced')):        
        Name = get_data_a_tag(i, "h3","lister-item-header")
        year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
        Gtype = get_data(i, "p", "text-muted").strip()
        rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
        votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
        headers = ['Name', 'year', 'game_type', 'rate','votes']
        #Create game vedio dictionary and return
        movie_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
        Vgame_list.append(movie_dict) 

    return movie_dict

In [34]:
get_movie_dict("https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc&start=101&ref_=adv_nxt")

{'Name': 'Ratchet & Clank: Rift Apart',
 'year': '2021',
 'game_type': 'Action, Adventure, Comedy',
 'rate': '9.0',
 'votes': '797'}

In [36]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc&start=151&ref_=adv_nxt')

{'Name': 'Super Mario Galaxy 2',
 'year': '2010',
 'game_type': 'Action, Adventure, Family',
 'rate': '8.9',
 'votes': '2249'}

In [40]:
vedioGamePages = pd.DataFrame(Vgame_list) #transpose
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,Wiedzmin 3: Dziki Gon Krew i wino,2016,"Action, Adventure, Drama",9.8,6822
1,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28770
2,Wiedzmin 3: Dziki Gon,2015,"banned\n|\n\nAction, Adventure, Drama",9.7,23058
3,The Last of Us,2013,"Action, Adventure, Drama",9.7,55759
4,Mass Effect: Legendary Edition,2021,Sci-Fi,9.7,725
...,...,...,...,...,...
195,Kyokugen dasshutsu: adobencha zennin shibo desu,2012,"Adventure, Mystery, Sci-Fi",8.9,217
196,Far Cry 3,2012,"Action, Adventure",8.9,17541
197,South Park: The Stick of Truth,2014,"banned\n|\n\nAction, Adventure, Comedy",8.9,4389
198,Wiedzmin 2: Zabójcy królów,2011,"Action, Adventure, Drama",8.9,5081


In [41]:
vedioGamePages.to_csv('videogame.csv')