### appendix 

> <a href=#imports>Import Libraries</a>        

> <a href=#webscrabing>Web Scraping</a>

> <a href=#merge>Merging Data</a>

> <a href=#EDA>Explratory Data Analysis (EDA)</a>

> <a href=#LR>Linear Regression</a>

> <a href=#LR>Conclusion</a>

# <a name=imports>Import Libraries</a>

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# <a name=webscrabing>Web Scraping</a>

In [2]:
url = "https://www.imdb.com/search/title/?title_type=video_game"

response = requests.get(url)
response.status_code
page = response.text
soup = BeautifulSoup(page, "lxml")

In [3]:
def clean_num(s):
    """cleans text scraped from website
    
    Parameters:
    s (String): raw text

    Returns:
    String: cleaned digit text

    """ 

    if s is None:
        return s
    return "".join(re.findall('\d*\.?\d+',s))

def get_data(block, tag, class_text):
    """fetch data from tag and its associated class
    
    Parameters:
    block (int): Book item 
    tag (String): tag for element we want to fetch
    class_text (String): class for element we want to fetch

    Returns:
    String: data text fetched

    """ 
    try:
        return block.find(tag, class_ = class_text).text.replace('(','').replace(')','')
    except AttributeError:
        return ""
    return

def get_data_a_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).a.text
    except AttributeError:
        return ""
    return
def get_data_strong_tag(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).strong.text
    except AttributeError:
        return ""
    return
def get_data_descr(i, tag, class_text):    
    try:
        return i.find_all(tag, class_ = class_text)[1].text.strip()
    except AttributeError:
        return ""
    return


def get_data_votes(i, tag, class_text):    
    try:
        return i.find(tag, class_ = class_text).text.split()[1].replace(',','')
    except AttributeError:
        return ""
    return

In [4]:
    
Vgame_list = []

for i in (soup.find_all('div','lister-item mode-advanced')):
    Name = get_data_a_tag(i, "h3","lister-item-header")
    year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
    Gtype = get_data(i, "p", "text-muted").strip()
    rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
    votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
    headers = ['Name', 'year', 'game_type', 'rate','votes']
    Vgame_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
    Vgame_list.append(Vgame_dict)    

In [5]:
vedioGame = pd.DataFrame(Vgame_list) #transpose
vedioGame

Unnamed: 0,Name,year,game_type,rate,votes
0,Marvel's Guardians of the Galaxy,2021,"Action, Adventure, Comedy",9.1,869.0
1,Grand Theft Auto: San Andreas,2004,"Action, Crime",9.4,37712.0
2,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28793.0
3,Call of Duty: Vanguard,2021,"Action, Adventure, History",6.4,472.0
4,Halo Infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
5,Grand Theft Auto V,2013,"Action, Crime, Drama",9.5,55593.0
6,Far Cry 6,2021,"Action, Adventure",8.1,783.0
7,Death Stranding,2019,"Action, Adventure, Drama",8.9,6966.0
8,The Last of Us: Part II,2020,"banned\n|\n\nAction, Adventure, Drama",8.3,26120.0
9,Ghostbusters,2009,"Action, Adventure, Comedy",8.5,2864.0


In [6]:
def get_movie_dict(link):

    #Create full url to scrape
    url =link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
  
    
    for i in (soup.find_all('div','lister-item mode-advanced')):        
        Name = get_data_a_tag(i, "h3","lister-item-header")
        year = get_data(i, "span", "lister-item-year text-muted unbold").split()[0]
        Gtype = get_data(i, "p", "text-muted").strip()
        rate = clean_num(get_data_strong_tag(i, "div","ratings-bar"))
        votes = clean_num(get_data_votes(i, "p","sort-num_votes-visible"))
    
        headers = ['Name', 'year', 'game_type', 'rate','votes']
        #Create game vedio dictionary and return
        movie_dict = dict(zip(headers, [ Name,
                                    year,
                                    Gtype,
                                    rate,
                                    votes]))
        Vgame_list.append(movie_dict) 

    return movie_dict

In [7]:
get_movie_dict("https://www.imdb.com/search/title/?title_type=video_game&start=51&ref_=adv_nxt")

{'Name': 'Teiruzu obu Araizu',
 'year': '2021',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '8.7',
 'votes': '141'}

In [8]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=101&ref_=adv_nxt')

{'Name': "Assassin's Creed II",
 'year': '2009',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '9.1',
 'votes': '21842'}

In [9]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=151&ref_=adv_nxt')

{'Name': 'Lost Judgment',
 'year': '2021',
 'game_type': 'Action, Adventure, Crime',
 'rate': '8.8',
 'votes': '90'}

In [10]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=201&ref_=adv_nxt')

{'Name': 'Devil May Cry 5',
 'year': '2019',
 'game_type': 'Action, Adventure, Fantasy',
 'rate': '8.6',
 'votes': '1723'}

In [11]:
get_movie_dict('https://www.imdb.com/search/title/?title_type=video_game&start=251&ref_=adv_nxt')

{'Name': 'Outlast 2',
 'year': '2017',
 'game_type': 'Horror, Mystery, Thriller',
 'rate': '7.6',
 'votes': '1935'}

In [12]:
vedioGamePages = pd.DataFrame(Vgame_list) #transpose
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,Marvel's Guardians of the Galaxy,2021,"Action, Adventure, Comedy",9.1,869
1,Grand Theft Auto: San Andreas,2004,"Action, Crime",9.4,37712
2,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,28793
3,Call of Duty: Vanguard,2021,"Action, Adventure, History",6.4,472
4,Halo Infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
...,...,...,...,...,...
295,The Wolf Among Us,2013,"Action, Crime, Drama",8.9,6572
296,Kingdom Come: Deliverance,2018,"Adventure, Drama, History",8.3,1003
297,Perusona 5: Za roiyaru,2019,"Action, Adventure, Fantasy",9.6,805
298,True Crime: New York City,2005,"Action, Adventure, Crime",7.0,897


In [13]:
vedioGamePages.to_csv('videogame.csv')

In [14]:
vedioGamePages.Name=vedioGamePages.Name.str.lower()
vedioGamePages.head()

Unnamed: 0,Name,year,game_type,rate,votes
0,marvel's guardians of the galaxy,2021,"Action, Adventure, Comedy",9.1,869.0
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712.0
2,red dead redemption ii,2018,"Action, Adventure, Crime",9.7,28793.0
3,call of duty: vanguard,2021,"Action, Adventure, History",6.4,472.0
4,halo infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,


In [15]:
vedioGamePages

Unnamed: 0,Name,year,game_type,rate,votes
0,marvel's guardians of the galaxy,2021,"Action, Adventure, Comedy",9.1,869
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712
2,red dead redemption ii,2018,"Action, Adventure, Crime",9.7,28793
3,call of duty: vanguard,2021,"Action, Adventure, History",6.4,472
4,halo infinite,2021,"Action, Adventure, Sci-Fi \n|\nComp...",,
...,...,...,...,...,...
295,the wolf among us,2013,"Action, Crime, Drama",8.9,6572
296,kingdom come: deliverance,2018,"Adventure, Drama, History",8.3,1003
297,perusona 5: za roiyaru,2019,"Action, Adventure, Fantasy",9.6,805
298,true crime: new york city,2005,"Action, Adventure, Crime",7.0,897


### 1.Load the dataframe2


In [16]:
df=pd.read_csv('vgsales.csv') #data from kaggle 

In [17]:
df.Name=df.Name.str.lower()
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,wii sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,super mario bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,mario kart wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,wii sports resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,pokemon red/pokemon blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# <a name=merge>Merging Data</a> 

In [18]:
mer_vidgame=vedioGamePages.merge(df, on='Name')

In [19]:
mer_vidgame.head()

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,18,PS2,2004.0,Action,Take-Two Interactive,9.43,0.4,0.41,10.57,20.81
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,875,XB,2005.0,Action,Take-Two Interactive,1.26,0.61,0.0,0.09,1.95
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,2122,PC,2005.0,Action,Take-Two Interactive,0.0,0.92,0.0,0.05,0.98
3,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,9829,X360,2008.0,Action,Take-Two Interactive,0.08,0.03,0.0,0.01,0.12
4,grand theft auto v,2013,"Action, Crime, Drama",9.5,55593,17,PS3,2013.0,Action,Take-Two Interactive,7.01,9.27,0.97,4.14,21.4


In [20]:
df1=pd.DataFrame(mer_vidgame)

# <a name=EDA>Explratory data analysis(EDA)</a>


In [21]:
 # drop column that we don't use
df1.drop([ 'Year', 'Genre','Publisher'], axis=1, inplace=True)

In [22]:
df1.shape

(406, 12)

In [23]:
df1['game_type'].tolist()

['Action, Crime',
 'Action, Crime',
 'Action, Crime',
 'Action, Crime',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Comedy',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Drama',
 'Action, Adventure, Drama',
 'Horror, Mystery, Thriller',
 'Action, Adventure, Drama',
 'banned\n|\n\nAction, Adventure, Fantasy',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Adventure, Fantasy',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Crime, Drama',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure, Crime',
 'Action, Adventure',
 'Action, Adventure',
 'Action, Adventure',
 'Action, A

In [24]:

df1['game_type']=df1['game_type'].str.replace('banned\n|\n\n','')
df1['game_type']=df1['game_type'].str.replace('\n|\nAnnounced','')
df1['game_type']=df1['game_type'].str.replace('|','')
df1['game_type']=df1['game_type'].str.replace('Announced','')


  df1['game_type']=df1['game_type'].str.replace('banned\n|\n\n','')
  df1['game_type']=df1['game_type'].str.replace('\n|\nAnnounced','')
  df1['game_type']=df1['game_type'].str.replace('|','')


In [26]:
df1

Unnamed: 0,Name,year,game_type,rate,votes,Rank,Platform,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,18,PS2,9.43,0.40,0.41,10.57,20.81
1,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,875,XB,1.26,0.61,0.00,0.09,1.95
2,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,2122,PC,0.00,0.92,0.00,0.05,0.98
3,grand theft auto: san andreas,2004,"Action, Crime",9.4,37712,9829,X360,0.08,0.03,0.00,0.01,0.12
4,grand theft auto v,2013,"Action, Crime, Drama",9.5,55593,17,PS3,7.01,9.27,0.97,4.14,21.40
...,...,...,...,...,...,...,...,...,...,...,...,...
401,the wolf among us,2013,"Action, Crime, Drama",8.9,6572,11939,PSV,0.05,0.00,0.00,0.02,0.07
402,the wolf among us,2013,"Action, Crime, Drama",8.9,6572,13683,X360,0.01,0.03,0.00,0.00,0.04
403,true crime: new york city,2005,"Action, Adventure, Crime",7.0,897,2723,PS2,0.57,0.02,0.07,0.09,0.76
404,true crime: new york city,2005,"Action, Adventure, Crime",7.0,897,6244,XB,0.21,0.06,0.00,0.01,0.28


In [27]:
df1['year'] = pd.to_numeric(df1['year'], errors='coerce')


In [28]:
df1['rate'] = pd.to_numeric(df1['rate'], errors='coerce')


In [29]:
df1['votes'] = pd.to_numeric(df1['votes'], errors='coerce')


In [30]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406 entries, 0 to 405
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          406 non-null    object 
 1   year          400 non-null    float64
 2   game_type     406 non-null    object 
 3   rate          400 non-null    float64
 4   votes         400 non-null    float64
 5   Rank          406 non-null    int64  
 6   Platform      406 non-null    object 
 7   NA_Sales      406 non-null    float64
 8   EU_Sales      406 non-null    float64
 9   JP_Sales      406 non-null    float64
 10  Other_Sales   406 non-null    float64
 11  Global_Sales  406 non-null    float64
dtypes: float64(8), int64(1), object(3)
memory usage: 41.2+ KB


In [31]:
df1.describe()

Unnamed: 0,year,rate,votes,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,400.0,400.0,400.0,406.0,406.0,406.0,406.0,406.0,406.0
mean,2010.69,8.3615,10302.575,3300.448276,1.188498,0.775369,0.067291,0.294064,2.325493
std,4.482648,0.743539,9602.851064,3820.563391,1.731449,1.083115,0.185191,0.663309,3.225921
min,1997.0,6.4,536.0,17.0,0.0,0.0,0.0,0.0,0.01
25%,2008.0,8.0,4106.0,455.0,0.17,0.13,0.0,0.04,0.43
50%,2012.0,8.4,6951.0,1891.5,0.555,0.4,0.0,0.11,1.085
75%,2014.0,8.925,14673.0,4508.5,1.4075,0.9775,0.06,0.31,3.07
max,2023.0,9.7,55780.0,16375.0,9.67,9.27,2.02,10.57,21.4


In [32]:
df1.to_csv('merge_video_game.csv')