<a href="https://colab.research.google.com/github/TasneemAhmed/Steam-Web-Scraping/blob/web-scraping/Steam_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this project, you'll get a chance to explore
**scraping data from Steam**
using **a sample webpage on the site (https://store.steampowered.com/tags/en/Action/).**

1. Try extracting the names of the top games from this page.
2. What tags contain the prices? Can you extract the price information?
3. Get all of the header tags on the page.
4. Can you get the text from each span tag with class equal to "top_tag"?
5. Under the "Narrow by Tag" section, there is a collection of tags (e.g. "Indie", "Adventure", etc.). Write code to return these tags.


In [None]:
#importing packages
import requests
from bs4 import BeautifulSoup
import re

In [None]:
#send a GET request for the Action tag page and keep the HTML response
#NOTE: everything after '#' in the URL is a fragment; fragments are resolved by the browser and are
#not sent to the server, so the tab selection in it presumably has no effect on the returned HTML.
base_url = 'https://store.steampowered.com/tags/en/Action/#p=0&tab=NewReleases'
#requests applies no timeout by default, so set one to keep this cell from hanging forever
response = requests.get(base_url, params = {'page':1}, timeout=30)
#fail loudly on a 4xx/5xx answer instead of parsing an error page in the cells below
response.raise_for_status()
response.status_code

200

In [None]:
#raw bytes of the response body
html = response.content
#preview only the first 500 bytes: the whole document is far too large to be useful as cell output
html[:500]

b'\r\n<!DOCTYPE html>\r\n<html class=" responsive" lang="en">\r\n<head>\r\n\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\r\n\t\t\t<meta name="viewport" content="width=device-width,initial-scale=1">\r\n\t\t<meta name="theme-color" content="#171a21">\r\n\t\t<title>Action</title>\r\n\t<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">\r\n\r\n\t\r\n\t\r\n\t<link href="https://store.cloudflare.steamstatic.com/public/shared/css/motiva_sans.css?v=2C1Oh9QFVTyK&amp;l=english&amp;_cdn=cloudflare" rel="stylesheet" type="text/css" >\n<link href="https://store.cloudflare.steamstatic.com/public/shared/css/shared_global.css?v=Xn8C75dyhnl5&amp;l=english&amp;_cdn=cloudflare" rel="stylesheet" type="text/css" >\n<link href="https://store.cloudflare.steamstatic.com/public/shared/css/buttons.css?v=hFJKQ6HV7IKT&amp;l=english&amp;_cdn=cloudflare" rel="stylesheet" type="text/css" >\n<link href="https://store.cloudflare.steamstatic.com/public/css/v6/store.css?v=R_tx4e7-B

In [None]:
#parse the downloaded page with the lxml parser so it can be searched by tag and attribute
soup = BeautifulSoup(markup=html, features='lxml')

In [None]:
#locate the <div> whose id is 'NewReleasesRows'; the game rows are looked up inside it
games_div = soup.find('div', id='NewReleasesRows')
games_div

#every game row is an <a> tag whose class attribute is "tab_item " (note the trailing space)
games_tabs = games_div.find_all("a", class_="tab_item ")
#show the first row to inspect its structure
games_tabs[0]

<a class="tab_item " data-ds-appid="1178830" data-ds-crtrids="[8438467]" data-ds-descids="[2,5]" data-ds-itemkey="App_1178830" data-ds-tagids="[19,492,1663,7208,3942,4345,4667]" href="https://store.steampowered.com/app/1178830/Bright_Memory_Infinite/?snr=1_241_4_action_103" onmouseout="HideGameHover( this, event, 'global_hover' )" onmouseover="GameHover( this, event, 'global_hover', {&quot;type&quot;:&quot;app&quot;,&quot;id&quot;:1178830,&quot;params&quot;:{&quot;bDisableHover&quot;:false},&quot;public&quot;:1,&quot;v6&quot;:1} );">
<div class="tab_item_cap">
<img class="tab_item_cap_img" src="https://cdn.cloudflare.steamstatic.com/steam/apps/1178830/capsule_184x69.jpg?t=1636678521"/>
</div>
<div class="discount_block tab_item_discount no_discount" data-price-final="1999"><div class="discount_prices"><div class="discount_final_price">$19.99</div></div></div> <div class="tab_item_content">
<div class="tab_item_name">Bright Memory: Infinite</div>
<div class="tab_item_details">
<span cla

In [None]:
#Try extracting the names of the top games from this page.

#each row keeps its title in <div class="tab_item_name">; read that div's text for every row
games_names = [
    row.find('div', class_='tab_item_name').string
    for row in games_tabs
]
games_names

['Bright Memory: Infinite',
 'Jurassic World Evolution 2',
 'SYNTHETIK 2',
 "Marvel's Guardians of the Galaxy",
 'Back 4 Blood',
 'ALTF4',
 'New World',
 'ELYON',
 'Demon Slayer -Kimetsu no Yaiba- The Hinokami Chronicles',
 'Prison Simulator',
 'Honkai Impact 3rd',
 'Gigapocalypse',
 'Crisis VRigade 2',
 'Killsquad',
 'Ancient Dungeon']

In [None]:
#What tags contain the prices? Can you extract the price information?
#Each game row shows its current price in <div class="discount_final_price">.

def _price_to_float(price_text):
    """Convert a price label scraped from a game row into a float.

    Parameters:
        price_text: text of the price div, or None when the row has no price div.

    Returns:
        0.0 when the label contains "Free" (e.g. "Free to Play"),
        NaN when the label is missing or empty,
        otherwise the number left after removing the "$" sign and "," thousands separators.

    Raises:
        ValueError: if the remaining text is not a number, so unexpected formats are noticed.
    """
    if not price_text:
        return float('nan')
    #re.search looks for the word "Free" anywhere in the label
    if re.search("Free", price_text):
        return 0.0
    return float(price_text.strip().strip('$').replace(',', ''))

#get_text() is used instead of .string because .string is None as soon as the div holds more than one child
game_final_price = [
    _price_to_float(price_div.get_text(strip=True) if price_div is not None else None)
    for price_div in (row.find('div', {'class':'discount_final_price'}) for row in games_tabs)
]
game_final_price

[19.99,
 59.99,
 17.99,
 59.99,
 59.99,
 2.99,
 39.99,
 0.0,
 59.99,
 19.99,
 0.0,
 6.69,
 15.99,
 19.99,
 19.99]

In [None]:
#top tags of each game: the text of <div class="tab_item_top_tags"> inside every game row
top_tags = [
    row.find('div', class_='tab_item_top_tags').get_text()
    for row in games_tabs
]
#show the tags of the first game as a sample
top_tags[0]

15

In [None]:
#"Narrow by Tag" section: collect its <a> buttons, one per tag (e.g. "Indie", "Adventure")
Narrow_by_Tag_sec = soup.find_all('a', class_='btnv6_blue_hoverfade btn_small_tall ')
#show the first button to inspect its structure
Narrow_by_Tag_sec[0]

<a class="btnv6_blue_hoverfade btn_small_tall " href="https://store.steampowered.com/tags/en/Action/492/?snr=1_241_4_action">
<span>
<div class="tag_count_button">
<span class="tag_name">Indie</span>
<span class="tag_count tab_filter_control_count">18,227</span>
</div>
</span>
</a>

In [None]:
#name of each "Narrow by Tag" entry, read from its <span class="tag_name">
narrow_tag_name = [
    link.find('span', class_='tag_name').string
    for link in Narrow_by_Tag_sec
]
narrow_tag_name

12

In [None]:
#count shown next to each tag, read from <span class="tag_count tab_filter_control_count">
#the text contains separators (e.g. "18,227"), so every character except the digits 0-9 is removed before int()
narrow_tag_count = [
    int(re.sub(r"[^0-9]", "", link.find('span', class_='tag_count tab_filter_control_count').string))
    for link in Narrow_by_Tag_sec
]
narrow_tag_count

[18227, 12137, 11994, 8437, 6127, 5263, 4769, 4592, 4521, 4243, 3981, 3959]

# **Representing data in DataFrame**

In [None]:
import pandas as pd

In [None]:
#one row per game: name, final price and top tags side by side
games_info = pd.DataFrame({
    'Name': games_names,
    'Price': game_final_price,
    'Top Tags': top_tags,
})

games_info

Unnamed: 0,Name,Price,Top Tags
0,Bright Memory: Infinite,19.99,"Action, Indie, FPS, Female Protagonist"
1,Jurassic World Evolution 2,59.99,"Dinosaurs, Simulation, Building, Strategy"
2,SYNTHETIK 2,17.99,"Early Access, Action Roguelike, Top-Down Shoot..."
3,Marvel's Guardians of the Galaxy,59.99,"Action, Exploration, Action-Adventure, Third-P..."
4,Back 4 Blood,59.99,"Zombies, Action, Online Co-Op, Multiplayer"
5,ALTF4,2.99,"Singleplayer, 3D, Runner, Physics"
6,New World,39.99,"Massively Multiplayer, Open World, MMORPG, Adv..."
7,ELYON,0.0,"Action, MMORPG, Combat, Open World"
8,Demon Slayer -Kimetsu no Yaiba- The Hinokami C...,59.99,"Action, Anime, Adventure, 3D Fighter"
9,Prison Simulator,19.99,"RPG, Casual, Simulation, Action"


In [None]:
#one row per "Narrow by Tag" entry: tag name next to its count
narrow_tags = pd.DataFrame({
    'Tag Name': narrow_tag_name,
    'Tag Count': narrow_tag_count,
})

narrow_tags

Unnamed: 0,Tag Name,Tag Count
0,Indie,18227
1,Singleplayer,12137
2,Adventure,11994
3,Casual,8437
4,2D,6127
5,Shooter,5263
6,RPG,4769
7,Multiplayer,4592
8,Arcade,4521
9,Atmospheric,4243


# **Exporting data into an Excel file**

In [None]:
#write both tables into a single Excel workbook, one sheet per DataFrame
#pd.ExcelWriter is the pandas class for writing DataFrame objects into Excel sheets
with pd.ExcelWriter('Trending Games Information.xlsx') as writer:
    #sheet with the per-game table
    games_info.to_excel(excel_writer=writer, sheet_name='games_info')
    #sheet with the "Narrow by Tag" table
    narrow_tags.to_excel(excel_writer=writer, sheet_name='narrow_tags_info')