# Scraping Steam reviews for God of War

https://steamcommunity.com/app/1593500/reviews/?browsefilter=toprated&snr=1_5_100010_&p=1

In [6]:

from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time


In [12]:
# Top-rated community reviews page for Steam app 1593500 (God of War).
url = (
    "https://steamcommunity.com/app/1593500/reviews/"
    "?browsefilter=toprated&snr=1_5_100010_&p=1"
)


In [15]:
# Configure the language preference (en-US) that the Selenium-driven Chrome
# browser starts with.
language_prefs = {"intl.accept_languages": "en-US"}
chrome_options = Options()
chrome_options.add_experimental_option("prefs", language_prefs)

In [18]:
# Scroll the page with Selenium before grabbing the HTML; without scrolling,
# the page only contains 10 reviews.
no_pages_down = 100  # number of PAGE_DOWN key presses to send
scroll_pause_seconds = 3  # time given to the browser to load the new reviews after each press

driver = webdriver.Chrome(options=chrome_options, service=Service(ChromeDriverManager().install()))
try:
    driver.get(url)
    driver.implicitly_wait(5)
    body = driver.find_element(By.CSS_SELECTOR, "body")

    # Perform a fixed number of page-down operations. Since there are almost
    # 100k reviews, we will not grab them all. To grab every review, the loop
    # would instead scroll until the page height stops changing and break there.
    for _ in range(no_pages_down):
        body.send_keys(Keys.PAGE_DOWN)  # instruct Selenium to press page down
        time.sleep(scroll_pause_seconds)

    html = driver.page_source  # full page HTML after scrolling, parsed in the next cell
finally:
    # Always shut the browser and the chromedriver process down, even when the
    # scrape fails part-way; otherwise every run leaks a Chrome instance.
    driver.quit()

In [19]:
# Parse the captured page HTML. The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed, and so
# Beautiful Soup does not warn about having to guess one.
soup = bs(html, "html.parser")
# Each review lives in a div with this class (find_all is the current name of
# the legacy findAll alias).
review_divs = soup.find_all("div", class_="apphub_UserReviewCardContent")

In [20]:
# Reduce every review card to clean text with exactly one "\n" between the
# non-empty lines, so the fields can later be split apart by line.
reviews = []
for card in review_divs:
    raw_text = card.get_text().strip()  # card text without leading/trailing whitespace
    flattened = raw_text.replace("\t", "").replace("\r", "")  # drop tabs and carriage returns inside the text
    kept_lines = [line for line in flattened.split("\n") if line]  # discard the empty lines
    reviews.append("\n".join(kept_lines))

In [21]:
# Inspect the first cleaned review, one list element per line.
first_review = reviews[0]
first_review.split("\n")

['1,839 people found this review helpful75 people found this review funny64',
 'Recommended',
 '46.3 hrs on record',
 'Posted: 1 November, 2022',
 "“Don't Be Sorry, Be Better.”"]

In [22]:
# Regex patterns used to parse the numeric fields out of each review's lines.

# Matches an integer with optional thousands separators (e.g. "1,839").
# re.findall on the first review line returns every number on that line; only
# the first one (the "found helpful" count) is wanted, so it is indexed with [0].
pattern_helpful = r"\d+(?:,\d{3})*"
# Matches a run of digits and dots (e.g. "46.3" in "46.3 hrs on record"); the
# first match is the hours value, so it is indexed with [0] as well.
# NOTE(review): a comma is not part of this pattern, so a value written with a
# thousands separator ("1,234.5") would give "1" as its first match.
pattern_hours = r"([\d.]+)"

# No regex is needed for the posting date: that line starts with "Posted: ",
# so string.split("Posted: ")[1] is enough to extract it.


In [23]:
# Try both patterns on the first review before parsing the whole list.
first_review_lines = reviews[0].split("\n")
test = first_review_lines[0]  # line holding the "found helpful" count
test_hours = first_review_lines[2]  # line holding the hours on record

# Take the first number on the line and remove its thousands separator (a comma).
result_test = re.findall(pattern_helpful, test)[0].replace(",", "")
result_test_h = re.findall(pattern_hours, test_hours)[0]

# Show each input line followed by the value extracted from it.
for shown in (test, result_test, test_hours, result_test_h):
    print(shown)

1,839 people found this review helpful75 people found this review funny64
1839
46.3 hrs on record
46.3


In [24]:
# Build a pandas DataFrame with one row per scraped review.

# One dict per review is collected here; the DataFrame is created once from
# the finished list instead of being grown row by row.
data_list = []

for review in reviews:
    # Line layout of a cleaned review:
    #   [0] helpful/funny counts, [1] recommendation, [2] hours on record,
    #   [3] posting date, [4:] the review text itself.
    review_s = review.split("\n")

    # The first number on the first line is the "found helpful" count; its
    # thousands separators are removed so int() accepts it.
    helpful = re.findall(pattern_helpful, review_s[0])[0].replace(",", "")
    # Stored as True/False: True only when the line reads exactly "Recommended".
    recommend = review_s[1] == "Recommended"
    # Thousands separators are removed before matching; otherwise a value such
    # as "1,234.5 hrs on record" would be cut off at the comma and read as 1.
    hours = re.search(pattern_hours, review_s[2].replace(",", ""))[0]
    # Everything after the "Posted: " prefix is the date string.
    post_date = review_s[3].split("Posted: ")[1]
    # The text may span several lines, so every remaining line is kept rather
    # than only the first (a review without any text yields an empty string).
    review_text = "\n".join(review_s[4:])

    data_list.append(
        {
            "Found helpful": int(helpful),
            "Recommend": recommend,
            "Hours on record": float(hours),
            "Date posted": post_date,
            "Review text": review_text,
        }
    )

df = pd.DataFrame(data_list)  # one row per review, one column per dictionary key

In [25]:
# Display the resulting DataFrame; as the cell's last expression it is rendered
# as the cell output.
df

Unnamed: 0,Found helpful,Recommend,Hours on record,Date posted,Review text
0,1839,True,46.3,"1 November, 2022","“Don't Be Sorry, Be Better.”"
1,4301,True,42.9,"8 November, 2022",Bring God Of War Ragnarok on PC
2,1860,True,18.2,"14 January, 2022",It's rare to see this much polish (on a consol...
3,7225,True,36.4,"17 January, 2022","No additional account, no unnecessary launcher..."
4,2340,True,11.3,"22 December, 2022",hi
...,...,...,...,...,...
325,99,False,47.9,"June 17, 2022",A major downgrade vs the old series.It's more ...
326,9,True,5.0,"January 14, 2022","Probably the best game I have ever bought, if ..."
327,17,True,43.5,"April 8, 2022","Once you start playing this game, then you wil..."
328,11,True,5.1,"January 14, 2022",The best PS4 game is now on PC and its the bes...
