# Scraping Steam reviews for God Of War

https://steamcommunity.com/app/1593500/reviews/?browsefilter=toprated&snr=1_5_100010_&p=1

In [59]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np

In [60]:
# Steam community reviews page for app 1593500, requested with the "toprated" browse filter.
url = r"https://steamcommunity.com/app/1593500/reviews/?browsefilter=toprated&snr=1_5_100010_&p=1"

# requests applies no timeout by default, so without one a stalled connection
# would block this cell indefinitely; 30 seconds is generous for a single page.
r = requests.get(url, timeout=30)
r.status_code

200

In [61]:
# TODO: the page scrolls infinitely and the initial response contains only 10 reviews; fetching more requires handling the scrolling, e.g. with Selenium.

In [62]:
# Parse the raw response bytes with Python's built-in HTML parser backend.
soup = bs(r.content, "html.parser")
# Every review card lives in a div with this class.
# find_all is the current method name; findAll is a deprecated alias of it.
review_divs = soup.find_all("div", class_="apphub_UserReviewCardContent")

In [63]:
reviews = []
for card in review_divs:
    # Flatten the card to plain text, trim the outer whitespace, then remove
    # every tab and carriage return left inside the text.
    flattened = card.get_text().strip().replace("\t", "").replace("\r", "")
    # Drop empty lines so exactly one newline separates the remaining fields;
    # later cells split on it to address the fields by position.
    kept_lines = [line for line in flattened.split("\n") if line]
    reviews.append("\n".join(kept_lines))

In [64]:
# Inspect the first cleaned review line by line to see which field sits at which position.
reviews[0].split("\n") # reviewing the output

['1,839 people found this review helpful75 people found this review funny64',
 'Recommended',
 '46.3 hrs on record',
 'Posted: 1 November, 2022',
 "“Don't Be Sorry, Be Better.”"]

In [65]:
# Regex patterns used to parse the numeric fields out of a cleaned review.

# Matches an integer with optional comma thousands separators (e.g. "1,839").
# re.findall on the first line of a review returns every number on that line
# (helpful count, funny count and a trailing number); the "found helpful"
# count comes first, so take element [0].
pattern_helpful = r"\d+(?:,\d{3})*"
# Captures a run of digits and dots (e.g. "46.3" from "46.3 hrs on record");
# the line holds a single value, so element [0] is the one to use.
# NOTE(review): commas are not matched, so a value written as "1,234.5" would
# stop at "1" unless thousands separators are removed from the text first.
pattern_hours = r"([\d.]+)"

# No regex is needed for the posting date: that line starts with "Posted: ",
# so string.split("Posted: ")[1] yields the date.


In [66]:
# Sanity-check both patterns against the first scraped review
first_review_lines = reviews[0].split("\n")
helpful_line = first_review_lines[0]  # helpful / funny counts
hours_line = first_review_lines[2]    # "... hrs on record"

# first number on the line is the helpful count; drop its thousands-separator comma
helpful_count = re.findall(pattern_helpful, helpful_line)[0].replace(",", "")
hours_value = re.findall(pattern_hours, hours_line)[0]

# show each raw line followed by the value extracted from it
print(helpful_line, helpful_count, hours_line, hours_value, sep="\n")

1,839 people found this review helpful75 people found this review funny64
1839
46.3 hrs on record
46.3


In [67]:
# Assigning the data to a Pandas dataframe

# One dictionary per review; the DataFrame is built from the whole list in a
# single call instead of being grown row by row.
data_list = []

for review in reviews:

    # Line layout of a cleaned review:
    #   [0] helpful/funny counts, [1] recommendation, [2] hours on record,
    #   [3] "Posted: <date>", [4:] the review body
    review_s = review.split("\n")

    # The first number on the first line is the "found helpful" count; remove its thousands separator.
    helpful = re.findall(pattern_helpful, review_s[0])[0].replace(",", "")
    # Stored as a boolean: True only when the line reads exactly "Recommended".
    recommend = review_s[1] == "Recommended"
    # Strip thousands separators before matching: pattern_hours does not match
    # commas, so "1,234.5 hrs on record" would otherwise be read as 1.
    hours = re.search(pattern_hours, review_s[2].replace(",", ""))[0]
    # The date is whatever follows the fixed "Posted: " prefix, so no regex is needed.
    post_date = review_s[3].split("Posted: ")[1]
    # Everything after the date line is review body. Joining all remaining
    # lines keeps bodies that span several lines intact and yields an empty
    # string (instead of an IndexError) when a review has no body.
    review_text = "\n".join(review_s[4:])

    # creating a temporary dictionary for the values
    temp = {"Found helpful": int(helpful),
            "Recommend": recommend,
            "Hours on record": float(hours),
            "Date posted": post_date,
            "Review text": review_text}

    data_list.append(temp) # appending the temporary dictionary to a list of data

df = pd.DataFrame(data_list) # creating the pandas dataframe from the list of dictionaries.

In [68]:
# Display the assembled DataFrame (one row per scraped review).
df

Unnamed: 0,Found helpful,Recommend,Hours on record,Date posted,Review text
0,1839,True,46.3,"1 November, 2022","“Don't Be Sorry, Be Better.”"
1,4301,True,42.9,"8 November, 2022",Bring God Of War Ragnarok on PC
2,1860,True,18.2,"14 January, 2022",It's rare to see this much polish (on a consol...
3,7225,True,36.4,"17 January, 2022","No additional account, no unnecessary launcher..."
4,2340,True,11.3,"22 December, 2022",hi
5,2053,True,56.7,"24 January, 2022","SONY, DO YOU LIKE MONEY? GoW has been in the t..."
6,1363,True,79.6,"16 January, 2022",god of war is one of those games i wish i coul...
7,3332,True,43.9,"25 October, 2022",Please bring Ragnarok to PC! With how well its...
8,1889,True,20.8,"23 November, 2022",Please for the love of God port more games ove...
9,2842,True,59.1,"16 January, 2022",No extra launcher? No signing into a Bethesda ...
