In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv
import re

In [7]:
# Chrome launch flags; uncomment the headless line to run without a visible window.
options = Options()
# options.add_argument("--headless")
options.add_argument("--no-sandbox")

# webdriver_manager supplies the path of the chromedriver binary used by the Service.
service = Service(ChromeDriverManager().install())

driver = webdriver.Chrome(options=options, service=service)

Para obter a URL dos reviews de um jogo, basta pesquisar o jogo na Steam, abrir a página da loja e copiar o ID que aparece logo após `/app/`.

Ex:
URL da Steam para o FIFA 22: `https://store.steampowered.com/app/1506830/FIFA_22/`

O ID é `1506830`, e o link para os reviews é https://steamcommunity.com/app/1506830/reviews/?browsefilter=toprated

In [None]:
# Review-listing URL of the game to scrape; replace the app ID with your game's.
URL = "https://steamcommunity.com/app/2669320/reviews/?browsefilter=toprated"

# Uncomment to restrict the listing to Brazilian Portuguese reviews:
# URL += "&filterLanguage=brazilian" # To use pt-br filter

# Collected reviews: one dict per review with "text", "recommendation" and "helpful" keys.
reviews = []
# Raw review texts already stored in `reviews`; used to skip duplicate cards.
seen_reviews = set()
# NOTE: the index of the last processed card (`last_seen_card`) is tracked by the
# scrolling loop further down in this cell, not here.

def get_review_data(scroll_count, last_seen_card):
    """Collect the review cards that appeared since the previous scroll.

    Reads every ``apphub_Card`` element currently on the page through the
    module-level ``driver``, skips the first ``last_seen_card`` of them, and
    appends each new, non-empty, not-yet-seen review to the module-level
    ``reviews`` list (recording its text in ``seen_reviews``).

    Args:
        scroll_count: Zero-based index of the current scroll pass; only used
            in the progress message.
        last_seen_card: Number of cards already processed by earlier calls;
            cards before this index are skipped.

    Returns:
        The total number of cards currently on the page, meant to be passed
        back as ``last_seen_card`` on the next call.
    """
    review_cards = driver.find_elements(By.CLASS_NAME, "apphub_Card")
    print(f"Found {len(review_cards)} review cards on scroll {scroll_count + 1}")

    for card in review_cards[last_seen_card:]:
        try:
            # Get recommendation (Recommended or Not Recommended)
            recommendation = card.find_element(By.CLASS_NAME, "title").text.strip()

            review_text = card.find_element(By.CLASS_NAME, "apphub_CardTextContent").text.strip()

            # Relevance: number of users that marked the review as helpful.
            relevance = card.find_element(By.CLASS_NAME, "found_helpful").text.strip()
            # Accept the singular form ("1 person found this review helpful") as
            # well; a plural-only pattern silently recorded such reviews as 0.
            number_of_helpful = re.search(
                r"(\d[\d,]*) (?:person|people) found this review helpful", relevance
            )
            extracted_number = 0
            if number_of_helpful:
                # Drop thousands separators before converting ("1,234" -> 1234).
                extracted_number = int(number_of_helpful.group(1).replace(",", ""))

            if review_text and review_text not in seen_reviews:
                seen_reviews.add(review_text)
                reviews.append({"text": review_text, "recommendation": recommendation, "helpful": extracted_number})
        except Exception as e:
            # Best effort: a malformed card is reported and skipped so that one
            # bad card does not abort the whole scrape.
            print("Error processing a review card:", e)

    return len(review_cards)


# Seconds to wait after each scroll so that newly requested cards can load.
SCROLL_PAUSE = 1
# Maximum number of scroll passes. Use float("inf") to keep scrolling until the
# page stops growing.
# N_SCROLLS = float("inf")
N_SCROLLS = 40

try:
    driver.get(URL)
    time.sleep(3)  # give the first batch of cards time to render

    last_height = driver.execute_script("return document.body.scrollHeight")
    # Number of cards already processed, so each pass only reads the new ones.
    last_seen_card = 0

    print("Scrolling and collecting reviews...")
    i = 0
    # A while loop instead of range(): range() rejects float("inf"), which the
    # comment above advertises as the "scroll until the end" setting.
    while i < N_SCROLLS:
        # Scroll to the bottom of the page, only if not the first scroll
        if i > 0:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # The page did not grow after scrolling: nothing more was loaded.
                print("No more new content loaded. Stopping scroll.")
                break
            last_height = new_height

        last_seen_card = get_review_data(i, last_seen_card)
        i += 1

    print(f"Collected {len(reviews)} reviews.")
finally:
    # Always close the browser, even if scraping fails or is interrupted,
    # so no Chrome/chromedriver process is left running.
    driver.quit()

reviews[0:10]

Scrolling and collecting reviews...
Found 10 review cards on scroll 1
Found 20 review cards on scroll 2
Found 30 review cards on scroll 3
Found 40 review cards on scroll 4
Found 50 review cards on scroll 5
Found 60 review cards on scroll 6
Found 70 review cards on scroll 7
Found 80 review cards on scroll 8
Found 90 review cards on scroll 9
Found 100 review cards on scroll 10
Found 110 review cards on scroll 11
Found 120 review cards on scroll 12
Found 130 review cards on scroll 13
Found 140 review cards on scroll 14
Found 150 review cards on scroll 15
Found 160 review cards on scroll 16
Found 170 review cards on scroll 17
Found 180 review cards on scroll 18
Found 190 review cards on scroll 19
Found 200 review cards on scroll 20
Found 210 review cards on scroll 21
Found 220 review cards on scroll 22
Found 230 review cards on scroll 23
Found 240 review cards on scroll 24
Found 250 review cards on scroll 25
Found 260 review cards on scroll 26
Found 270 review cards on scroll 27
Found 280 

[{'text': "Posted: 19 November, 2024\nTL:DR - This game is a psychologically manipulative Skinner box. It is designed to be addictive, and to get as much money out of you as possible. Scripting, implemented to encourage FUT players to spend more money, is in every mode and makes every game an unsatisfying and empty experience. Considering a core demographic for this game is children, the income-focused manipulation displayed here is repugnant.\n\nDDA (dynamic difficulty adjustment) or scipting is just ridiculous now. In games against EA's garbage AI, if the game wants your opponent to score, they'll respond to your controller inputs instantly, not the actions of the player you're controlling. Easy tackles with world class defenders will either miss, bounce the ball to another opponent, just outright phase through the ball or even the damn player! Your keeper will do a Matrix-esque dodge of a shot coming directly at them or will spill an easy catch directly into their striker. The most 

## Cleaning

### Removing the date from the review

Pattern `"Posted: <day> <month>[, <year>]\n"`

### Removing additional spaces

Collapse runs of whitespace into a single space. This makes the resulting CSV easier to read, drops characters that add nothing useful to the output, and has to be done anyway before the data analysis.

In [11]:
def clean_text(text):
    """Return ``text`` without censor hearts and with whitespace normalised.

    Each run of ``♥`` characters (together with the whitespace around it) is
    replaced by a single space, every remaining whitespace run is collapsed to
    one space, and the result is stripped at both ends.
    """
    # Hearts are what Steam shows in place of censored words.
    without_hearts = re.sub(r"\s*♥+\s*", " ", text)
    single_spaced = re.sub(r'\s+', ' ', without_hearts)
    return single_spaced.strip()

def remove_posted_date(text):
    """Drop a leading ``Posted: ...`` line from ``text`` if there is one.

    When the first line starts with ``Posted:``, the remaining lines are
    re-joined with ``\\n`` and stripped; otherwise the stripped input is
    returned unchanged.
    """
    rows = text.splitlines()
    starts_with_date = bool(rows) and rows[0].startswith("Posted:")
    if not starts_with_date:
        return text.strip()

    _, *body = rows
    return "\n".join(body).strip()

# Strip the "Posted: ..." header first (it relies on line breaks), then
# normalise hearts and whitespace.
for review in reviews:
    review["text"] = clean_text(remove_posted_date(review["text"]))

reviews[:10]


[{'text': "TL:DR - This game is a psychologically manipulative Skinner box. It is designed to be addictive, and to get as much money out of you as possible. Scripting, implemented to encourage FUT players to spend more money, is in every mode and makes every game an unsatisfying and empty experience. Considering a core demographic for this game is children, the income-focused manipulation displayed here is repugnant. DDA (dynamic difficulty adjustment) or scipting is just ridiculous now. In games against EA's garbage AI, if the game wants your opponent to score, they'll respond to your controller inputs instantly, not the actions of the player you're controlling. Easy tackles with world class defenders will either miss, bounce the ball to another opponent, just outright phase through the ball or even the damn player! Your keeper will do a Matrix-esque dodge of a shot coming directly at them or will spill an easy catch directly into their striker. The most outrageous though, is player s

In [12]:

# Export the cleaned reviews as CSV, one row per review under a header row.
with open("steam_reviews.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["Recommendation", "Review", "Helpful"])
    writer.writeheader()
    # Map the internal dict keys onto the CSV column names.
    writer.writerows(
        {
            "Recommendation": review["recommendation"],
            "Review": review["text"],
            "Helpful": review["helpful"],
        }
        for review in reviews
    )


print(f"Scraped {len(reviews)} reviews.")

Scraped 400 reviews.
