<h3> Using Selenium to Scrape Steam Reviews Data </h3>

In [1]:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException 
from selenium import webdriver

import re
import time
from datetime import datetime
import pandas as pd

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

import plotly.express as px
import matplotlib.pyplot as plt
import os

In [2]:
# Steam application id of the game whose reviews are scraped.
game_id = 281990

# Community reviews page for an app: first page, most recent first, English
# only. The "{}" placeholder receives the app id.
url_template = (
    "https://steamcommunity.com/app/{}/reviews/"
    "?p=1&browsefilter=mostrecent&filterLanguage=english"
)
url = url_template.format(game_id)

# Show the final address so it can be checked by eye.
print(url)

https://steamcommunity.com/app/281990/reviews/?p=1&browsefilter=mostrecent&filterLanguage=english


In [3]:
# Start a Safari WebDriver session and load the reviews page at `url`.
# The session stays open until browser.quit() is called.
browser = webdriver.Safari()
browser.get(url)

In [4]:
def get_current_scroll_position(browser):
    """Return the page's current vertical scroll offset.

    Args:
        browser: Object exposing ``execute_script`` (a Selenium WebDriver in
            this notebook).

    Returns:
        Whatever the browser reports for ``window.pageYOffset``.
    """
    offset_script = "return window.pageYOffset;"
    current_offset = browser.execute_script(offset_script)
    return current_offset

def scroll_to_bottom(browser, pause=1):
    """Scroll the page to its bottom, then wait so new content can load.

    Args:
        browser: Object exposing ``execute_script`` (a Selenium WebDriver in
            this notebook).
        pause: Seconds to sleep after scrolling. Defaults to 1, the value
            that used to be hard-coded, so existing callers are unaffected.
    """
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # time.sleep rejects negative values, so non-positive pauses skip the wait.
    if pause > 0:
        time.sleep(pause)

def get_steam_id(card):
    """Return the reviewer's identifier taken from the card's profile link.

    Args:
        card: Element for a single review card; must support ``find_element``.

    Returns:
        The last non-empty path segment of the profile link's ``href``.
    """
    profile_url = card.find_element(By.XPATH,'.//div[@class="apphub_friend_block"]/a').get_attribute('href')
    # A URL ending in "/" would otherwise split into a trailing empty piece
    # and give every reviewer the same empty id.
    steam_id = profile_url.rstrip('/').split('/')[-1]
    return steam_id

def scrape_review_data(card):
    """Collect the text fields of one review card.

    Only the compensation-notice lookup is guarded; a failure of any other
    lookup is not caught here.

    Args:
        card: Element for a single review card; must support ``find_element``.

    Returns:
        Tuple ``(review_content, thumb_text, review_length, play_hours,
        date_posted)``. ``review_content`` is the content block's text with
        the date line, the compensation notice (if any), newlines and tabs
        removed; ``review_length`` counts its characters excluding spaces.
    """
    posted_on = card.find_element(
        By.XPATH,
        './/div[@class="apphub_CardTextContent"]/div[@class="date_posted"]',
    ).text.strip()

    # The compensation notice is optional: absence means empty text.
    try:
        compensation_note = card.find_element(By.CLASS_NAME, "received_compensation").text
    except NoSuchElementException:
        compensation_note = ""

    body_text = card.find_element(By.CLASS_NAME, "apphub_CardTextContent").text.strip()

    # The date line sits inside the content block, so cut it (and the notice,
    # when present) out of the review text before flattening whitespace.
    for boilerplate in (posted_on, compensation_note):
        body_text = body_text.replace(boilerplate, "")
    body_text = body_text.replace("\n", "").replace("\t", "")

    non_space_count = len(body_text.replace(" ", ""))

    recommendation = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[2]').text
    hours_text = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[3]').text

    return body_text, recommendation, non_space_count, hours_text, posted_on

In [None]:
reviews = []
steam_ids_set = set()
# Number of consecutive scrolls that may fail to move the page before the
# scrape concludes that no further reviews will load.
max_scroll_attempts = 10

try:
    last_position = get_current_scroll_position(browser)
    running = True
    while running:
        cards = browser.find_elements(By.CLASS_NAME, 'apphub_Card')

        # Only the newest 30 cards are inspected on each pass.
        for card in cards[-30:]:
            steam_id = get_steam_id(card)

            # The 30-card window overlaps between passes, so skip reviewers
            # that were already recorded. An empty id cannot identify anyone,
            # so such cards are kept instead of being collapsed into one.
            if steam_id and steam_id in steam_ids_set:
                continue

            reviews.append(scrape_review_data(card))
            if steam_id:
                steam_ids_set.add(steam_id)

        # Keep scrolling until the page actually moves. Each stalled scroll
        # is followed by a longer wait to give slow loads a chance.
        scroll_attempt = 0
        while scroll_attempt < max_scroll_attempts:
            scroll_to_bottom(browser)
            curr_position = get_current_scroll_position(browser)

            if curr_position != last_position:
                last_position = curr_position
                break

            scroll_attempt += 1
            time.sleep(3)
        else:
            # Every attempt stalled: treat this as the end of the page so the
            # outer loop terminates instead of spinning forever.
            running = False

except Exception as e:
    # Best effort: report the problem but keep whatever was collected so far.
    print(f"Scraping stopped early: {type(e).__name__}: {e}")

finally:
    browser.quit()

In [None]:
# Column order mirrors the tuple returned by scrape_review_data:
# (review_content, thumb_text, review_length, play_hours, date_posted).
df = pd.DataFrame(reviews, columns = ['ReviewText', 'Review','ReviewLength','PlayHours', 'DatePosted'])

In [None]:
# Preview the first rows of the scraped review table.
df.head()

In [None]:
# Summarise columns, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the review table.
df.shape

In [None]:
# Turn text such as "1,234.5 hrs on record" into a numeric hour count.
# Removing only the suffix would leave trailing whitespace, thousands
# separators and a string dtype behind. astype(str) keeps the cell safe to
# re-run once the column is already numeric; unparseable values become NaN.
df['PlayHours'] = pd.to_numeric(
    df['PlayHours']
    .astype(str)
    .str.replace("hrs on record", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip(),
    errors='coerce',
)


In [None]:
# Inspect PlayHours after the "hrs on record" suffix has been removed.
df['PlayHours']

In [None]:
# English month name -> zero-padded month number as a string
# ('January' -> '01', ..., 'December' -> '12').
month_mapping = {
    month_name: f"{month_number:02d}"
    for month_number, month_name in enumerate(
        [
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ],
        start=1,
    )
}

In [None]:
# Split the first "<digits> <word>" pair found in DatePosted into Day and Month.
# NOTE(review): assumes day-before-month text such as "5 March"; rows in any
# other shape get NaN in both columns — confirm against the scraped values.
df[['Day', 'Month']] = df['DatePosted'].str.extract(r'(\d+) (\w+)', expand=True)

In [None]:
# Check the newly added Day and Month columns.
df.head()

In [None]:
# Translate month names to two-digit strings via month_mapping; names that
# are not keys of the mapping become NaN.
df['Month']=df['Month'].map(month_mapping)

In [None]:
# Spot-check one row after the month names were mapped to numbers.
df.head(1)

In [None]:
# Use the four-digit year when the posted-date text contains one, instead of
# stamping every review with a fixed year.
# NOTE(review): the fallback assumes Steam omits the year only for reviews
# posted in the current year — confirm against the scraped values.
posted_year = (
    df['DatePosted']
    .str.extract(r'\b(\d{4})\b', expand=False)
    .fillna(str(datetime.now().year))
)

# Rebuild DatePosted as DD-MM-YYYY text, then drop the helper columns.
df['DatePosted'] = df['Day'] + '/' + df['Month'] + '/' + posted_year
df['DatePosted'] = pd.to_datetime(df['DatePosted'], format='%d/%m/%Y').dt.strftime('%d-%m-%Y')
df = df.drop(['Day', 'Month'], axis=1)

In [None]:
# Confirm DatePosted now holds DD-MM-YYYY text and the Day/Month helpers are gone.
df.head()

In [None]:
# Write the table to Stellaris.csv in the working directory: UTF-8,
# semicolon-separated, without the index column.
df.to_csv('Stellaris.csv', encoding='utf-8', sep=';', index=False)