In [9]:
#Selenium 4.9.0
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

import re
import time
from datetime import datetime
import pandas as pd

import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

import plotly.express as px
import matplotlib.pyplot as plt

# Steam app id of the game whose reviews are scraped; it is substituted into
# the `/app/{}/reviews/` path of the URL template below.
game_id = 291550
# The query string asks for page 1, sorted by most recent, English reviews only.
url_template = "https://steamcommunity.com/app/{}/reviews/?p=1&browsefilter=mostrecent&filterLanguage=english"
url = url_template.format(game_id)

# Echo the final URL so the scraped page is visible in the cell output.
print(url)

options = EdgeOptions()

# Pass the UI language to the browser as a `--lang=` command-line argument.
language = "en-US"
options.add_argument(f"--lang={language}")

# The driver executable is looked up relative to the current working directory.
# NOTE(review): the output recorded with this cell shows session creation
# failing with "cannot find msedge binary" — confirm Edge itself is installed
# on the machine running the notebook.
service = Service("./msedgedriver.exe")
driver = webdriver.Edge(service=service, options=options)
driver.maximize_window()
# Load the reviews page; the scraping code below reads from this `driver`.
driver.get(url)

def get_current_scroll_position(driver):
    """Return the page's current vertical scroll offset.

    Runs the JavaScript snippet ``return window.pageYOffset;`` through
    ``driver.execute_script`` and hands back whatever that call returns.
    """
    offset_script = "return window.pageYOffset;"
    return driver.execute_script(offset_script)

def scroll_to_bottom(driver, pause=1):
    """Scroll the page to the bottom, then wait briefly.

    Args:
        driver: Object exposing ``execute_script`` (the Selenium WebDriver).
        pause: Seconds to sleep after scrolling. Defaults to 1, which is the
            delay this helper always used; pass 0 to skip the wait.

    Returns:
        None.
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # The wait comes after the scroll, before control returns to the caller.
    if pause > 0:
        time.sleep(pause)

def get_steam_id(card):
    """Return the last path segment of the review author's profile URL.

    The profile link is the second ``<a>`` inside the card's
    ``apphub_friend_block`` div. The identifier is taken as the final path
    segment of its ``href``, whether or not the URL ends with a slash.
    Previously a URL without a trailing slash yielded the segment before the
    identifier, so unrelated authors could share one id.

    Args:
        card: Element exposing ``find_element`` (a review card).

    Returns:
        str: The final path segment of the profile URL.
    """
    profile_link = card.find_element(By.XPATH, './/div[@class="apphub_friend_block"]/div/a[2]')
    profile_url = profile_link.get_attribute('href')
    # Strip any trailing slash first so the last segment is always the id.
    return profile_url.rstrip('/').rsplit('/', 1)[-1]

def scrape_review_data(card):
    """Extract the fields of one review card.

    The review body is the text of the ``apphub_CardTextContent`` container
    with the banner strings (posted date, early-access notice, compensation
    notice) removed. Line breaks and runs of whitespace are collapsed to
    single spaces; previously newlines were deleted outright, which glued the
    last word of one line to the first word of the next.

    Args:
        card: Element exposing ``find_element`` (a review card).

    Returns:
        tuple: ``(early_access_review, review_content, thumb_text,
        review_length, play_hours, date_posted)`` where ``review_length`` is
        the number of non-whitespace characters in ``review_content`` and the
        other items are the raw text of the corresponding page elements.

    Raises:
        NoSuchElementException: If a mandatory element (date, text container,
            or either ``reviewInfo`` child) is missing. The early-access and
            compensation banners are optional and default to ``""``.
    """
    date_posted = card.find_element(
        By.XPATH, './/div[@class="apphub_CardTextContent"]/div[@class="date_posted"]'
    ).text.strip()

    try:
        early_access_review = card.find_element(
            By.XPATH, './/div[@class="apphub_CardTextContent"]/div[@class="early_access_review"]'
        ).text.strip()
    except NoSuchElementException:
        early_access_review = ""

    try:
        received_compensation = card.find_element(By.CLASS_NAME, "received_compensation").text
    except NoSuchElementException:
        received_compensation = ""

    review_content = card.find_element(By.CLASS_NAME, "apphub_CardTextContent").text.strip()

    # The container's text includes the banners; remove each one that is present.
    for banner in (date_posted, early_access_review, received_compensation):
        if banner:
            review_content = review_content.replace(banner, "")

    # Turn line breaks into word boundaries instead of deleting them.
    review_content = " ".join(review_content.split())

    # After normalisation the only whitespace left is single spaces.
    review_length = len(review_content.replace(" ", ""))

    thumb_text = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[2]').text
    play_hours = card.find_element(By.XPATH, './/div[@class="reviewInfo"]/div[3]').text

    return early_access_review, review_content, thumb_text, review_length, play_hours, date_posted

# Accumulators: one tuple per scraped review, plus the author ids already seen.
reviews = []
steam_ids_set = set()

try:
    cards = driver.find_elements(By.CLASS_NAME, 'apphub_Card')

    # Only the first five cards currently on the page are considered.
    for card in cards[:5]:
        steam_id = get_steam_id(card)
        if steam_id in steam_ids_set:
            # This author was already captured; move on to the next card.
            continue

        review = scrape_review_data(card)
        reviews.append(review)
        steam_ids_set.add(steam_id)

        # Five reviews collected: nothing more to do.
        if len(reviews) == 5:
            break

except Exception as e:
    # Best effort: report the problem and keep whatever was gathered so far.
    print(e)

finally:
    # The browser session is always closed, whether or not scraping succeeded.
    driver.quit()


# One row per scraped review; the column order mirrors the tuple returned by
# scrape_review_data.
df = pd.DataFrame(reviews, columns=['EarlyAccess', 'ReviewText', 'Review', 'ReviewLength', 'PlayHours', 'DatePosted'])

# "1,234.5 hrs on record" -> 1234.5. The thousands separator has to go before
# the float conversion, otherwise astype(float) raises on 1,000+ hours.
df['PlayHours'] = (
    df['PlayHours']
    .str.replace(' hrs on record', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
df['DatePosted'] = df['DatePosted'].str.replace('Posted: ', '', regex=False)

month_mapping = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12',
}

# Expected shape: "<day> <month name>" with an optional ", <year>" suffix.
# NOTE(review): assumes the page renders dates day-first and omits the year
# only for reviews posted in the current year — confirm against live pages.
date_parts = df['DatePosted'].str.extract(
    r'(?P<Day>\d+) (?P<Month>\w+)(?:, (?P<Year>\d{4}))?', expand=True
)
date_parts['Month'] = date_parts['Month'].map(month_mapping)
# Use the year from the page when present instead of a hard-coded one; fall
# back to the current year only when the page did not show a year.
date_parts['Year'] = date_parts['Year'].fillna(str(datetime.now().year))

df['DatePosted'] = pd.to_datetime(
    date_parts['Day'] + '/' + date_parts['Month'] + '/' + date_parts['Year'],
    format='%d/%m/%Y',
).dt.strftime('%d-%m-%Y')

# Semicolon-separated so commas inside review text do not need special care.
df.to_csv('steam_reviews.csv', encoding='utf-8', sep=';', index=False)


                                      

https://steamcommunity.com/app/291550/reviews/?p=1&browsefilter=mostrecent&filterLanguage=english


SessionNotCreatedException: Message: session not created
from unknown error: cannot find msedge binary
Stacktrace:
0   msedgedriver                        0x0000000100463660 msedgedriver + 4552288
1   msedgedriver                        0x000000010045b2ac msedgedriver + 4518572
2   msedgedriver                        0x00000001000492bc msedgedriver + 250556
3   msedgedriver                        0x0000000100074620 msedgedriver + 427552
4   msedgedriver                        0x0000000100072d08 msedgedriver + 421128
5   msedgedriver                        0x00000001000aeedc msedgedriver + 667356
6   msedgedriver                        0x00000001000ae728 msedgedriver + 665384
7   msedgedriver                        0x000000010007d5e8 msedgedriver + 464360
8   msedgedriver                        0x000000010007de74 msedgedriver + 466548
9   msedgedriver                        0x0000000100427f24 msedgedriver + 4308772
10  msedgedriver                        0x000000010042ce54 msedgedriver + 4329044
11  msedgedriver                        0x000000010040bb8c msedgedriver + 4193164
12  msedgedriver                        0x000000010042d4ec msedgedriver + 4330732
13  msedgedriver                        0x00000001003fd17c msedgedriver + 4133244
14  msedgedriver                        0x000000010044a9c8 msedgedriver + 4450760
15  msedgedriver                        0x000000010044ab18 msedgedriver + 4451096
16  msedgedriver                        0x000000010045aef4 msedgedriver + 4517620
17  libsystem_pthread.dylib             0x0000000183479f94 _pthread_start + 136
18  libsystem_pthread.dylib             0x0000000183474d34 thread_start + 8


In [21]:
#Stopwords
def remove_stopwords(sentences, stopwords):
    """Drop stop words from a sequence of tokens and join the rest with spaces.

    Matching is exact (case-sensitive): a token is removed only if it is equal
    to one of the entries in ``stopwords``.

    Args:
        sentences: Iterable of word tokens (e.g. the result of ``str.split()``).
        stopwords: Iterable of hashable tokens to remove.

    Returns:
        str: The remaining tokens joined by single spaces; ``""`` when nothing
        is left.
    """
    # A set makes each membership test O(1) instead of scanning the whole list.
    stopword_set = set(stopwords)
    # `' '.join(...)` — the original had a comma here, which raised NameError.
    return ' '.join(word for word in sentences if word not in stopword_set)

# Bind the word list to its own name. Assigning it to `stopwords` would
# overwrite the imported corpus module, and re-running this cell would then
# fail on `stopwords.words`.
english_stopwords = stopwords.words('english')
df['cleanedReviewText'] = df['ReviewText'].astype(str).apply(lambda x: remove_stopwords(x.split(), english_stopwords))

#Tokenization
# Inspect a single review: the row labelled 1 (the second row under the
# default RangeIndex).
example = df['cleanedReviewText'][1]

tokens = nltk.word_tokenize(example)

tagged = nltk.pos_tag(tokens)

#Sentiment analysis
from nltk.sentiment import SentimentIntensityAnalyzer

sie = SentimentIntensityAnalyzer()

# Keep only the 'compound' entry of each polarity-score dict.
df['PolarityScores'] = [sie.polarity_scores(x)['compound'] for x in df['cleanedReviewText']]

#Results
# `map` yields a numeric column; any label other than the two listed becomes
# NaN rather than staying behind as a string that would break the correlation.
df['ReviewValue'] = df['Review'].map({'Recommended': 1, 'Not Recommended': 0})

# Only the last expression of a cell is displayed automatically, so results
# produced mid-cell have to be emitted explicitly.
print(df[['ReviewValue', 'PolarityScores']].corr(method='pearson'))

px.box(df, x='Review', y='PolarityScores').show()

def plot_wordcloud(series, output_filename='wordcloud'):
    """Build a word cloud from a series of texts, save it and draw it.

    Every entry of ``series`` is converted to ``str`` and the entries are
    joined with single spaces to form the input text. The image is written to
    ``output_filename + '.png'`` and then drawn on the current matplotlib
    axes with the axis decorations turned off.

    Args:
        series: Object exposing ``astype`` (a pandas Series of texts).
        output_filename: Output path without the ``.png`` extension.

    Returns:
        None.
    """
    cloud_builder = WordCloud()
    corpus = ' '.join(series.astype(str))
    cloud = cloud_builder.generate(corpus)

    # Save to disk first, then render inline.
    cloud.to_file(output_filename + '.png')
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
# Word cloud of the stop-word-filtered review text; with the default
# output_filename the image is saved as 'wordcloud.png'.
plot_wordcloud(df['cleanedReviewText'])

In [None]:
# Scrape reviews while scrolling, until the page stops growing.
#
# NOTE(review): this cell needs a live `driver`. The first scraping cell ends
# with `driver.quit()`, so the browser session has to be created again before
# running this one.
reviews = []
steam_ids_set = set()
# Consecutive scrolls that may fail to move the page before giving up.
max_scroll_attempts = 5

try:
    last_position = get_current_scroll_position(driver)
    running = True
    while running:
        cards = driver.find_elements(By.CLASS_NAME, 'apphub_Card')

        # Only the 20 most recently loaded cards are examined on each pass.
        for card in cards[-20:]:
            steam_id = get_steam_id(card)
            if steam_id in steam_ids_set:
                continue
            review = scrape_review_data(card)
            reviews.append(review)
            # Remember the author so later passes skip this card. Without
            # this the set stayed empty and cards were appended repeatedly.
            steam_ids_set.add(steam_id)

        scroll_attempt = 0
        while scroll_attempt < max_scroll_attempts:
            scroll_to_bottom(driver)
            curr_position = get_current_scroll_position(driver)

            if curr_position != last_position:
                # The page moved, so new content may have loaded: scrape again.
                last_position = curr_position
                break

            # The page did not move. Count the stalled attempt and stop once
            # the retry budget is used up. The test is on the attempt counter;
            # the scroll offset says nothing about how often we retried.
            scroll_attempt += 1
            if scroll_attempt >= max_scroll_attempts:
                running = False
            else:
                # Give the page time to load more reviews before retrying.
                time.sleep(3)

except Exception as e:
    # Best effort: report the problem and keep whatever was gathered so far.
    print(e)

finally:
    # The browser session is always closed, whether or not scraping succeeded.
    driver.quit()
