In [1]:
import requests
import urllib
import re
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import secrets
import html2text
import unicodedata

In [6]:

pd.set_option('display.max_columns', None)

# Returns pandas array of reviews from requested steam url.
def get_reviews(url):
    # Initialize dataframe.
    df = pd.DataFrame(columns=['num_prod', 'num_rev', 'rate', 'hours', 'date', 'review', 'helpful', 'funny'])
    n = 0
    
    # Keeps track of number of reviews.
    review_num = 0

    # Requests from Steamworks API and crawls page.
    for i in range(5, 500):
        new_url = _get_request_url(url, i)
        print(new_url)
        page = requests.get(new_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        # Cleaning all the excessive \t and \n from API response.
        clean_text = page.text.replace('\\t', '').replace('\\n', '').replace('\\r', '').replace('\\/', '/')
        # Translate HTML code to remove the element tags and clear more room to see content.
        html_text = html2text.html2text(clean_text)
        
        # Figures out number of products in account, number of reviews posted, rating (R or NR), number of hours on record
        #    date posted, review content, helpful votes, and funny votes.
        num_prod = 0
        num_rev = 0
        rate = -1
        hours = 0
        date = ''
        review = ''
        helpful = 0
        funny = 0
        record_review = False
        
        # Reads the cleared HTML file line by line and extracts data.
        for i in html_text.splitlines():
            # If the line is empty then just ignore it.
            if len(i) == 0:
                continue
            
            # Removes any excess parenthesis (mostly weblinks) and brackets.
            line = re.sub(r" ?\[[^)]+\]", " ", re.sub(r" ?\([^)]+\)", "", i))
            
            # Searches for the number of products owned line since this is the first line of each review of a user.
            # We use this line as a start point for all reviews.
            if 'products' in line:
                data = np.array([num_prod, num_rev, rate, hours, date, review, helpful, funny])
                df.loc[n] = data
                n += 1
                temp = re.sub("\D", "", line)
                if not temp == '' and int(temp) > 0:
                    num_prod = int(temp)
                num_rev = 0
                rate = -1
                hours = 0
                date = ''
                review = ''
                helpful = 0
                funny = 0
                continue
                
            if num_rev != 0:
                if rate != -1:
                    if hours != 0:
                        if date != '':
                            if "Was this review helpful" in line:
                                record_review = False
                                review = review.encode('utf-8').decode('ascii','ignore')
                            if not record_review:
                                if helpful != 0:
                                    if 'found this review funny' in line:
                                        funny = re.sub("\D", "", line)
                                else:
                                    if 'found this review helpful' in line:
                                        helpful = re.sub("\D", "", line)
                            else:
                                review += line + ' '
                            if "Was this review helpful" in line:
                                record_review = False
                        else:
                            if 'Post' in line:
                                date = ' '.join(line.split(' ')[1:])
                                record_review = True
                    else:
                        if 'Recom' in line:
                            hours = int(re.sub("\D", "", line.split('.')[0]))
                else:
                    if 'Not' in line:
                        rate = 0
                    elif 'Recom' in line:
                        rate = 1
                        hours = int(re.sub("\D", "", line.split('.')[0]))
            else:
                temp = re.sub("\D", "", line)
                if 'Not' in line:
                    num_rev = 1
                    rate = 0
                elif 'Recom' in line:
                    num_rev = 1
                    rate = 1
                    hours = int(re.sub("\D", "", line.split('.')[0]))
                elif not temp == '' and int(temp) > 0:
                    num_rev = int(temp)
                    
        data = np.array([num_prod, num_rev, rate, hours, date, review, helpful, funny])
        df.loc[n] = data
        n += 1
        # Adds time to not overstress servers, following Steam guidelines of 200 requests per 5 minute.
        time.sleep(2)
        break

    print("Finished recording " + str(df.shape[0]) + " reviews.")
    return df.drop(0)

# Stores pandas array into filename specified.
def store_reviews(df, filename):
    df.to_csv(filename)
    print("Successfully stored!")

# Returns the request url for the nth page of reviews.
def _get_request_url(url, n):
    n = str(n * 20)
    return url + n
get_reviews(secrets.DOTA_URL)
#store_reviews(get_reviews(secrets.DOTA_URL), secrets.DOTA_FILE)
#store_reviews(get_reviews(secrets.PUBG_URL), secrets.PUBG_URL)
#store_reviews(get_reviews(secrets.CSGO_URL), secrets.CSGO_FILE)

https://store.steampowered.com/appreviews/570?filter=all&language=english&day_range=all&review_type=all&purchase_type=all&start_offset=100
Finished recording 21 reviews.


Unnamed: 0,num_prod,num_rev,rate,hours,date,review,helpful,funny
1,53,1,1,3581,"September 2, 2012",its k,218,2
2,17,1,1,4264,"May 30, 2017","If you want to have a great life with jobs, mo...",2226,828
3,1,1,1,3161,February 3,Ruins your social life.Makes you sad and happy...,221,193
4,109,1,1,1513,"May 27, 2017",You either quit while you are the feeder or pl...,105,43
5,492,1,1,1792,"June 26, 2014","If you enjoyed Metro 2033 or Metro Last Light,...",229,41
6,88,1,1,3543,"May 26, 2017",Skipped classes thanks to Dota. Learned to g...,513,463
7,14,1,1,6102,April 13,"""You learn how to learn, how to lose and most ...",383,57
8,580,8,1,3548,"December 16, 2013","Causes anxiety problems, low self esteem, Tour...",13831,199
9,1,1,1,5859,June 22,"When it came to dota, I played hard to climb u...",144,49
10,24,4,1,5834,"July 30, 2014",good,62,23
