In [None]:
import os
import time
import sys
import duckdb

In [None]:
# Load the parquet extract that was already filtered down to English reviews
# into a fresh in-memory DuckDB table named `data`.
# The file is looked up under data/steam/ in the parent of the current working directory.
path = f'{os.path.dirname(os.getcwd())}/data/steam/english_review.parquet'
db = duckdb.connect(":memory:")
# The path is spliced into a SQL string literal, so double any single quote in it;
# otherwise a directory name containing an apostrophe would break the statement.
sql_path = path.replace("'", "''")
db.sql(f"CREATE TABLE data AS SELECT * FROM read_parquet('{sql_path}')")

In [None]:
# Alternative loader: read the already-cleaned English reviews from the current
# working directory into a fresh in-memory DuckDB table named `data`.
# NOTE: this rebinds `db`, so it replaces whatever the previous loading cell created.
# The file name matches the one written by the COPY statement in the
# "Cleaning Data Further" cell, so that cell must have been run at least once
# (from this working directory) before this one can succeed.
path = f'{os.getcwd()}/cleaned_english_reviews.parquet'
db = duckdb.connect(":memory:")
# The path is spliced into a SQL string literal, so double any single quote in it;
# otherwise a directory name containing an apostrophe would break the statement.
sql_path = path.replace("'", "''")
db.sql(f"CREATE TABLE data AS SELECT * FROM read_parquet('{sql_path}')")

In [4]:
# Preview the first ten rows to confirm the table loaded and to see its columns.
preview_query = "SELECT * FROM data LIMIT 10"
db.sql(preview_query)

┌────────┬───────────────────┬────────────────┬──────────────────────┬────────────────────┬─────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

### Last 2 Week Playtime

Some older reviews may contain inaccurate playtimes, for example a two-week playtime that is larger than the total playtime.

[Steam users discussing issue](https://steamcommunity.com/groups/SteamClientBeta/discussions/0/3546050190331497294/)

In [5]:
# Rows whose playtime over the last two weeks exceeds their total playtime,
# which cannot be correct. (Plain string: the query has nothing to interpolate.)
db.sql("SELECT * FROM data WHERE total_playtime < last_2_week_playtime")

┌─────────┬───────────────────┬────────────────┬──────────────────────┬────────────────────┬─────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────┬──────────┬─────────────┬─────────────────────┬───────────────┬───────────────────────┬───────────────────────────┐
│ app_id  │      user_id      │ total_playtime │ last_2_week_playtime │ playtime_at_review │ last_played │                                                                                                                                                                  

In [6]:
# Upper bound on the minutes that fit into two weeks: 60 min * 24 h * 14 days = 20160.
MAX_2_WEEK_MINUTES = 60 * 24 * 14
# Any row above the bound reports more two-week playtime than is physically possible.
db.sql(f"SELECT * FROM data WHERE last_2_week_playtime > {MAX_2_WEEK_MINUTES}")

┌────────┬───────────────────┬────────────────┬──────────────────────┬────────────────────┬─────────────┬──────────────────────────┬──────────┬──────────┬─────────────┬─────────────────────┬───────────────┬───────────────────────┬───────────────────────────┐
│ app_id │      user_id      │ total_playtime │ last_2_week_playtime │ playtime_at_review │ last_played │           text           │ voted_up │ votes_up │ votes_funny │ weighted_vote_score │ comment_count │ steam_purchase_status │ write_during_early_access │
│ int64  │       int64       │     int64      │        int64         │       int64        │    int64    │         varchar          │  int64   │  int64   │    int64    │       double        │     int64     │         int64         │           int64           │
├────────┼───────────────────┼────────────────┼──────────────────────┼────────────────────┼─────────────┼──────────────────────────┼──────────┼──────────┼─────────────┼─────────────────────┼───────────────┼─────────────────

### Verify Review Content

In [7]:
# Reviews with no content at all. NULL text is checked explicitly because
# LENGTH(NULL) is NULL and would otherwise never satisfy `= 0`.
db.sql("SELECT * FROM data WHERE text IS NULL OR LENGTH(text) = 0")

┌────────┬─────────┬────────────────┬──────────────────────┬────────────────────┬─────────────┬─────────┬──────────┬──────────┬─────────────┬─────────────────────┬───────────────┬───────────────────────┬───────────────────────────┐
│ app_id │ user_id │ total_playtime │ last_2_week_playtime │ playtime_at_review │ last_played │  text   │ voted_up │ votes_up │ votes_funny │ weighted_vote_score │ comment_count │ steam_purchase_status │ write_during_early_access │
│ int64  │  int64  │     int64      │        int64         │       int64        │    int64    │ varchar │  int64   │  int64   │    int64    │       double        │     int64     │         int64         │           int64           │
├────────┴─────────┴────────────────┴──────────────────────┴────────────────────┴─────────────┴─────────┴──────────┴──────────┴─────────────┴─────────────────────┴───────────────┴───────────────────────┴───────────────────────────┤
│                                                                       

In [8]:
# Reviews shorter than two characters, longest first.
short_reviews_query = "SELECT * FROM data WHERE LENGTH(text) < 2 ORDER BY LENGTH(text) DESC"
db.sql(short_reviews_query)

┌────────┬─────────┬────────────────┬──────────────────────┬────────────────────┬─────────────┬─────────┬──────────┬──────────┬─────────────┬─────────────────────┬───────────────┬───────────────────────┬───────────────────────────┐
│ app_id │ user_id │ total_playtime │ last_2_week_playtime │ playtime_at_review │ last_played │  text   │ voted_up │ votes_up │ votes_funny │ weighted_vote_score │ comment_count │ steam_purchase_status │ write_during_early_access │
│ int64  │  int64  │     int64      │        int64         │       int64        │    int64    │ varchar │  int64   │  int64   │    int64    │       double        │     int64     │         int64         │           int64           │
├────────┴─────────┴────────────────┴──────────────────────┴────────────────────┴─────────────┴─────────┴──────────┴──────────┴─────────────┴─────────────────────┴───────────────┴───────────────────────┴───────────────────────────┤
│                                                                       

### Language Marked Incorrectly


The regex below is adapted from [an answer by Michael G. on Stack Overflow](https://stackoverflow.com/questions/8961833/regular-expression-to-match-english-words-with-some-other-characters).

In [9]:
# Use a regex to find reviews that contain at least one character outside the
# allowed set: whitespace, word characters, digits, ASCII punctuation, and the
# curly quotes / en and em dashes listed at the end of the character class.
# NOT (...) is used instead of `... = false` so the negation does not depend on
# operator precedence between SIMILAR TO and `=`.
db.sql(r"SELECT text FROM data WHERE NOT (text SIMILAR TO '^[\s\w\d\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e‘’“”–—]*$')")

┌─────────┐
│  text   │
│ varchar │
├─────────┤
│ 0 rows  │
└─────────┘

In [10]:
# Check for multiple reviews by the same user on the same app:
# `dup_count` collects the (app_id, user_id) pairs that occur more than once,
# and the join lists the text of every review belonging to such a pair.
duplicate_reviews_query = """
    WITH dup_count AS (
        SELECT app_id, user_id, COUNT(1) AS rev_num FROM data GROUP BY app_id, user_id HAVING rev_num > 1
    )
       
    SELECT data.app_id, data.user_id, data.text FROM data JOIN dup_count ON data.app_id = dup_count.app_id AND data.user_id = dup_count.user_id
"""
db.sql(duplicate_reviews_query)

┌────────┬─────────┬─────────┐
│ app_id │ user_id │  text   │
│ int64  │  int64  │ varchar │
├────────┴─────────┴─────────┤
│           0 rows           │
└────────────────────────────┘

### Cleaning Data Further

In [None]:
# The playtime inconsistencies found above are left as-is; they should not
# affect the review-text analytics.

# Write the cleaned review set to 'cleaned_english_reviews.parquet' (relative path):
#   1. keep only reviews made up entirely of the allowed characters
#      (drops reviews containing emojis or other non-standard characters),
#   2. drop reviews that are one character long or shorter,
#   3. keep a single review per (app_id, user_id): the one with the most up-votes.
# QUALIFY filters on the window function directly, so the column list only has
# to be written once instead of being repeated across intermediate CTEs.
db.sql(r"""
    COPY (
        SELECT
            app_id,
            user_id,
            total_playtime,
            last_2_week_playtime,
            playtime_at_review,
            last_played,
            text,
            voted_up,
            votes_up,
            votes_funny,
            weighted_vote_score,
            comment_count,
            steam_purchase_status,
            write_during_early_access
        FROM
            data
        WHERE
            -- Remove reviews using non standard characters and emojis
            text SIMILAR TO '^[\s\w\d\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e‘’“”–—]*$'
            -- Remove reviews that are too short
            AND LENGTH(text) > 1
        -- Keep the most up-voted review per (app_id, user_id). The extra sort
        -- keys break ties, so the kept row no longer depends on row order
        -- unless two rows tie on every key listed here.
        QUALIFY
            ROW_NUMBER() OVER (
                PARTITION BY app_id, user_id
                ORDER BY
                    votes_up DESC,
                    weighted_vote_score DESC,
                    votes_funny DESC,
                    comment_count DESC,
                    text
            ) = 1
    )
    TO 'cleaned_english_reviews.parquet'
    WITH (FORMAT PARQUET);
""")