In [2]:
import pandas as pd
import re, ast, json
from datetime import datetime

In [1]:
# Path to the raw reviews file. A later cell opens it as UTF-8 text and parses
# each line with ast.literal_eval. Relative paths resolve against the kernel's
# working directory.
DATASET_FILE_PATH = "australian_user_reviews.json"

In [3]:
def parse_steam_date(date_str, default_year=2018):
    """Parse a review timestamp such as ``"Posted November 5, 2011."``.

    The ``"Posted "`` prefix and every period are removed, surrounding
    whitespace is stripped, and the remainder is parsed with the format
    ``"%B %d, %Y"``. When the cleaned text does not end in four digits,
    ``", <default_year>"`` is appended before parsing.

    Args:
        date_str: Raw ``posted`` string from a review record.
        default_year: Year assumed for dates that carry no year. Defaults to
            2018, the value this notebook previously hard-coded.
            NOTE(review): confirm this matches the year the data was
            collected; a yearless "February 29" only parses when this is a
            leap year.

    Returns:
        A ``datetime`` on success, or ``None`` when the input is not a string
        or does not match the expected format.
    """
    if not isinstance(date_str, str):
        return None

    # Strip so stray whitespace cannot hide a trailing year from the `$` anchor.
    cleaned = date_str.replace("Posted ", "").replace(".", "").strip()

    if not re.search(r'\d{4}$', cleaned):
        cleaned = f"{cleaned}, {default_year}"

    try:
        return datetime.strptime(cleaned, "%B %d, %Y")
    except ValueError:
        # Unparseable text is reported as None rather than raising.
        return None

In [4]:
def parse_helpful(helpful_str):
    """Convert a helpfulness blurb into the ratio ``helpful / total``.

    The text is searched for the pattern ``"<N> of <M>"``. Both numbers may
    contain thousands separators (``"1,234 of 1,500"``), which are removed
    before conversion.

    Args:
        helpful_str: Raw ``helpful`` string from a review record.

    Returns:
        ``N / M`` as a float. ``0.0`` is returned when the input is not a
        string, contains ``"No ratings yet"``, has no ``"<N> of <M>"``
        pattern, or when ``M`` is zero.
    """
    if not isinstance(helpful_str, str) or "No ratings yet" in helpful_str:
        return 0.0

    # Accept comma-grouped digits: a plain \d+ would match only "234 of 1"
    # inside "1,234 of 1,500" and produce a ratio far above 1.
    match = re.search(r'(\d[\d,]*) of (\d[\d,]*)', helpful_str)
    if match is None:
        return 0.0

    helpful_votes, total_votes = (
        int(group.replace(",", "")) for group in match.groups()
    )
    if total_votes == 0:
        return 0.0
    return helpful_votes / total_votes

In [5]:
# Each non-blank line is parsed with ast.literal_eval, which accepts only
# Python literals and does not execute code. Presumably the file stores Python
# dict literals rather than strict JSON despite its extension; verify.
with open(DATASET_FILE_PATH, 'r', encoding='utf-8') as f:
    # Blank lines (e.g. a trailing newline at EOF) are skipped because
    # literal_eval raises SyntaxError on empty input.
    data = [ast.literal_eval(line) for line in f if line.strip()]

In [7]:
# Flatten the nested user -> reviews structure into one flat record per review.
rows = []
for user in data:
    user_id = user['user_id']
    rows.extend(
        {
            'user_id': user_id,
            'item_id': review['item_id'],
            # Store the recommendation flag as an integer 0/1.
            'recommend': 1 if review['recommend'] else 0,
            'date': parse_steam_date(review['posted']),
            'helpful_score': parse_helpful(review['helpful']),
            # The raw text is carried along for possible later use.
            'review_text': review['review'],
        }
        for review in user['reviews']
    )

In [8]:
# Build the interaction table, discard rows whose date could not be parsed,
# and order the remainder chronologically.
df = (
    pd.DataFrame(rows)
    .dropna(subset=['date'])
    .sort_values(by='date')
)

print(f"Total interactions: {len(df)}")
df.head()

Total interactions: 59280


Unnamed: 0,user_id,item_id,recommend,date,helpful_score,review_text
20764,TheWhipster,39690,1,2010-10-16,0.4,You'll Be Emo'd by the end of it!
29168,eddy96,40700,1,2010-10-25,0.0,A very addictive puzzle game with incredible a...
29807,farsel,24010,1,2010-11-19,0.416667,this is the most epic of epics and these train...
29801,farsel,15320,1,2010-11-20,0.888889,brill plane simulator where you can do some si...
29804,farsel,25700,1,2010-11-20,0.714286,"im mad about balls, and this game"
