In [1]:
import pandas as pd
import ast
import os

from bs4.diagnose import lxml_trace

# Raw exports are read from DATA_DIR and cleaned series are written to OUTPUT_DIR.
# Both values keep their trailing slash because file names are appended to them
# with plain string concatenation in the functions below.
DATA_DIR = "../data/romonitor_data/"
OUTPUT_DIR = "../data/romonitor_data_clean/"
# Create the output folder up front so the to_csv calls have a directory to write into.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_timeseries_from_data(file, value_colname):
    """Flatten one raw export into a tidy two-column CSV.

    Reads ``DATA_DIR + file``, keeps the rows whose ``name`` is populated and
    parses the ``data`` cell of the first such row as a Python dict literal.
    The dict's key/value pairs become a ``date`` / ``value_colname`` frame that
    is sorted by date and written to ``OUTPUT_DIR + value_colname + '.csv'``.

    Parameters
    ----------
    file : str
        File name of the raw export, appended to ``DATA_DIR``.
    value_colname : str
        Name given to the value column; also the stem of the output file.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (``YYYY-MM-DD`` strings) and ``value_colname``.
    """
    raw = pd.read_csv(DATA_DIR + file)
    # Rows without a 'name' are treated as metadata and dropped.
    named_rows = raw[raw['name'].notna()]
    # Only the first remaining row is used; its 'data' cell holds the dict literal.
    first_payload = named_rows.iloc[0]['data']
    points = ast.literal_eval(first_payload)

    ts_df = pd.DataFrame(list(points.items()), columns=['date', value_colname])
    parsed_dates = pd.to_datetime(ts_df['date'])
    # Dates are stored as ISO strings, so the string sort below is chronological.
    ts_df['date'] = parsed_dates.dt.strftime('%Y-%m-%d')
    ts_df = ts_df.sort_values('date').reset_index(drop=True)

    ts_df.to_csv(OUTPUT_DIR + value_colname + '.csv', index=False)
    print(f"Saved: {OUTPUT_DIR}{value_colname}.csv")
    return ts_df

def extract_and_sum_popular_games():
    """Sum the per-game series in ``popular_games.csv`` into one total per date.

    Reads ``DATA_DIR + 'popular_games.csv'``, keeps rows whose ``name`` is
    populated, and parses each row's ``data`` cell as a Python dict literal
    (date -> value). Rows whose cell cannot be parsed, or does not parse to a
    dict, are skipped. The remaining series are stacked and summed per calendar
    date, then written to ``OUTPUT_DIR + 'popular_games_total.csv'``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``date`` (``YYYY-MM-DD`` strings) and
        ``popular_games_total``; ``None`` (after printing a notice) when no
        row yielded a usable dict.
    """
    df = pd.read_csv(DATA_DIR + 'popular_games.csv')
    df = df[df['name'].notna()]
    game_dfs = []
    for _, row in df.iterrows():
        data_raw = row['data']
        try:
            data_dict = ast.literal_eval(data_raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # These are the failure modes ast.literal_eval documents for
            # malformed input (including a NaN cell). Catching only them keeps
            # the best-effort skip without swallowing KeyboardInterrupt or
            # SystemExit the way a bare ``except:`` would.
            continue
        if isinstance(data_dict, dict):
            game_df = pd.DataFrame(list(data_dict.items()), columns=['date', 'value'])
            game_df['date'] = pd.to_datetime(game_df['date'])
            game_dfs.append(game_df)
    if not game_dfs:
        print("No games with usable data found!")
        return None
    all_games = pd.concat(game_dfs)
    # Group on the formatted day string so timestamps on the same calendar day
    # collapse into one row; the grouping series is named 'date', which becomes
    # the column name after reset_index().
    sum_games = all_games.groupby(all_games['date'].dt.strftime('%Y-%m-%d'))['value'].sum().reset_index()
    sum_games = sum_games.rename(columns={'value': 'popular_games_total'})
    sum_games.to_csv(OUTPUT_DIR + 'popular_games_total.csv', index=False)
    print(f"Saved: {OUTPUT_DIR}popular_games_total.csv")
    return sum_games

# Clean and save time series: one (raw file, output column) pair per series.
for source_file, column_name in (
    ('ccu.csv', 'ccu'),
    ('registrations.csv', 'registrations'),
    ('session_length.csv', 'session_length'),
):
    extract_timeseries_from_data(source_file, column_name)
# Kept as the final bare expression so the notebook still displays the summed frame.
extract_and_sum_popular_games()

Saved: ../data/romonitor_data_clean/ccu.csv
Saved: ../data/romonitor_data_clean/registrations.csv
Saved: ../data/romonitor_data_clean/session_length.csv
Saved: ../data/romonitor_data_clean/popular_games_total.csv


Unnamed: 0,date,popular_games_total
0,2025-05-06,79758549.0
1,2025-05-07,79997871.0
2,2025-05-08,81165810.0
3,2025-05-09,89288600.0
4,2025-05-10,116785967.0
5,2025-05-11,114392357.0
6,2025-05-12,79589010.0
7,2025-05-13,75347467.0
8,2025-05-14,75645524.0
9,2025-05-15,75085390.0


In [2]:
def clean_quarterly_fundamental(filepath, prefix=None):
    """Reshape a metrics-by-period CSV into one row per period.

    The CSV at ``filepath`` is read with its first column as the index
    (metric names) and transposed, so the original column headers become
    rows. Those headers are parsed into a ``date`` column; values that cannot
    be parsed become ``NaT``.

    Parameters
    ----------
    filepath : str
        Path of the CSV to read.
    prefix : str, optional
        When truthy, prepended to every column name except ``date``.

    Returns
    -------
    pandas.DataFrame
        ``date`` followed by one column per metric, sorted by ``date`` with a
        fresh 0..n-1 index.
    """
    raw = pd.read_csv(filepath, index_col=0)
    # After the transpose the former headers sit in the index; reset_index()
    # exposes them as a column called 'index', which is renamed to 'date'.
    wide = raw.T.reset_index()
    wide = wide.rename(columns={'index': 'date'})
    wide['date'] = pd.to_datetime(wide['date'], errors='coerce')

    if prefix:
        metric_cols = [col for col in wide.columns if col != 'date']
        # 'date' is the first column at this point, so it leads the new header list.
        wide.columns = ['date'] + [f"{prefix}{col}" for col in metric_cols]

    return wide.sort_values('date').reset_index(drop=True)

# Build the three cleaned quarterly statements. Each frame is written to
# ../data/cleaned_quarterly_*.csv; the merge cell further down reads those
# files back, so these calls are part of the pipeline rather than optional demos.
# Cleaned income statement (metric columns prefixed "qis_")
qis = clean_quarterly_fundamental("../data/RBLX_quarterly_income_statement.csv", prefix="qis_")
qis.to_csv("../data/cleaned_quarterly_income_statement.csv", index=False)

# Cleaned balance sheet (metric columns prefixed "qbs_")
qbs = clean_quarterly_fundamental("../data/RBLX_quarterly_balance_sheet.csv", prefix="qbs_")
qbs.to_csv("../data/cleaned_quarterly_balance_sheet.csv", index=False)

# Cleaned cash flow (metric columns prefixed "qcf_")
qcf = clean_quarterly_fundamental("../data/RBLX_quarterly_cashflow.csv", prefix="qcf_")
qcf.to_csv("../data/cleaned_quarterly_cashflow.csv", index=False)

In [3]:
# SEC filing data is parsed and extracted outside this notebook.

# Refer to sec_file_cleaning.py for that step.

In [4]:
# Merge All the data together

import pandas as pd

def standardize_date_column(df):
    """Return ``df`` with its date column named exactly ``'date'``.

    The first column whose name equals ``'date'`` ignoring case is renamed to
    ``'date'`` on a new frame. When no column matches, the input frame itself
    is returned unchanged.
    """
    # Stop at the first case-insensitive match; any later matches are left alone.
    match = next((col for col in df.columns if col.lower() == 'date'), None)
    if match is None:
        return df
    return df.rename(columns={match: 'date'})

def move_to_monday(date):
    """Roll a weekend date forward to the following Monday.

    Saturday (``weekday() == 5``) moves forward two days and Sunday
    (``weekday() == 6``) one day. Any other input is returned as-is.
    """
    # weekday() -> number of days to add; anything not listed is left alone.
    days_forward = {5: 2, 6: 1}.get(date.weekday())
    if days_forward is None:
        return date
    return date + pd.Timedelta(days=days_forward)

# --- Load Data ---
# Frames that are outer-merged on 'date' further down.
stock = pd.read_csv("../data/RBLX_with_technicals.csv")
sp500 = pd.read_csv("../data/SP500.csv")
nasdaq = pd.read_csv("../data/Nasdaq.csv")
# Series written to ../data/romonitor_data_clean/ by the first cell of this notebook.
popular_games_total = pd.read_csv("../data/romonitor_data_clean/popular_games_total.csv")
registrations = pd.read_csv("../data/romonitor_data_clean/registrations.csv")
ccu = pd.read_csv("../data/romonitor_data_clean/ccu.csv")
session_length = pd.read_csv("../data/romonitor_data_clean/session_length.csv")
sentiment = pd.read_csv("../data/clean_news_sentiment.csv")
market = pd.read_csv("../data/market_context_data.csv")
sec = pd.read_csv("../data/sec_filings_features.csv")
# Cleaned quarterly statements written by the clean_quarterly_fundamental cell.
# These are as-of joined onto the merged rows rather than outer-merged.
qbs = pd.read_csv("../data/cleaned_quarterly_balance_sheet.csv")
qcf = pd.read_csv("../data/cleaned_quarterly_cashflow.csv")
qis = pd.read_csv("../data/cleaned_quarterly_income_statement.csv")

# --- Standardize Columns and Dates ---
# Index frames: unify the date header first, then prefix every other column so
# the S&P 500 and Nasdaq fields stay identifiable once everything is merged.
sp500 = standardize_date_column(sp500)
nasdaq = standardize_date_column(nasdaq)
sp500 = sp500.rename(columns={col: f"SP500_{col}" for col in sp500.columns if col != "date"})
nasdaq = nasdaq.rename(columns={col: f"Nasdaq_{col}" for col in nasdaq.columns if col != "date"})

# --- Aggregating sentiment scores ---
# Standardize the header before touching sentiment['date'], as is done for
# every other frame, so a 'Date'/'DATE' header does not raise KeyError here.
sentiment = standardize_date_column(sentiment)
sentiment['date'] = pd.to_datetime(sentiment['date'], errors='coerce')
# Weekend rows are rolled forward to the following Monday before averaging.
sentiment['date'] = sentiment['date'].apply(move_to_monday)
# One row per date holding the mean of the four score columns. groupby drops
# rows whose date could not be parsed (NaT keys) by default.
sentiment = sentiment.groupby('date', as_index=False)[['neg', 'neu', 'pos', 'compound']].mean()

dfs = [
    stock, sp500, nasdaq, popular_games_total, registrations,
    ccu, session_length, sentiment, market, sec
]
dfs = [standardize_date_column(df) for df in dfs]

# Add quarterly dfs to standardize and convert dates too (but don't merge yet)
quarterly_dfs = [qbs, qcf, qis]
quarterly_dfs = [standardize_date_column(df) for df in quarterly_dfs]

# Ensure all 'date' columns are datetime type; unparseable values become NaT.
for df in dfs + quarterly_dfs:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

# --- Merge All Daily DataFrames ---
# Outer join on 'date' so a date present in any one source is kept.
master = dfs[0]
for df in dfs[1:]:
    master = master.merge(df, on='date', how='outer')

# --- Forward-fill Quarterly Data to Daily Rows ---
# merge_asof needs both sides sorted on the key and raises on null keys. Dates
# were parsed with errors='coerce', so NaT rows are possible; drop them first.
# On the master side this does not change the final result, because the IPO
# filter below discards NaT dates anyway (NaT >= ipo_date is False).
master = master.dropna(subset=['date'])
# NOTE(review): quarterly rows are matched on their statement date. If that is
# the fiscal period end rather than the filing date, the values show up in
# master before they were public - confirm this is acceptable for the use case.
for quarterly_df in quarterly_dfs:
    # A quarterly row without a date cannot match any daily row, so it is dropped.
    quarterly_df = quarterly_df.dropna(subset=['date']).sort_values('date')
    # direction='backward': each row takes the latest quarterly row dated on or before it.
    master = pd.merge_asof(master.sort_values('date'), quarterly_df, on='date', direction='backward')

# --- Filter to IPO date or later ---
ipo_date = pd.to_datetime("2021-03-10")
master = master[master["date"] >= ipo_date]

# --- Final Touches ---
master = master.sort_values('date').reset_index(drop=True)
master.to_csv("../data/master_dataset.csv", index=False)
print("Master dataset saved as master_dataset.csv (with quarterly features forward-filled)")

Master dataset saved as master_dataset.csv (with quarterly features forward-filled)
