In [1]:
# Standard library
import gc
import json
import os
from pathlib import Path

# Third-party
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from tqdm.autonotebook import trange, tqdm

# Project-local helpers
import utilities.utilities as util

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Load the variables defined in 'config.env' into the process environment.
load_dotenv(find_dotenv('config.env'))

# Fail with a readable message when the dataset location is not configured;
# Path(None) would otherwise raise an unhelpful TypeError.
_json_dataset = os.environ.get('JSON_DATASET')
if _json_dataset is None:
    raise RuntimeError(
        "Environment variable 'JSON_DATASET' is not set; "
        "define it in config.env or in the environment."
    )
JSON_LOGS_PATH = Path(_json_dataset)


### Verify `fastparquet` is installed
We need `fastparquet` to store a pandas DataFrame that contains categorical string data. Note: `pyarrow` does not support this!
If the following cell fails, install `snappy` and `fastparquet` through conda by running: `conda install -c conda-forge python-snappy fastparquet snappy`

In [2]:
# Ask pandas for its fastparquet engine. If this line fails, run
# 'conda install -c conda-forge python-snappy fastparquet snappy'.
pd.io.parquet.get_engine('fastparquet')

<pandas.io.parquet.FastParquetImpl at 0x7f9fbc5c5ca0>

In [3]:
# Print the versions of pandas and its dependencies.
# Verify that the report contains 'fastparquet : 0.5.0'.
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : b5958ee1999e9aead1938c0bba2b674378807b3d
python           : 3.8.6.final.0
python-bits      : 64
OS               : Linux
OS-release       : 4.15.0-112-generic
Version          : #113-Ubuntu SMP Thu Jul 9 23:41:39 UTC 2020
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : en_US.UTF-8
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.1.5
numpy            : 1.19.5
pytz             : 2020.5
dateutil         : 2.8.1
pip              : 20.3.3
setuptools       : 49.6.0.post20210108
Cython           : 0.29.21
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.6.3
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.19.0
pandas_datareader: None
bs4              : 4.9.3
bottleneck  

In [4]:
# Force 'fastparquet' as the engine pandas uses for parquet I/O.
pd.set_option("io.parquet.engine", 'fastparquet')

# Creating the Metatable

In [5]:
# Directory that receives the generated MetaTable files; change it to suit your setup.
output_path = Path('meta_tables')
# Create the directory (and any missing parents); do nothing if it already exists.
output_path.mkdir(parents=True, exist_ok=True)

### Generate MetaTable for each Year
We will create a checkpoint for each year as a protection against running out of memory.

In [6]:
def _round_rows(log, log_id, log_name):
    """Build one MetaTable row (a dict) for every round of a parsed game log.

    Args:
        log: Parsed JSON log. ``log['rounds']`` is iterated as a sequence of
            rounds, each a list of action dicts whose first entry must carry
            the ``'INIT'`` tag.
        log_id: Value stored in the ``log_id`` column of every row.
        log_name: File name used in error messages.

    Returns:
        A list with one dict per round, in round order.

    Raises:
        Exception: If a round is empty, does not start with ``INIT``, or has
            no action after ``INIT`` to take the end scores from.
    """
    round_rows = []

    # State carried from one round to the next within a single game.
    previous_dealer = 0
    round_wind = 0

    for round_number, actions in enumerate(log['rounds']):
        if not actions or actions[0]['tag'] != 'INIT':
            raise Exception(f"{log_name} does not have INIT!")
        if len(actions) < 2:
            raise Exception(f"{log_name} round {round_number} has no actions after INIT!")

        init_data = actions[0]['data']
        final_action = actions[-1]

        # Check if dealership has been transferred.
        current_dealer = int(init_data['oya'])
        if previous_dealer != current_dealer:
            previous_dealer = current_dealer
            # Dealership returning to seat 0 means a full circle was completed:
            # advance the round wind, wrapping around after the fourth wind.
            if current_dealer == 0:
                round_wind = (round_wind + 1) % 4

        # -1 marks a round that ended without a winner (the last action is not AGARI).
        winner = -1
        if final_action['tag'] == 'AGARI':
            winner = final_action['data']['winner']

        start_scores = init_data['scores']
        end_scores = final_action['data']['scores']

        row = {
            'log_id': log_id,
            'round': round_number,

            'round_wind': round_wind,
            'dealer': current_dealer,
            'winner': winner,

            'honba': init_data['combo'],
            'riichibo': init_data['reach'],
        }
        # Start scores for all four players first, then end scores, so the
        # column order matches p0..p3_start_score followed by p0..p3_end_score.
        for player in range(4):
            row[f'p{player}_start_score'] = start_scores[player]
        for player in range(4):
            row[f'p{player}_end_score'] = end_scores[player]

        round_rows.append(row)

    return round_rows


years = util.get_all_logs_annually(JSON_LOGS_PATH)
rows = []
for year, logs in years:
    for log_json in logs:
        # Close every log file as soon as it has been parsed instead of
        # leaving the handle open until it is garbage-collected.
        with log_json.open() as log_file:
            log = json.load(log_file)

        rows.extend(_round_rows(log, log_json.stem, log_json.name))

df = pd.DataFrame(rows)
df

<generator object get_all_logs_annually at 0x7f9ea35e94a0>


2009:   0%|          | 0/80156 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Merge all MetaTables into a single DataFrame

In [None]:
# Per-column memory footprint in bytes, before the dtype optimisation below.
df.memory_usage(deep=True)

In [None]:
# Column dtypes and non-null counts, before the dtype optimisation below.
df.info()

#### Change Column Type to lower memory usage
We want to lower the memory usage of our merged MetaTable DataFrame from about 2 GB to about 1 GB.

In [None]:
# Columns that together identify one round of one game log.
index_cols = ['log_id', 'round']

# Bring the key columns back out of the index so this cell can be re-run after
# the MultiIndex has been created. An unnamed single-level index (the default
# one of a freshly built frame) carries no information, so drop it: a plain
# reset_index() would keep it as a spurious 'index' column in the saved files.
if df.index.nlevels == 1 and df.index.name is None:
    df = df.reset_index(drop=True)
else:
    df = df.reset_index()

# Repeated / low-cardinality values are stored far more compactly as categories.
categorical_cols = ['round_wind', 'dealer', 'winner']
for col in index_cols + categorical_cols:
    df[col] = df[col].astype('category')

# Counters: shrink to the smallest unsigned integer type that fits.
cols = ['honba', 'riichibo']
for col in cols:
    df[col] = pd.to_numeric(df[col], downcast='unsigned')

# Scores: shrink to the smallest signed integer type that fits.
for i in range(4):
    df[f'p{i}_start_score'] = pd.to_numeric(df[f'p{i}_start_score'], downcast='integer')
    df[f'p{i}_end_score'] = pd.to_numeric(df[f'p{i}_end_score'], downcast='integer')

df = df.set_index(index_cols)  # Create MultiIndex

In [None]:
# Per-column memory footprint in bytes, after the dtype optimisation.
df.memory_usage(deep=True)

In [None]:
# Column dtypes and non-null counts, after the dtype optimisation.
df.info()

In [None]:
# Persist the optimised table as parquet inside output_path, using fastparquet.
df.to_parquet(output_path / 'log_round_meta.parquet', engine='fastparquet')

In [9]:
# Reload the table from the location it was saved to (output_path) rather than
# the current working directory, so this cell works on a fresh kernel run.
df = pd.read_parquet(output_path / 'log_round_meta.parquet', engine='fastparquet')

In [12]:
# Also export the table as CSV inside output_path.
df.to_csv(output_path / 'log_round_meta.csv')

### Test if newly created optimized version works

In [None]:
# Read back the parquet file stored in output_path instead of a machine-specific
# absolute path, so the check exercises the newly created file on any machine.
accumulated = pd.read_parquet(output_path / 'log_round_meta.parquet', engine='fastparquet')

In [None]:
# Confirm the reloaded table's dtypes and index.
accumulated.info()

In [None]:
# Accessing MultiIndexed rows: pass the (log_id, round) key as one tuple and
# select all columns explicitly, so the second element cannot be mistaken for
# a column label.
accumulated.loc[('2019123123gm-00e1-0000-f7f33877', 3), :]

In [None]:
# (number of rows, number of columns) of the reloaded table.
accumulated.shape

In [None]:
# Display the reloaded table.
accumulated