# Count Rows
The following code counts the number of rows in our discard dataset, broken down by game phase (early, mid, end).

In [17]:
import os
from pathlib import Path

import numpy as np
import scipy.sparse

from tqdm import tqdm

from dotenv import load_dotenv, find_dotenv

# Locate `config.env` and load its variables into the process environment.
load_dotenv(find_dotenv('config.env'))

# Root directory of the discard dataset, read from the DISCARD_DATASET variable.
# Fail early with a readable message: Path(None) would raise an opaque TypeError,
# and an empty value would silently resolve to the current working directory.
_discard_dataset_dir = os.environ.get('DISCARD_DATASET')
if not _discard_dataset_dir:
    raise RuntimeError(
        "Environment variable 'DISCARD_DATASET' is not set; "
        "define it in config.env or export it before running this notebook."
    )

DATASET_PATH = Path(_discard_dataset_dir)

In [18]:
# Show every entry directly below the dataset root, one path per line.
dataset_entries = DATASET_PATH.iterdir()
for entry in dataset_entries:
    print(entry)

/home/jovyan/MasterThesis/data/discard_datasets/2018
/home/jovyan/MasterThesis/data/discard_datasets/2017
/home/jovyan/MasterThesis/data/discard_datasets/2019
/home/jovyan/MasterThesis/data/discard_datasets/2016
/home/jovyan/MasterThesis/data/discard_datasets/.ipynb_checkpoints


In [19]:
def generate_phase_column(
    array: np.ndarray,
    discard_start: int = 238,
    early_max: int = 24,
    mid_max: int = 48,
) -> "tuple[np.ndarray, np.ndarray, np.ndarray]":
    """Split the rows of ``array`` into early-, mid- and end-game groups.

    The phase of a row is decided by the sum of its discard columns
    (all columns from ``discard_start`` onwards, i.e. all pools merged):

    * early game: every row that is neither mid nor end game
      (sum ``<= early_max``)
    * mid game:   ``early_max < sum <= mid_max``
    * end game:   ``sum > mid_max``

    Parameters
    ----------
    array : np.ndarray
        2-D array with one row per sample.
    discard_start : int, default 238
        Index of the first discard column.
    early_max : int, default 24
        Largest discard sum that still counts as early game.
    mid_max : int, default 48
        Largest discard sum that still counts as mid game.

    Returns
    -------
    tuple of np.ndarray
        ``(early_rows, mid_rows, end_rows)``; each keeps all columns of
        ``array`` and the original row order.
    """
    # Begin with merging all pools together
    discard_counts = np.sum(array[:, discard_start:], axis=1)

    mid_game = (early_max < discard_counts) & (discard_counts <= mid_max)
    end_game = mid_max < discard_counts
    # Everything not claimed by a later phase is early game.
    early_game = ~(mid_game | end_game)

    return array[early_game], array[mid_game], array[end_game]

In [20]:
# Row counts per game phase (index 0 = early, 1 = mid, 2 = end), keyed by year.
phase_discards = {}

# Sort so the years are processed in a deterministic order; iterdir() yields
# entries in arbitrary order.
for year in sorted(DATASET_PATH.iterdir()):
    # Only directories named after a year are processed. This skips
    # `.ipynb_checkpoints` as well as any other stray file or folder that
    # would otherwise make int(year.name) fail.
    if not (year.is_dir() and year.name.isdigit()):
        continue

    # Explicit 64-bit accumulator: the totals reach tens of millions.
    current_phase_discards = np.zeros(3, dtype=np.int64)

    # Ignore sub-directories and hidden entries (e.g. notebook checkpoints);
    # every remaining file is loaded as a saved sparse matrix.
    sparse_files = [
        entry for entry in year.iterdir()
        if entry.is_file() and not entry.name.startswith(".")
    ]
    for game in tqdm(sparse_files, desc=year.name):

        rows = scipy.sparse.load_npz(game).toarray()
        phase_arrays = generate_phase_column(rows)

        # Add the number of rows that fell into each of the three phases.
        current_phase_discards += [len(phase_rows) for phase_rows in phase_arrays]

    phase_discards[int(year.name)] = current_phase_discards

2018: 100%|██████████| 173548/173548 [06:10<00:00, 468.22it/s]
2017: 100%|██████████| 169384/169384 [05:56<00:00, 475.60it/s]
2019: 100%|██████████| 171629/171629 [05:48<00:00, 492.64it/s]
2016: 100%|██████████| 159665/159665 [05:38<00:00, 471.27it/s]


# To DataFrame
Collecting the counts in a DataFrame makes it easy to export them as a LaTeX table if needed.

Documentation: [pandas.DataFrame.to_latex()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_latex.html)

In [81]:
import pandas as pd

# One row per year and one column per game phase, extended with a row-wise
# total and ordered by year.
phase_columns = ['phase_0', 'phase_1', 'phase_2']
df = (
    pd.DataFrame.from_dict(phase_discards, orient="index", columns=phase_columns)
    .assign(total=lambda counts: counts.sum(axis=1))
    .sort_index(axis=0)
)
df

Unnamed: 0,phase_0,phase_1,phase_2,total
2016,10766723,38447335,28420353,77634411
2017,11409454,40751004,30250860,82411318
2018,11596765,41775222,30798003,84169990
2019,11406683,41303327,30340369,83050379


In [82]:
# Print the LaTeX source as plain text so it can be copied into a document.
# `float_format` must be a printf-style pattern such as "%.2f" (or a callable);
# the bare '.2f' is not a valid pattern and only went unnoticed because every
# column here holds integers.
print(df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrr}
\toprule
{} &   phase\_0 &   phase\_1 &   phase\_2 &     total \\
\midrule
2016 &  10766723 &  38447335 &  28420353 &  77634411 \\
2017 &  11409454 &  40751004 &  30250860 &  82411318 \\
2018 &  11596765 &  41775222 &  30798003 &  84169990 \\
2019 &  11406683 &  41303327 &  30340369 &  83050379 \\
\bottomrule
\end{tabular}

