In [1]:
import pandas as pd
import numpy as np

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
import ticcl_output_reader

In [3]:
def load_cpp():
    """Load the benchmark file through ``ticcl_output_reader``.

    The reader's result is indexed positionally: element 0 is used as the
    ``key`` column, element 1 as ``list_index`` and element 2 as ``value``.

    Returns
    -------
    pandas.DataFrame
        A frame with a ``value`` column, indexed by (``key``, ``list_index``).
    """
    raw_columns = ticcl_output_reader.load_confuslist_index(
        "pandas_loading_benchmarks_data.txt"
    )
    records = {
        "key": raw_columns[0],
        "list_index": raw_columns[1],
        "value": raw_columns[2],
    }
    # "key" and "list_index" are moved into the index; "value" stays as data.
    return pd.DataFrame.from_records(records, index=["key", "list_index"])

In [4]:
def load_pandas_apply(path='pandas_loading_benchmarks_data.txt'):
    """Load a ``key#v1,v2,...`` file with pandas and return it in long format.

    Every input line is split on ``#`` into a key and a comma-separated value
    list. The value list is expanded with ``Series.apply(pd.Series)`` and then
    stacked so that each individual value ends up on its own row.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the benchmark data file.

    Returns
    -------
    pandas.DataFrame
        A single ``uint64`` column named ``value``, indexed by
        (``key``, ``list_index``), where ``list_index`` is the position of the
        value within its line.
    """
    df_tuples = pd.read_csv(path,
                            sep='#', index_col=0, names=['key', 'values'],
                            converters={'values': lambda w: tuple(w.split(','))})
    # No extra positional argument here: Series.apply has no ``axis``; a stray
    # second positional lands in ``convert_dtype`` (deprecated) or ``args``.
    wide = df_tuples['values'].apply(pd.Series)
    # Lines with fewer values are padded with NaN in the wide frame. Drop the
    # padding explicitly instead of relying on stack() doing so implicitly,
    # otherwise the unsigned-integer cast fails on the missing values.
    stacked = wide.stack().dropna().astype('uint64')
    return stacked.rename_axis(['key', 'list_index']).to_frame('value')

In [5]:
# Time the ticcl_output_reader-based loader.
%timeit load_cpp()

7.69 ms ± 486 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
# Time the pandas-only loader.
%timeit load_pandas_apply()

354 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Rough speedup of load_cpp over load_pandas_apply, using hand-rounded
# values of the two %timeit results above (~360 ms vs. ~7 ms).
360/7

51.42857142857143

In [8]:
# Build one frame with each loader so the results can be compared.
df_cpp = load_cpp()
df_pandas = load_pandas_apply()

In [9]:
# Sanity check: both loaders must produce equal frames.
df_pandas.equals(df_cpp)

True

Redditor [pvkooten](https://www.reddit.com/user/pvkooten) suggested doing it in pure Python. Let's try that, with some slight modifications.

In [10]:
def load_python(path='pandas_loading_benchmarks_data.txt'):
    """Parse a ``key#v1,v2,...`` file into a dict using plain Python.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the benchmark data file.

    Returns
    -------
    dict
        Maps each integer key to the list of its values. The values are left
        as the strings found in the file; no numeric conversion is done here.

    Raises
    ------
    ValueError
        If the text before ``#`` on a non-blank line is not an integer.
    """
    dc = {}
    with open(path) as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is "\n", never "". Strip before testing for emptiness.
            if not line.strip():
                continue
            key, _, value = line.partition("#")
            dc[int(key)] = value.rstrip("\n").split(",")
    return dc

In [11]:
# Time the parsing step alone (the function returns a dict, not a DataFrame).
%timeit load_python()

607 µs ± 41.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


That's fast! But the result is a plain dict of string lists, not yet the regularized DataFrame format we get from the other two ways of loading. Converting it takes a lot of time:

In [13]:
def load_python_df(path='pandas_loading_benchmarks_data.txt'):
    """Parse a ``key#v1,v2,...`` file in plain Python, then build a long frame.

    The file is first read into a dict that maps each integer key to its list
    of value strings. That dict is turned into a wide DataFrame (one row per
    key) and stacked so that every value gets its own row.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the benchmark data file.

    Returns
    -------
    pandas.DataFrame
        A single ``uint64`` column named ``value``, indexed by
        (``key``, ``list_index``), where ``list_index`` is the position of the
        value within its line.

    Raises
    ------
    ValueError
        If the text before ``#`` on a non-blank line is not an integer.
    """
    dc = {}
    with open(path) as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is "\n", never "". Strip before testing for emptiness.
            if not line.strip():
                continue
            key, _, value = line.partition("#")
            dc[int(key)] = value.rstrip("\n").split(",")
    wide = pd.DataFrame.from_dict(dc, orient='index')
    # Shorter value lists are padded with missing values in the wide frame.
    # Drop the padding explicitly instead of relying on stack() doing so
    # implicitly, otherwise the unsigned-integer cast fails.
    stacked = wide.stack().dropna().astype('uint64')
    return stacked.rename_axis(['key', 'list_index']).to_frame('value')

In [14]:
# Build the frame with the pure-Python loader so it can be compared.
df_py = load_python_df()

In [15]:
# Sanity check: the pure-Python loader must reproduce the pandas loader's frame.
df_pandas.equals(df_py)

True

Ok, then we time that...

In [16]:
# Time parsing plus conversion to the long-format DataFrame.
%timeit load_python_df()

395 ms ± 85.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


That's a bit slower than the pandas way (395 ms vs. 354 ms), although the difference is within the reported run-to-run spread.