In [7]:
import os
from os.path import join
import cudf
import pickle
import pandas as pd

datasets = os.path.join(os.getcwd(), "datasets")
raw_datasets = os.path.join(datasets, "raw")
!du -h datasets/raw/*

test_data = pickle.load(open(os.path.join(raw_datasets, "test.pickle"), 'rb'))
(train_data_1, train_labels_1_nd) = pickle.load(open(os.path.join(raw_datasets, "train1.pickle"), "rb"))
(train_data_2, train_labels_2_nd) = pickle.load(open(os.path.join(raw_datasets, "train2.pickle"), "rb"))

train_labels_1 = pd.DataFrame(train_labels_1_nd, columns=["output"])
train_labels_2 = pd.DataFrame(train_labels_2_nd, columns=["output"])

2,1G	datasets/raw/all_merged_df.pickle
514M	datasets/raw/test.pickle
748M	datasets/raw/train1.pickle
793M	datasets/raw/train2.pickle


Merge all training datasets (we will use *train_test_split* for validation later)
---

In [8]:
import pickle

# Stack both training splits. Features and labels are concatenated in the same
# order, so row i of X_train_df still corresponds to row i of y_train_df.
# NOTE(review): the original index labels are kept (no ignore_index), so labels
# may repeat across the two splits -- confirm downstream code does not rely on
# a unique index.
X_train_df = pd.concat([train_data_1, train_data_2])
y_train_df = pd.concat([train_labels_1, train_labels_2])
all_merged = (X_train_df, y_train_df, test_data)

# The context manager guarantees the file is flushed and closed once the dump
# finishes, instead of leaving that to garbage collection.
with open(join(raw_datasets, "all_merged_df.pkl"), "wb") as fh:
    pickle.dump(all_merged, fh)

Output the training and test data as CSV
---

In [9]:
# Also export the merged frames as CSV so they can be read back without
# unpickling pandas objects.
csv_datasets = os.path.join(datasets, "csv")
# Create exactly the directory the writes below target (no-op if it already
# exists); this replaces a shell `mkdir -p` on a hard-coded relative path.
os.makedirs(csv_datasets, exist_ok=True)

# index=False: the row index is not written to the files.
X_train_df.to_csv(join(csv_datasets, "X_train.csv"), index=False)
y_train_df.to_csv(join(csv_datasets, "y_train.csv"), index=False)
test_data.to_csv(join(csv_datasets, "X_test.csv"), index=False)

KeyboardInterrupt: 

Convert the *pandas DataFrames* to *cudf DataFrames* and dump them
---

In [None]:
# Convert each pandas frame to its cudf counterpart, one conversion per line.
X_train_cudf = cudf.DataFrame.from_pandas(X_train_df)
y_train_cudf = cudf.DataFrame.from_pandas(y_train_df)
X_test_cudf = cudf.DataFrame.from_pandas(test_data)
all_merged_cudf = (X_train_cudf, y_train_cudf, X_test_cudf)

# The context manager guarantees the file is flushed and closed once the dump
# finishes, instead of leaving that to garbage collection.
with open(join(raw_datasets, "all_merged_cudf.pkl"), "wb") as fh:
    pickle.dump(all_merged_cudf, fh)

In [7]:

# Summary statistics for every split, keyed by split name. Each value is a
# (features, labels) pair; the test split has no labels, hence None.
_all = dict(train_1 = (train_data_1, train_labels_1), train_2 = (train_data_2, train_labels_2), test = (test_data, None))
for name, (X, y) in _all.items():
    print("="*25, name, "="*25)
    # A bare expression inside a loop is never rendered by the notebook, so
    # the summaries are printed explicitly.
    print(X.describe())
    # Skip splits without labels; calling .describe() on None would raise
    # AttributeError.
    if y is not None:
        print(y.describe())



KeyboardInterrupt: 