In [4]:
import torch
# Scratch check: an empty float64 tensor converted to float32 is still an empty tensor.
torch.empty(0, dtype=torch.float64).float()

tensor([])

In [1]:
from sys import getsizeof
import os

import pandas as pd
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
preprocessed = pd.read_parquet("../data/train_preprocessed/batch_0.parquet")
# Pull the label column out of the frame; the remaining columns are what gets converted below.
target = preprocessed.pop("target")
preprocessed.to_numpy().shape

(186963, 7)

In [2]:
# Eagerly load the attribute table, the text/BERT table and the labelled pairs into pandas.
attributes = pd.read_parquet("../data/train/attributes.parquet")
# Left disabled: resnet.parquet is not loaded eagerly here; later cells read it
# lazily instead (polars scan_parquet / pyarrow iter_batches).
# resnet = pd.read_parquet("../data/train/resnet.parquet")
text_and_bert = pd.read_parquet("../data/train/text_and_bert.parquet")
train = pd.read_parquet("../data/train/train.parquet")

KeyboardInterrupt: 

In [3]:
def get_size_in_gb(var):
    """Return the size of ``var`` as reported by ``sys.getsizeof``, in GiB (bytes / 2**30)."""
    bytes_per_gib = 1 << 30
    return getsizeof(var) / bytes_per_gib

In [4]:
# Report the in-memory size (GiB) of each loaded table, in load order.
for _table in (attributes, text_and_bert, train):
    print(get_size_in_gb(_table))

3.292391447350383
3.8826297046616673
0.02611852064728737


## Маленький анализ

In [13]:
# Stratified 80/20 split of the labelled pairs. The seed is fixed so that the
# train/test files written by the next cell are reproducible across kernel restarts.
# NOTE(review): this rebinds `train`, so re-running the cell splits the already
# split frame again; restart the kernel (or reload `train`) before re-running.
train, test = train_test_split(train, test_size=0.2, stratify=train.target, random_state=42)
print(train.target.value_counts())
print(test.target.value_counts())

target
0    248490
1    230133
Name: count, dtype: int64
target
0    62123
1    57533
Name: count, dtype: int64


In [16]:
# Write both splits to CSV with a fresh 0..n-1 index (to_csv also writes that index).
_csv_targets = {
    "../data/train_batched/train.csv": train,
    "../data/train_batched/test.csv": test,
}
for _csv_path, _split in _csv_targets.items():
    _split.reset_index(drop=True).to_csv(_csv_path)

In [18]:
# Display the train split (the recorded output is truncated in the middle).
train

Unnamed: 0,variantid1,variantid2,target
853088,1434353901,1434351740,0
435608,1179662401,1179731979,0
212398,1543330364,1385896130,0
733422,1448912941,1451715340,1
1109444,975589316,1197948269,0
...,...,...,...
624920,386492071,1221313151,1
869794,630746406,1169230998,0
816615,951680945,671264098,0
640201,768589465,587558046,1


In [2]:
def _split_df_pairs_batches(df: pd.DataFrame, n_batches: int, shuffle: bool = True):
    # splits df in given number batches
    if shuffle: 
        df = df.sample(frac=1).reset_index(drop=True)
    out = []
    for part in np.array_split(df.index, n_batches):
        out.append(df.iloc[part])
    return out

def _get_unique_ids(df_pairs: pd.DataFrame):
    # gets all unique variantid that are used in df with target
    return np.unique(np.concatenate([df_pairs["variantid1"], df_pairs["variantid2"]]))

def _get_batch_data(df_pairs: pd.DataFrame, attributes: pd.DataFrame, text_and_bert: pd.DataFrame):
    """Select the rows of both feature tables whose ``variantid`` occurs in ``df_pairs``.

    Returns:
        A ``(attributes_subset, text_and_bert_subset)`` tuple.
    """
    wanted_ids = _get_unique_ids(df_pairs)
    attributes_mask = attributes.variantid.isin(wanted_ids)
    text_mask = text_and_bert.variantid.isin(wanted_ids)
    return attributes[attributes_mask], text_and_bert[text_mask]

def split_train(path: str, save_path: str, n_batches: int = 5):
    """Split the raw training data into a held-out test part and train batches.

    Reads ``train.parquet``, ``attributes.parquet`` and ``text_and_bert.parquet``
    from ``path``, holds out a stratified 20% of the pairs as a test set and
    writes to ``save_path``:

    * ``test.parquet`` - the held-out pairs (index reset);
    * ``test_atrributes.parquet`` and ``test_text_and_bert.parquet`` - the rows
      of the two feature tables whose ``variantid`` occurs in the test pairs;
    * ``batch{i}_attributes.parquet`` and ``batch{i}_text_and_bert.parquet`` -
      the same selection for each shuffled train batch ``i``.

    Args:
        path: directory containing the three input parquet files.
        save_path: directory the output files are written to.
        n_batches: number of batches the train pairs are split into.

    Returns:
        None. The function only writes files.
    """
    train_pairs = pd.read_parquet(os.path.join(path, "train.parquet"))
    attributes = pd.read_parquet(os.path.join(path, "attributes.parquet"))
    text_and_bert = pd.read_parquet(os.path.join(path, "text_and_bert.parquet"))

    # Split into train and test; the test part is assumed to fit in memory.
    # The loaded pairs live in `train_pairs`: splitting the not-yet-assigned
    # local `train` raised UnboundLocalError on every call.
    train, test = train_test_split(train_pairs, test_size=0.2, stratify=train_pairs.target)

    test.reset_index(drop=True).to_parquet(os.path.join(save_path, "test.parquet"))

    # Save the feature rows needed by the test pairs, then free them before batching.
    # NOTE(review): the file name below is spelled "atrributes" while the batch
    # files use "attributes"; kept as-is in case other code reads this name.
    test_attributes, test_text_and_bert = _get_batch_data(test, attributes, text_and_bert)
    test_attributes.to_parquet(os.path.join(save_path, "test_atrributes.parquet"))
    test_text_and_bert.to_parquet(os.path.join(save_path, "test_text_and_bert.parquet"))
    del test_attributes
    del test_text_and_bert

    batches = _split_df_pairs_batches(train, n_batches)

    # NOTE(review): only the feature tables are written per batch; the batch's
    # pairs themselves are not saved here - confirm whether that is intended.
    for i, batch in enumerate(batches):
        batch_attribute, batch_text_and_bert = _get_batch_data(batch, attributes, text_and_bert)
        batch_attribute.to_parquet(os.path.join(save_path, f"batch{i}_attributes.parquet"))
        batch_text_and_bert.to_parquet(os.path.join(save_path, f"batch{i}_text_and_bert.parquet"))
        

# Scratch run of the helper above: shuffle the pairs into 5 batches and keep the first one.
train = pd.read_parquet("../data/train/train.parquet")
batches = _split_df_pairs_batches(train, 5)
_batch1 = batches[0]

In [3]:
# All distinct variant ids referenced by the first batch of pairs.
_unique_ids = _get_unique_ids(_batch1)

In [6]:
# Restrict the attribute table to the variants used by the first batch and measure its size.
# NOTE(review): relies on `attributes` and `get_size_in_gb` from other cells; the recorded
# run failed with NameError because `get_size_in_gb` was not defined in that kernel session.
_batch1_attribute = attributes[attributes.variantid.isin(_unique_ids)]
get_size_in_gb(_batch1_attribute)

NameError: name 'get_size_in_gb' is not defined

In [4]:
# Number of distinct variant ids in the first batch.
_unique_ids.shape

(458765,)

In [5]:
import polars as pl

In [6]:
# Peek at the id values (sorted, since they come from np.unique).
_unique_ids

array([  47598998,   47601846,   47613863, ..., 1564722376, 1564723632,
       1564724243])

In [7]:

# Lazily scan the resnet embeddings; nothing is read into memory until .collect().
resnet_pq = pl.scan_parquet('../data/train/resnet.parquet')
# Resolve the schema explicitly: accessing `.schema` on a LazyFrame emits a
# warning in recent polars (its remnant is visible in the recorded output) and
# points to collect_schema() instead.
resnet_pq.collect_schema()

  resnet_pq.schema


Schema([('variantid', Int64),
        ('main_pic_embeddings_resnet_v1', List(List(Float64))),
        ('pic_embeddings_resnet_v1', List(List(Float64)))])

In [9]:
# Materialise the resnet rows for the first half of the batch's variant ids only.
_half_count = _unique_ids.shape[0] // 2
_first_half_ids = _unique_ids[:_half_count]
result = resnet_pq.filter(pl.col("variantid").is_in(_first_half_ids)).collect()

In [11]:
def _add_to_parquet(parquet_path: str, df: pd.DataFrame):
    if os.path.exists(parquet_path):
        # append mode
        df.to_parquet(parquet_path, engine="fastparquet", append=True)
    else:
        # new file
        df.to_parquet(parquet_path, engine="fastparquet")

# Convert the collected polars frame to an Arrow table.
# NOTE(review): the variable is named like the `pyarrow` package; later cells
# use this name, so it is kept, but it would shadow a plain `import pyarrow`.
pyarrow = result.to_arrow()

In [13]:
# Confirm the conversion produced an Arrow table.
type(pyarrow)

pyarrow.lib.Table

In [17]:
# Inspect the Arrow schema of the converted table.
pyarrow.schema

variantid: int64
main_pic_embeddings_resnet_v1: large_list<item: large_list<item: double>>
  child 0, item: large_list<item: double>
      child 0, item: double
pic_embeddings_resnet_v1: large_list<item: large_list<item: double>>
  child 0, item: large_list<item: double>
      child 0, item: double

In [19]:
# Open a Parquet writer on a scratch file using the Arrow table's schema.
# NOTE(review): the writer stays open across cells until the later close() cell runs.
pq_writter = pq.ParquetWriter('../data/tmp.parquet', schema=pyarrow.schema)

In [21]:
# (rows, columns) of the Arrow table about to be written.
pyarrow.shape

(229382, 3)

In [22]:
# Write the table through the open writer; every run of this cell writes it again.
pq_writter.write_table(pyarrow)

In [24]:
# Close the writer opened in the earlier cell.
pq_writter.close()

In [26]:
# Re-check that `pyarrow` still refers to the Arrow table (not the package).
type(pyarrow)

pyarrow.lib.Table

In [25]:
# Re-open the scratch file and inspect its metadata (row count, number of row groups).
parquet_file = pq.ParquetFile('../data/tmp.parquet')
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x7f9ca4fb34c0>
  created_by: parquet-cpp-arrow version 17.0.0
  num_columns: 3
  num_rows: 458764
  num_row_groups: 2
  format_version: 2.6
  serialized_size: 1889

In [4]:
# Stream resnet.parquet in record batches instead of loading it whole, and
# look at the first batch of 4 rows only.
# (A stray no-op `pl` expression was removed: it did nothing and raises
# NameError when polars has not been imported in the running kernel.)
parquet_file = pq.ParquetFile('../data/train/resnet.parquet')

for batch in parquet_file.iter_batches(batch_size=4):
    print("RecordBatch")
    batch_df = batch.to_pandas()
    print("batch_df:", batch_df)
    break  # only the first batch is needed for inspection

RecordBatch
batch_df:    variantid                      main_pic_embeddings_resnet_v1  \
0   47920382  [[0.8170074820518494, 0.9416620135307312, 0.31...   
1   49801845  [[-0.43339717388153076, -0.17318281531333923, ...   
2   49853444  [[0.11314830183982849, -0.34010639786720276, -...   
3   49893028  [[0.25037717819213867, 0.33753663301467896, 0....   

                            pic_embeddings_resnet_v1  
0  [[0.20931944251060486, -0.29257065057754517, -...  
1                                               None  
2                                               None  
3                                               None  


In [6]:
# Size of the 4-row sample frame, in GiB.
get_size_in_gb(batch_df)

8.083879947662354e-07

In [4]:
# Inspect one of the previously saved pair batches.
pd.read_parquet("../data/train_batched/batch1.parquet")

Unnamed: 0,variantid1,variantid2,target
186963,1504231674,1482740487,1
186964,733771694,298403096,0
186965,1554489495,1480846705,1
186966,1029342474,426500702,1
186967,1331337846,1334075274,0
...,...,...,...
373921,1502158839,1502158904,0
373922,1219136532,976312541,1
373923,1037365436,1037365579,0
373924,372116797,949616866,1
