This repository includes scripts for the Tabbie (Tabular Information Embedding) model. The paper is available at https://arxiv.org/pdf/2105.02584.pdf
# Point /usr/local/cuda at the CUDA 10.1 toolkit.
# Reference: https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-base.html
# -f replaces an existing link and -n stops ln from following the old link
# into the directory it points to, so no separate `rm` is needed and the
# command also succeeds when /usr/local/cuda does not exist yet.
sudo ln -sfn /usr/local/cuda-10.1 /usr/local/cuda
# Fetch the Tabbie sources and work from the repository root.
git clone https://github.com/SFIG611/tabbie.git
cd tabbie
# Create the conda environment described by env/env.yml, then activate it.
# NOTE(review): the environment name table_emb_dev is presumably set in env/env.yml — confirm.
conda env create --file env/env.yml
conda activate table_emb_dev
# Install the CUDA 10.1 build of PyTorch 1.5.1 from the PyTorch wheel index.
pip install torch==1.5.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
# Build and install NVIDIA apex from source inside the table_emb_dev environment.
conda activate table_emb_dev
# CUDA architectures to build the extensions for (variable is read by
# PyTorch's extension builder).
export TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5"
mkdir -p third_party
git clone -q https://github.com/NVIDIA/apex.git third_party/apex
cd third_party/apex
# Install from the checkout, passing the build flags through to setup.py.
pip install -v --no-cache-dir \
  --global-option="--cpp_ext" \
  --global-option="--cuda_ext" \
  --global-option="--deprecated_fused_adam" \
  ./
Download the model archives (freq.tar.gz and mix.tar.gz) from https://drive.google.com/drive/folders/1vAMv09j-VlWHKd5djiRGuC16yb-lhJO0
# Place the two model archives under tabbie/model.
mv freq.tar.gz mix.tar.gz tabbie/model
conda activate table_emb_dev
# NOTE(review): pretrain_pred.py presumably has to be run from the directory
# that contains it — confirm the working directory before running.
python pretrain_pred.py # default input data is "data/pretrain/sample.jsonl"
- id (str): identifier value for the table
- faked_cells (2d list, row_id/col_id=0,1,2...): [[row_id1, col_id1, "corrupt_cell1"], [row_id2, col_id2, "corrupt_cell2"], ...]
- faked_headers (2d list, col_id=0,1,2...): [[col_id1, "corrupt_header1"], [col_id2, "corrupt_header2"], ...]
# Train on the CSVs under data/ft_cell, then predict on its test CSVs.
conda activate table_emb_dev
cd tabbie
python train.py \
  --train_csv_dir ./data/ft_cell/train_csv \
  --train_label_path ./data/ft_cell/train_label.csv
# NOTE(review): ./out_model/model.tar.gz is presumably the archive produced by
# the train.py run above — confirm.
python pred.py \
  --test_csv_dir ./data/ft_cell/test_csv \
  --model_path ./out_model/model.tar.gz
# Train on the CSVs under data/ft_col, then predict on its test CSVs.
conda activate table_emb_dev
cd tabbie
python train.py \
  --train_csv_dir ./data/ft_col/train_csv \
  --train_label_path ./data/ft_col/train_label.csv
# NOTE(review): ./out_model/model.tar.gz is presumably the archive produced by
# the train.py run above — confirm.
python pred.py \
  --test_csv_dir ./data/ft_col/test_csv \
  --model_path ./out_model/model.tar.gz
# Train on the CSVs under data/ft_table, then predict on its test CSVs.
conda activate table_emb_dev
cd tabbie
python train.py \
  --train_csv_dir ./data/ft_table/train_csv \
  --train_label_path ./data/ft_table/train_label.csv
# NOTE(review): ./out_model/model.tar.gz is presumably the archive produced by
# the train.py run above — confirm.
python pred.py \
  --test_csv_dir ./data/ft_table/test_csv \
  --model_path ./out_model/model.tar.gz